author     Chunseok Lee <chunseok.lee@samsung.com>  2020-07-30 11:40:16 +0900
committer  Chunseok Lee <chunseok.lee@samsung.com>  2020-07-30 11:40:16 +0900
commit     9e45ab56bd165609118989c0d1bec309c3754560 (patch)
tree       4979e8674abc7d21a6471770c1355e0e6c0e8a3f
parent     05e0ec30a632339a8533082476f27bda31ccde16 (diff)
download   nnfw-9e45ab56bd165609118989c0d1bec309c3754560.tar.gz
           nnfw-9e45ab56bd165609118989c0d1bec309c3754560.tar.bz2
           nnfw-9e45ab56bd165609118989c0d1bec309c3754560.zip
Change-Id: Id38b617d325ef7e854995a47f032bdf482a779b3
-rw-r--r--  .ahub/tcchecker-tca/config.yaml | 43
-rw-r--r--  compiler/.ahub/tcchecker-tca/config.yaml | 54
-rw-r--r--  compiler/bcq-tools/CMakeLists.txt | 27
-rw-r--r--  compiler/bcq-tools/README.md | 78
-rw-r--r--  compiler/bcq-tools/generate_bcq_output_arrays | 90
-rw-r--r--  compiler/bcq-tools/preserve_bcq_info | 116
-rw-r--r--  compiler/circle-quantizer/CMakeLists.txt | 1
-rw-r--r--  compiler/circle-quantizer/requires.cmake | 1
-rw-r--r--  compiler/circle-quantizer/src/CircleQuantizer.cpp | 18
-rw-r--r--  compiler/circle-tensordump/driver/Driver.cpp | 2
-rw-r--r--  compiler/circle-tensordump/src/Dump.cpp | 48
-rw-r--r--  compiler/circle-verify/src/Driver.cpp | 2
-rw-r--r--  compiler/circle2circle-dredd-recipe-test/CMakeLists.txt | 93
-rw-r--r--  compiler/circle2circle-dredd-recipe-test/requires.cmake | 4
-rw-r--r--  compiler/circle2circle-dredd-recipe-test/test.lst | 3
-rwxr-xr-x  compiler/circle2circle-dredd-recipe-test/testall.sh | 13
-rw-r--r--  compiler/circle2circle/CMakeLists.txt | 2
-rw-r--r--  compiler/circle2circle/requires.cmake | 1
-rw-r--r--  compiler/circle2circle/src/Circle2Circle.cpp | 14
-rw-r--r--  compiler/circlechef/CMakeLists.txt | 4
-rw-r--r--  compiler/circlechef/circle/src/RecipeChef.cpp | 2
-rw-r--r--  compiler/circlechef/core/src/ModelChef.cpp | 1
-rw-r--r--  compiler/circlechef/proto/circlechef.proto | 1
-rw-r--r--  compiler/circlechef/tools/file/Driver.cpp | 2
-rw-r--r--  compiler/circlechef/tools/reverse/Driver.cpp | 2
-rw-r--r--  compiler/circledump/driver/Driver.cpp | 2
-rw-r--r--  compiler/circledump/src/OpPrinter.cpp | 15
-rw-r--r--  compiler/common-artifacts/exclude.lst | 31
-rw-r--r--  compiler/hermes/src/hermes.test.cpp | 25
-rw-r--r--  compiler/locomotiv/src/Node/BiasEncode.test.cpp | 14
-rw-r--r--  compiler/locomotiv/src/Node/MatMul.test.cpp | 4
-rw-r--r--  compiler/locop/src/FormattedGraph.test.cpp | 2
-rw-r--r--  compiler/locop/src/FormattedTensorShape.test.cpp | 2
-rw-r--r--  compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h | 9
-rw-r--r--  compiler/luci-interpreter/src/core/KernelParams.h | 5
-rw-r--r--  compiler/luci-interpreter/src/kernels/CMakeLists.txt | 9
-rw-r--r--  compiler/luci-interpreter/src/kernels/DepthToSpace.cpp | 90
-rw-r--r--  compiler/luci-interpreter/src/kernels/DepthToSpace.h | 45
-rw-r--r--  compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp | 60
-rw-r--r--  compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp | 9
-rw-r--r--  compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp | 11
-rw-r--r--  compiler/luci-interpreter/src/kernels/Logistic.test.cpp | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/Reverse.cpp | 81
-rw-r--r--  compiler/luci-interpreter/src/kernels/Reverse.h | 43
-rw-r--r--  compiler/luci-interpreter/src/kernels/Reverse.test.cpp | 66
-rw-r--r--  compiler/luci-interpreter/src/kernels/Slice.cpp | 149
-rw-r--r--  compiler/luci-interpreter/src/kernels/Slice.h | 44
-rw-r--r--  compiler/luci-interpreter/src/kernels/Slice.test.cpp | 64
-rw-r--r--  compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp | 23
-rw-r--r--  compiler/luci-interpreter/src/loader/CMakeLists.txt | 7
-rw-r--r--  compiler/luci-interpreter/src/loader/GraphLoader.cpp | 23
-rw-r--r--  compiler/luci-interpreter/src/loader/GraphLoader.h | 18
-rw-r--r--  compiler/luci-interpreter/src/loader/KernelBuilder.cpp | 108
-rw-r--r--  compiler/luci-interpreter/src/loader/KernelBuilder.h | 17
-rw-r--r--  compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp | 743
-rw-r--r--  compiler/luci-interpreter/src/loader/ModuleLoader.cpp | 7
-rw-r--r--  compiler/luci-interpreter/src/loader/ModuleLoader.h | 5
-rwxr-xr-x  compiler/luci-value-test/evalverify.sh | 6
-rw-r--r--  compiler/luci-value-test/test.lst | 110
-rw-r--r--  compiler/luci/export/src/CircleOperationExporter.cpp | 2
-rw-r--r--  compiler/luci/export/src/CircleTensorExporter.cpp | 5
-rw-r--r--  compiler/luci/import/src/CircleReader.cpp | 2
-rw-r--r--  compiler/luci/import/src/Importer.test.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLogistic.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleTransposeConv.cpp | 18
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleNodes.lst | 1
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleQuantParam.h | 1
-rw-r--r--  compiler/luci/lang/src/Module.test.cpp | 2
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleCustom.test.cpp | 7
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleIf.test.cpp | 4
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleWhile.test.cpp | 4
-rw-r--r--  compiler/luci/pass/src/CircleOptimizer.cpp | 4
-rw-r--r--  compiler/luci/pass/src/FuseBCQPass.cpp | 426
-rw-r--r--  compiler/luci/pass/src/QuantizationUtils.cpp | 7
-rw-r--r--  compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp | 21
-rw-r--r--  compiler/luci/tests/test.lst | 9
-rw-r--r--  compiler/one-cmds/one-codegen | 25
-rw-r--r--  compiler/one-cmds/one-import | 25
-rw-r--r--  compiler/one-cmds/one-import-tf | 30
-rw-r--r--  compiler/one-cmds/one-import-tflite | 20
-rw-r--r--  compiler/one-cmds/one-optimize | 20
-rw-r--r--  compiler/one-cmds/one-pack | 23
-rw-r--r--  compiler/one-cmds/one-quantize | 23
-rw-r--r--  compiler/one-cmds/requires.cmake | 1
-rw-r--r--  compiler/record-minmax/CMakeLists.txt | 5
-rw-r--r--  compiler/record-minmax/driver/Driver.cpp | 16
-rw-r--r--  compiler/record-minmax/requires.cmake | 1
-rw-r--r--  compiler/record-minmax/src/HDF5Importer.cpp | 1
-rw-r--r--  compiler/record-minmax/src/MinMaxObserver.cpp | 3
-rw-r--r--  compiler/record-minmax/src/RecordMinMax.cpp | 2
-rw-r--r--  compiler/record-minmax/tests/RecordFunction.test.cpp | 14
-rw-r--r--  compiler/tfl-verify/CMakeLists.txt | 1
-rw-r--r--  compiler/tfl-verify/requires.cmake | 1
-rw-r--r--  compiler/tfl-verify/src/Driver.cpp | 19
-rw-r--r--  compiler/tflchef/core/src/ModelChef.cpp | 1
-rw-r--r--  compiler/tflchef/proto/tflchef.proto | 1
-rw-r--r--  compiler/tflchef/tflite/src/RecipeChef.cpp | 2
-rw-r--r--  compiler/tflchef/tools/file/Driver.cpp | 2
-rw-r--r--  compiler/tflchef/tools/reverse/Driver.cpp | 2
-rw-r--r--  compiler/tfldump/driver/Driver.cpp | 2
-rw-r--r--  compiler/tflite2circle/CMakeLists.txt | 1
-rw-r--r--  compiler/tflite2circle/driver/Driver.cpp | 17
-rw-r--r--  compiler/tflite2circle/requires.cmake | 1
-rw-r--r--  compiler/vconone/CMakeLists.txt | 31
-rw-r--r--  compiler/vconone/README.md | 14
-rw-r--r--  compiler/vconone/driver/driver.cpp | 36
-rw-r--r--  compiler/vconone/include/vconone/vconone.h | 61
-rw-r--r--  compiler/vconone/src/version.cpp | 63
-rw-r--r--  compiler/vconone/src/version.test.cpp | 49
-rw-r--r--  compiler/vconone/version_cfg.h.in | 22
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h | 124
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h | 121
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h | 82
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h | 117
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h | 83
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h | 82
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h | 109
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h | 88
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h | 96
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h | 96
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h | 118
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h | 100
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h | 97
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h | 11
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h | 129
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h | 69
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h | 75
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h | 68
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h | 201
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h | 4
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h | 142
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h | 62
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h | 64
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h | 103
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h | 120
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h | 68
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h | 81
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h | 176
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h | 102
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h | 65
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h | 7
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h | 79
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h | 78
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h | 70
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h | 4
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h | 170
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h | 63
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h | 130
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h | 99
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h | 136
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h | 79
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h | 68
-rw-r--r--  compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 39
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl | 137
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 191
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 233
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl | 185
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 206
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h | 185
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl | 120
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl | 138
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl | 185
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp | 181
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp | 132
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp | 140
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp | 372
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp | 3
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp | 210
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp | 3
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp | 1
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp | 148
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp | 188
-rw-r--r--  compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp | 118
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp | 671
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp | 181
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp | 221
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp | 291
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp | 181
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp | 144
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp | 52
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp | 52
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp | 267
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp | 16
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp | 4
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp | 16
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp | 180
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp | 2
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp | 63
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp | 163
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp | 8
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp | 52
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp | 250
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp | 92
-rw-r--r--  compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp | 4
-rw-r--r--  compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp | 53
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp | 4
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp | 6
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp | 60
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp | 63
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp | 4
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp | 14
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp | 7
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp | 513
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp | 4
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp | 4
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp | 55
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp | 161
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp | 180
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp | 114
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp | 64
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp | 231
-rw-r--r--  compute/cker/CMakeLists.txt | 3
-rw-r--r--  compute/cker/include/cker/Types.h | 11
-rw-r--r--  compute/cker/include/cker/Utils.h | 62
-rw-r--r--  compute/cker/include/cker/operation/FullyConnected.h | 13
-rw-r--r--  compute/cker/include/cker/operation/L2Normalize.h | 94
-rw-r--r--  compute/cker/include/cker/operation/Logistic.h | 9
-rw-r--r--  compute/cker/include/cker/operation/Pad.h | 15
-rw-r--r--  compute/cker/include/cker/operation/Quantize.h | 47
-rw-r--r--  compute/cker/include/cker/operation/SpaceToDepth.h | 71
-rw-r--r--  compute/cker/include/cker/ruy/RuySupport.h | 2
-rw-r--r--  docs/howto/how-to-build-runtime.md | 6
-rw-r--r--  docs/nnfw/howto/CrossBuildForAndroid.md | 4
-rw-r--r--  docs/runtime/core.md | 4
-rw-r--r--  docs/runtime/heterogeneous-execution.md | 4
-rw-r--r--  infra/cmake/packages/ARMComputeSourceConfig.cmake | 2
-rw-r--r--  infra/cmake/packages/FlatBuffersConfig.cmake | 3
-rw-r--r--  infra/cmake/packages/HDF5Config.cmake | 1
-rw-r--r--  infra/cmake/packages/Pybind11Config.cmake | 21
-rw-r--r--  infra/cmake/packages/Pybind11SourceConfig.cmake | 18
-rw-r--r--  infra/docker/Dockerfile | 3
-rw-r--r--  infra/docker/Dockerfile.1804 | 47
-rw-r--r--  infra/nncc/CMakeLists.txt | 1
-rw-r--r--  infra/nncc/command/utcount | 2
-rw-r--r--  infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt | 2
-rw-r--r--  infra/nnfw/config/gbs.conf | 6
-rw-r--r--  infra/packaging/preset/20200630 | 14
-rw-r--r--  infra/packaging/res/tf2nnpkg.20200630 | 19
-rw-r--r--  infra/scripts/build-tcm.sh | 24
-rw-r--r--  infra/scripts/compiler_modules.sh | 2
-rwxr-xr-x  infra/scripts/docker_build_cross_aarch64_runtime.sh | 2
-rwxr-xr-x  infra/scripts/docker_build_cross_arm_runtime.sh | 2
-rwxr-xr-x  infra/scripts/docker_build_cross_arm_runtime_release.sh | 2
-rwxr-xr-x  infra/scripts/docker_build_cross_coverage.sh | 2
-rwxr-xr-x  infra/scripts/docker_build_nncc.sh | 10
-rwxr-xr-x  infra/scripts/docker_build_tizen_cross.sh | 2
-rwxr-xr-x  infra/scripts/docker_collect_nnpkg_resources.sh | 2
-rwxr-xr-x  infra/scripts/tizen_xu4_test.sh | 2
-rw-r--r--  master_diff_1.7.0.patch | 30424
-rw-r--r--  packaging/nnfw.spec | 2
-rw-r--r--  res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe | 26
-rw-r--r--  res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe | 44
-rw-r--r--  res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule | 3
-rw-r--r--  res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe | 61
-rw-r--r--  res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe | 22
-rw-r--r--  res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe | 19
-rw-r--r--  res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe | 2
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_000/test.recipe | 27
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_000/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_001/test.recipe | 27
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_001/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_002/test.recipe | 27
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_002/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_003/test.recipe | 27
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_003/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe | 28
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse | 0
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe | 28
-rw-r--r--  res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse | 0
-rw-r--r--  runtime/libs/benchmark/CMakeLists.txt | 3
-rw-r--r--  runtime/libs/benchmark/src/Result.cpp | 2
-rw-r--r--  runtime/onert/api/include/nnfw.h | 18
-rw-r--r--  runtime/onert/api/src/nnfw_api.cc | 1
-rw-r--r--  runtime/onert/api/src/nnfw_api_internal.cc | 31
-rw-r--r--  runtime/onert/backend/acl_cl/KernelGenerator.cc | 804
-rw-r--r--  runtime/onert/backend/acl_common/AclKernelGen.h | 269
-rw-r--r--  runtime/onert/backend/acl_neon/KernelGenerator.cc | 777
-rw-r--r--  runtime/onert/backend/cpu/ConstantInitializer.cc | 35
-rw-r--r--  runtime/onert/backend/cpu/ConstantInitializer.h | 9
-rw-r--r--  runtime/onert/backend/cpu/KernelGenerator.cc | 509
-rw-r--r--  runtime/onert/backend/cpu/KernelGenerator.h | 3
-rw-r--r--  runtime/onert/backend/cpu/StaticTensorManager.cc | 104
-rw-r--r--  runtime/onert/backend/cpu/StaticTensorManager.h | 61
-rw-r--r--  runtime/onert/backend/cpu/Tensor.h | 15
-rw-r--r--  runtime/onert/backend/cpu/TensorBuilder.cc | 18
-rw-r--r--  runtime/onert/backend/cpu/TensorBuilder.h | 13
-rw-r--r--  runtime/onert/backend/cpu/ops/CompareLayer.cc | 238
-rw-r--r--  runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc | 35
-rw-r--r--  runtime/onert/backend/cpu/ops/FullyConnectedLayer.h | 3
-rw-r--r--  runtime/onert/backend/cpu/ops/L2NormLayer.cc | 71
-rw-r--r--  runtime/onert/backend/cpu/ops/L2NormLayer.h | 55
-rw-r--r--  runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc | 4
-rw-r--r--  runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h | 7
-rw-r--r--  runtime/onert/backend/cpu/ops/OperationUtils.h | 11
-rw-r--r--  runtime/onert/backend/cpu/ops/PadLayer.cc | 25
-rw-r--r--  runtime/onert/backend/cpu/ops/PadLayer.h | 8
-rw-r--r--  runtime/onert/backend/cpu/ops/QuantizeLayer.cc | 63
-rw-r--r--  runtime/onert/backend/cpu/ops/QuantizeLayer.h | 56
-rw-r--r--  runtime/onert/backend/cpu/ops/SliceLayer.cc | 16
-rw-r--r--  runtime/onert/backend/cpu/ops/SliceLayer.h | 3
-rw-r--r--  runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc | 70
-rw-r--r--  runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h | 54
-rw-r--r--  runtime/onert/core/include/backend/ITensorBuilder.h | 4
-rw-r--r--  runtime/onert/core/include/backend/ITensorRegistry.h | 68
-rw-r--r--  runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h | 4
-rw-r--r--  runtime/onert/core/include/compiler/StaticShapeInference.h | 1
-rw-r--r--  runtime/onert/core/include/exec/DynamicShapeInference.h | 1
-rw-r--r--  runtime/onert/core/include/ir/Operations.Include.h | 1
-rw-r--r--  runtime/onert/core/include/ir/Operations.lst | 1
-rw-r--r--  runtime/onert/core/include/ir/operation/LogSoftmax.h | 2
-rw-r--r--  runtime/onert/core/include/ir/operation/Pad.h | 2
-rw-r--r--  runtime/onert/core/include/ir/operation/Quantize.h | 49
-rw-r--r--  runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc | 14
-rw-r--r--  runtime/onert/core/src/backend/controlflow/KernelGenerator.cc | 22
-rw-r--r--  runtime/onert/core/src/backend/controlflow/TensorBuilder.cc | 6
-rw-r--r--  runtime/onert/core/src/backend/controlflow/UserTensor.h | 1
-rw-r--r--  runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc | 10
-rw-r--r--  runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc | 28
-rw-r--r--  runtime/onert/core/src/compiler/ExecutorFactory.cc | 37
-rw-r--r--  runtime/onert/core/src/compiler/ExecutorFactory.h | 3
-rw-r--r--  runtime/onert/core/src/compiler/HEScheduler.h | 10
-rw-r--r--  runtime/onert/core/src/compiler/OperationValidator.cc | 161
-rw-r--r--  runtime/onert/core/src/compiler/OperationValidator.h | 4
-rw-r--r--  runtime/onert/core/src/compiler/StaticShapeInference.cc | 5
-rw-r--r--  runtime/onert/core/src/compiler/TensorBuilders.h | 12
-rw-r--r--  runtime/onert/core/src/exec/DynamicShapeInference.cc | 5
-rw-r--r--  runtime/onert/core/src/exec/ExecutorBase.cc | 4
-rw-r--r--  runtime/onert/core/src/interp/operations/Pad.cc | 4
-rw-r--r--  runtime/onert/core/src/ir/LoweredGraph.cc | 3
-rw-r--r--  runtime/onert/core/src/ir/operation/Quantize.cc | 37
-rw-r--r--  runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc | 195
-rw-r--r--  runtime/onert/core/src/ir/pass/PermutationEliminationPass.h | 86
-rw-r--r--  runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc | 15
-rw-r--r--  runtime/onert/frontend/base_loader/include/base_loader.h | 36
-rw-r--r--  runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc | 337
-rw-r--r--  runtime/onert/test/core/exec/ExecInstance.cc | 94
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl | 18
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon | 19
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu | 13
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl | 18
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon | 19
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu | 13
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.noarch.interp | 16
-rw-r--r--  tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu | 13
-rw-r--r--  tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py | 30
-rw-r--r--  tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py) | 0
-rw-r--r--  tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py) | 0
-rw-r--r--  tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py) | 0
-rw-r--r--  tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py) | 0
-rw-r--r--  tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py) | 0
-rw-r--r--  tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py) | 0
-rw-r--r--  tests/nnapi/specs/V1_2/quantize.mod.py (renamed from tests/nnapi/specs/skip/V1_2/quantize.mod.py) | 0
-rw-r--r--  tests/nnfw_api/src/ValidationTestAddModelLoaded.cc | 19
-rw-r--r--  tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc | 6
-rw-r--r--  tests/nnfw_api/src/ValidationTestSessionCreated.cc | 28
-rwxr-xr-x  tests/scripts/benchmark_nnapi.sh | 23
-rwxr-xr-x  tests/scripts/common.sh | 11
-rwxr-xr-x  tests/scripts/framework/run_test.sh | 60
-rwxr-xr-x  tests/scripts/test-driver.sh | 17
-rwxr-xr-x  tests/scripts/test_framework.sh | 10
-rw-r--r--  tests/tools/nnpackage_run/CMakeLists.txt | 2
-rw-r--r--  tests/tools/nnpackage_run/src/args.cc | 246
-rw-r--r--  tests/tools/nnpackage_run/src/h5formatter.cc | 8
-rw-r--r--  tests/tools/tflite_loader/CMakeLists.txt | 2
-rw-r--r--  tests/tools/tflite_run/CMakeLists.txt | 2
-rwxr-xr-x  tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh | 5
-rwxr-xr-x  tools/tflitefile_tool/select_operator.py | 21
-rw-r--r--  tools/tflkit/README.md | 12
-rw-r--r--  tools/update_version/update-version | 11
386 files changed, 38093 insertions, 13423 deletions
diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml
new file mode 100644
index 000000000..cd34d792f
--- /dev/null
+++ b/.ahub/tcchecker-tca/config.yaml
@@ -0,0 +1,43 @@
+version: 2
+test:
+ - name: NN Runtime
+ testCaseLanguage: CPP
+ testFW: GTEST
+ testCaseFolder:
+ - ./compute/test/cker
+ - ./runtime/onert/core/src/backend/cpu_common
+ - ./runtime/onert/frontend/nnapi
+ - ./runtime/onert/test/core/compiler
+ - ./runtime/onert/test/core/exec
+ - ./runtime/onert/test/core/interp
+ - ./runtime/onert/test/graph
+ - ./runtime/onert/test/graph/operand
+ - ./runtime/onert/test/graph/operation
+ - ./runtime/onert/test/graph/verifier
+ - ./runtime/onert/test/ir
+ - ./runtime/onert/test/util
+ - ./tests/nnapi/src
+ - ./tests/nnfw_api/src
+ - ./tests/tools/tflite_run/src
+
+ testFile:
+ - extension: cpp
+ any: true
+ - extension: cc
+ any: true
+
+ testCase:
+ - condition:
+ - functionName:
+ starts:
+ - TEST
+
+ negativeTestCase:
+ - condition:
+ - testName:
+ starts:
+ - neg_
+
+ positiveTestCase:
+ - condition:
+ - inverse: negativeTestCase
diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml
new file mode 100644
index 000000000..ef681de1a
--- /dev/null
+++ b/compiler/.ahub/tcchecker-tca/config.yaml
@@ -0,0 +1,54 @@
+version: 2
+test:
+ - name: NN Compiler
+ testCaseLanguage: CPP
+ testFW: GTEST
+ testCaseFolder:
+ - ./angkor
+ - ./arser
+ - ./circle2circle
+ - ./circle-quantizer
+ - ./cwrap
+ - ./foder
+ - ./hermes
+ - ./hermes-std
+ - ./loco
+ - ./locomotiv
+ - ./locop
+ - ./logo
+ - ./logo-core
+ - ./luci
+ - ./luci-interpreter
+ - ./luci-value-test
+ - ./mio-circle
+ - ./mio-tflite
+ - ./oops
+ - ./pepper-assert
+ - ./pepper-str
+ - ./pepper-strcast
+ - ./pp
+ - ./record-minmax
+ - ./safemain
+ - ./souschef
+ - ./stdex
+ - ./tflite2circle
+
+ testFile:
+ - extension: .test.cpp
+ any: true
+
+ testCase:
+ - condition:
+ - functionName:
+ starts:
+ - TEST
+
+ negativeTestCase:
+ - condition:
+ - testName:
+ ends:
+ - _NEG
+
+ positiveTestCase:
+ - condition:
+ - inverse: negativeTestCase
diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt
new file mode 100644
index 000000000..ae231bd53
--- /dev/null
+++ b/compiler/bcq-tools/CMakeLists.txt
@@ -0,0 +1,27 @@
+set(BCQ_TOOLS_FILES
+ generate_bcq_output_arrays
+ preserve_bcq_info
+)
+
+foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES})
+
+ set(BCQ_TOOLS_FILE ${BCQ_TOOLS})
+ set(BCQ_TOOLS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${BCQ_TOOLS_FILE}")
+ set(BCQ_TOOLS_BIN "${CMAKE_CURRENT_BINARY_DIR}/${BCQ_TOOLS_FILE}")
+ set(BCQ_TOOLS_TARGET "${BCQ_TOOLS}_target")
+
+ add_custom_command(OUTPUT ${BCQ_TOOLS_BIN}
+ COMMAND ${CMAKE_COMMAND} -E copy "${BCQ_TOOLS_SRC}" "${BCQ_TOOLS_BIN}"
+ DEPENDS ${BCQ_TOOLS_SRC}
+ COMMENT "Generate ${BCQ_TOOLS_BIN}"
+ )
+
+ add_custom_target(${BCQ_TOOLS_TARGET} ALL DEPENDS ${BCQ_TOOLS_BIN})
+
+ install(FILES ${BCQ_TOOLS_BIN}
+ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+ GROUP_READ GROUP_WRITE GROUP_EXECUTE
+ WORLD_READ WORLD_EXECUTE
+ DESTINATION bin)
+
+endforeach(BCQ_TOOLS)
diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md
new file mode 100644
index 000000000..18b0f4826
--- /dev/null
+++ b/compiler/bcq-tools/README.md
@@ -0,0 +1,78 @@
+# BCQ Tools
+
+This directory includes tools related to BCQ.
+
+## preserve_bcq_info
+
+### Purpose
+
+`preserve_bcq_info` is for preserving constant nodes that include BCQ information.
+When a `.pb` file is converted to a `.tflite` file by the TFLite converter, constant nodes whose values are exactly the same are removed and replaced with a single representative node.
+This makes it impossible to know which constant node should be linked to a node to which we want to apply BCQ.
+One solution is to make all identical constant nodes distinct by inserting unique values, and then to ignore the newly generated values when BCQ fusing is applied.
+`preserve_bcq_info` generates and inserts unique dummy values into constant nodes with identical values so that they are not removed by the TensorFlow Lite converter.
+As a result, BCQ information is preserved.
+
+### How to use
+
+```bash
+preserve_bcq_info \
+--input_path /path/to/original_model.pb \
+--output_path /path/to/preserved_model.pb
+```
+
+### How it works
+
+If we add a unique dummy value at the end of each constant node, all the constant nodes become different. The following is an example.
+
+```
+[Original Constant Nodes]
+const(value=[1, 2, 3], name='const1')
+const(value=[1, 2, 3], name='const2')
+const(value=[1, 2, 3], name='const3')
+
+[After BCQ information preserved]
+const(value=[1, 2, 3, -1], name='const1')
+const(value=[1, 2, 3, -2], name='const2')
+const(value=[1, 2, 3, -3], name='const3')
+```
+
+For dummy values, negative values are used instead of positive values.
+This is because positive values may be confused with original constant node values.
+The unique dummy values start from -1 and continue with -2, -3, ..., -N, where N is the number of preserved constant nodes.
+
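The sketch below illustrates the same idea in isolation (plain Python lists rather than a GraphDef); it is an editorial example, not part of the tool, and only shows how unique negative dummy values keep otherwise-identical constants distinct.

```python
# Illustrative sketch only (not the actual tool): append a unique negative
# dummy value to each constant so that identical constants stay distinct.
def preserve_duplicates(constants):
    """constants: dict mapping constant node name -> list of values."""
    preserved = {}
    next_dummy = -1
    for name, values in constants.items():
        preserved[name] = values + [next_dummy]
        next_dummy -= 1
    return preserved

print(preserve_duplicates({
    'const1': [1, 2, 3],
    'const2': [1, 2, 3],
    'const3': [1, 2, 3],
}))
# {'const1': [1, 2, 3, -1], 'const2': [1, 2, 3, -2], 'const3': [1, 2, 3, -3]}
```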
+### Caution
+
+- Newly generated dummy values should be ignored when the constant nodes are used.
+
+## generate_bcq_output_arrays
+
+### Purpose
+
+To apply BCQ, the BCQ information nodes should be designated as model outputs so that they remain alive after TFLite conversion is finished.
+However, there can be too many nodes to designate by hand, and the resulting string is sometimes too large to copy and paste.
+`generate_bcq_output_arrays` generates the output_arrays list, which includes the BCQ information nodes.
+
+### How to use
+
+```bash
+generate_bcq_output_arrays \
+--input_path /path/to/original_model.pb \
+--output_path /path/to/output_arrays.txt
+```
+
+### How it works
+
+```
+[Original BCQ information nodes]
+const(value=[1, 2, 3, -1], name='const1')
+const(value=[1, 2, 3, -2], name='const2')
+const(value=[1, 2, 3, -3], name='const3')
+
+[Generated output_arrays]
+,const1,const2,const3
+```
+
+### Caution
+
+- The generated output_arrays string starts with a comma.
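As a usage sketch (not part of this commit; the paths and the "input"/"output" array names are assumptions), the generated file can be consumed with the TF1 converter API roughly as follows:

```python
# Sketch: feed the generated BCQ output_arrays to the TF1 TFLite converter.
# All file paths and array names below are hypothetical.
import tensorflow as tf

with open("/path/to/output_arrays.txt") as f:
    # The generated string starts with a comma, so drop empty entries.
    bcq_outputs = [name for name in f.read().split(",") if name]

converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(
    graph_def_file="/path/to/preserved_model.pb",
    input_arrays=["input"],
    output_arrays=["output"] + bcq_outputs)
tflite_model = converter.convert()
```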
diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays
new file mode 100644
index 000000000..48e8a9373
--- /dev/null
+++ b/compiler/bcq-tools/generate_bcq_output_arrays
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+
+import tensorflow as tf
+
+import argparse
+import sys
+
+
+def _get_parser():
+ """
+ Returns an ArgumentParser for generating output_arrays.
+ """
+ parser = argparse.ArgumentParser(
+        description=("Command line tool to generate output_arrays of BCQ nodes"))
+
+ # Input and output path.
+ parser.add_argument(
+ "-i",
+ "--input_path",
+ type=str,
+ help="Full filepath of the input file.",
+ required=True)
+ parser.add_argument(
+ "-o",
+ "--output_path",
+ type=str,
+ help="Full filepath of the output file.",
+ required=True)
+
+ return parser
+
+
+def load_graph(frozen_graph_filename):
+ """
+ Load graph from frozen pb file
+ """
+ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
+ graph_def = tf.compat.v1.GraphDef()
+ graph_def.ParseFromString(f.read())
+ with tf.Graph().as_default() as graph:
+ tf.import_graph_def(graph_def, name='')
+ return graph
+
+
+def dtype2str(dtype):
+ if dtype == "int32":
+ return "TF_INT32"
+ elif dtype == "int64":
+ return "TF_INT64"
+ elif dtype == "float32":
+ return "TF_FLOAT"
+ elif dtype == "bool":
+ return "TF_BOOL"
+ else:
+ raise Exception("Not supported dtype")
+
+
+def print_output_arrays(flags):
+ graph_model = load_graph(flags.input_path)
+ graph_model_def = graph_model.as_graph_def()
+ ops = graph_model.get_operations()
+
+ output_names = [op.outputs[0].name for op in ops
+ if op.type == "Const" and "bcqinfo_" in op.outputs[0].name]
+
+ output_arrays = ""
+ for output_name in output_names:
+ output_arrays += ","
+
+ colon_index = output_name.find(":")
+ if colon_index == -1:
+ output_arrays += output_name
+ else:
+ output_arrays += output_name[:colon_index]
+
+ f = open(flags.output_path, 'w')
+ f.write(output_arrays)
+ f.close()
+
+
+def main():
+ # Parse argument.
+ parser = _get_parser()
+ flags = parser.parse_known_args(args=sys.argv[1:])
+
+ print_output_arrays(flags[0])
+
+
+if __name__ == "__main__":
+ main()
diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info
new file mode 100644
index 000000000..2ede8d4d0
--- /dev/null
+++ b/compiler/bcq-tools/preserve_bcq_info
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+import tensorflow as tf
+import numpy as np
+
+import argparse
+import sys
+
+
+def _get_parser():
+ """
+ Returns an ArgumentParser for preserving BCQ information.
+ """
+ parser = argparse.ArgumentParser(
+ description=("Command line tool to preserve BCQ information"))
+
+ # Input and output path.
+ parser.add_argument(
+ "-i",
+ "--input_path",
+ type=str,
+ help="Full filepath of the input file.",
+ required=True)
+ parser.add_argument(
+ "-o",
+ "--output_path",
+ type=str,
+ help="Full filepath of the output file.",
+ required=True)
+
+ return parser
+
+
+def load_graph(frozen_graph_filename):
+ """
+ Load graph from frozen pb file
+ """
+ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
+ graph_def = tf.compat.v1.GraphDef()
+ graph_def.ParseFromString(f.read())
+ with tf.Graph().as_default() as graph:
+ tf.import_graph_def(graph_def, name='')
+ return graph
+
+
+def preserve_bcq_info(flags):
+ """
+ Generate unique dummy value from -1 to -N.
+
+ We use negative values to preserve BCQ information because
+ positive values may cause some confusion with real BCQ information values.
+ """
+
+ class UniqueValueGen:
+ def __init__(self):
+ self.unique_value = -1
+
+ def gen(self):
+ val = self.unique_value
+ self.unique_value = val - 1
+ return val
+
+ unique_value = UniqueValueGen()
+
+ original_graph_model = load_graph(flags.input_path)
+ original_graph_model_def = original_graph_model.as_graph_def()
+
+ new_graph = tf.compat.v1.GraphDef()
+ substitution_dict = {}
+
+ DT_INT32 = None # Just for copying DT_INT32 attribute value
+
+ for node in original_graph_model_def.node:
+ if node.op == "Const":
+ # Because bcqinfo_do_w_x is BOOL type, we cannot add dummy value at the end.
+            # Because bcqinfo_do_w_x is of BOOL type, we cannot append a dummy value to it.
+            # Therefore we convert its type to INT32.
+ original_tensor = tf.make_ndarray(node.attr["value"].tensor)
+ substitution_dict[node.name] = tf.make_tensor_proto(
+ [int(original_tensor[0]), unique_value.gen()], tf.int32)
+
+ preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters",
+ "/bcqinfo_qbits_of_clusters"]
+
+ if any(name in node.name for name in preserved_bcqinfo_list):
+ original_tensor = tf.make_ndarray(
+ node.attr["value"].tensor) # variable name change
+ substitution_dict[node.name] = tf.make_tensor_proto(
+ np.append(original_tensor, unique_value.gen()), tf.int32)
+ DT_INT32 = node.attr["dtype"]
+
+ for node in original_graph_model_def.node:
+ if node.name in substitution_dict:
+ new_node = new_graph.node.add()
+ new_node.op = "Const"
+ new_node.name = node.name
+ new_node.attr["dtype"].CopyFrom(DT_INT32)
+ new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name])
+ else:
+ new_node = new_graph.node.add()
+ new_node.CopyFrom(node)
+
+ tf.io.write_graph(new_graph, '.', flags.output_path, False)
+
+
+def main():
+ # Parse argument.
+ parser = _get_parser()
+ flags = parser.parse_known_args(args=sys.argv[1:])
+
+    # Generate a new pb file in which BCQ information is preserved.
+ preserve_bcq_info(flags[0])
+
+
+if __name__ == "__main__":
+ main()
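A quick sanity check for the output (an illustrative sketch under an assumed path, not part of the commit) is to reload the preserved graph and confirm that no two preserved bcqinfo constants still share identical values:

```python
# Sketch: verify that the preserved bcqinfo constants are now pairwise distinct.
import tensorflow as tf

PRESERVED = ("/bcqinfo_do_w_x", "/bcqinfo_number_of_clusters",
             "/bcqinfo_size_of_clusters", "/bcqinfo_qbits_of_clusters")

graph_def = tf.compat.v1.GraphDef()
with tf.io.gfile.GFile("/path/to/preserved_model.pb", "rb") as f:  # hypothetical path
    graph_def.ParseFromString(f.read())

seen = set()
for node in graph_def.node:
    if node.op == "Const" and any(s in node.name for s in PRESERVED):
        values = tuple(tf.make_ndarray(node.attr["value"].tensor).flatten().tolist())
        assert values not in seen, "duplicate constant survived: " + node.name
        seen.add(values)
print("all preserved bcqinfo constants are unique")
```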
diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt
index 1335057eb..009bfabea 100644
--- a/compiler/circle-quantizer/CMakeLists.txt
+++ b/compiler/circle-quantizer/CMakeLists.txt
@@ -13,5 +13,6 @@ target_link_libraries(circle-quantizer luci_service)
target_link_libraries(circle-quantizer luci_pass)
target_link_libraries(circle-quantizer luci_export)
target_link_libraries(circle-quantizer arser)
+target_link_libraries(circle-quantizer vconone)
install(TARGETS circle-quantizer DESTINATION bin)
diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake
index 2293e53f8..c21e28e8d 100644
--- a/compiler/circle-quantizer/requires.cmake
+++ b/compiler/circle-quantizer/requires.cmake
@@ -5,3 +5,4 @@ require("safemain")
require("luci")
require("oops")
require("arser")
+require("vconone")
diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
index b56b547a9..8d3a80c91 100644
--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
+++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
@@ -25,6 +25,7 @@
#include <oops/InternalExn.h>
#include <arser/arser.h>
+#include <vconone/vconone.h>
#include <functional>
#include <iostream>
@@ -36,6 +37,12 @@ using OptionHook = std::function<int(const char **)>;
using Algorithms = luci::CircleOptimizer::Options::Algorithm;
using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
+void print_version(void)
+{
+ std::cout << "circle-quantizer version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
+
int entry(int argc, char **argv)
{
// Simple argument parser (based on map)
@@ -49,13 +56,20 @@ int entry(int argc, char **argv)
arser::Arser arser("circle-quantizer provides circle model quantization");
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument(qdqw)
.nargs(3)
.type(arser::DataType::STR_VEC)
.required(false)
.help("Quantize-dequantize weight values required action before quantization. "
"Three arguments required: input_dtype(float32) "
- "output_dtype(uint8) granularity(layer)");
+ "output_dtype(uint8) granularity(layer, channel)");
arser.add_argument(qwmm)
.nargs(3)
@@ -63,7 +77,7 @@ int entry(int argc, char **argv)
.required(false)
.help("Quantize with min/max values. "
"Three arguments required: input_dtype(float32) "
- "output_dtype(uint8) granularity(layer)");
+ "output_dtype(uint8) granularity(layer, channel)");
arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp
index a55cd4574..38e3073aa 100644
--- a/compiler/circle-tensordump/driver/Driver.cpp
+++ b/compiler/circle-tensordump/driver/Driver.cpp
@@ -46,7 +46,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
std::unique_ptr<circletensordump::DumpInterface> dump;
diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp
index dfa78f031..a8d32564f 100644
--- a/compiler/circle-tensordump/src/Dump.cpp
+++ b/compiler/circle-tensordump/src/Dump.cpp
@@ -136,6 +136,7 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s
auto max = quant_param->max();
auto scale = quant_param->scale();
auto zero_point = quant_param->zero_point();
+ auto quantized_dimension = quant_param->quantized_dimension();
os << " " + print_format2 + "   ├── min : ";
::print_comma_sepearted(os, min);
@@ -146,9 +147,11 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s
os << " " + print_format2 + "   ├── scale : ";
::print_comma_sepearted(os, scale);
os << std::endl;
- os << " " + print_format2 + "   └── zero_point : ";
+ os << " " + print_format2 + "   ├── zero_point : ";
::print_comma_sepearted(os, zero_point);
os << std::endl;
+ os << " " + print_format2 + "   └── quantized_dimension : " << quantized_dimension;
+ os << std::endl;
}
// buffer
@@ -229,7 +232,7 @@ std::vector<hsize_t> hdf5_dims_cast(const flatbuffers::Vector<T> *data,
}
/**
- * This function writes data to given hdf5 file like below.
+ * This function writes vector data to given hdf5 file like below.
*
* GROUP "group_name"
* ㄴDATATYPE "type"
@@ -238,9 +241,9 @@ std::vector<hsize_t> hdf5_dims_cast(const flatbuffers::Vector<T> *data,
* ㄴDATA "data"
*/
template <typename T>
-void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
- const H5::PredType &type, const flatbuffers::Vector<T> *data,
- std::vector<hsize_t> dims)
+void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
+ const H5::PredType &type, const flatbuffers::Vector<T> *data,
+ std::vector<hsize_t> dims)
{
if (data == nullptr)
return;
@@ -250,6 +253,17 @@ void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string d
dataset->write(data->data(), type);
}
+/// @brief This function writes scalar data to given hdf5 file
+template <typename T>
+void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
+ const H5::PredType &type, T data)
+{
+ auto dataspace = std::make_unique<H5::DataSpace>(H5S_SCALAR);
+ auto dataset = std::make_unique<H5::DataSet>(
+ file.createDataSet(group_name + "/" + dataset_name, type, *dataspace));
+ dataset->write(&data, type);
+}
+
} // namespace
namespace circletensordump
@@ -297,8 +311,9 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
auto buff_data_ptr = reader.buffers()->Get(buff_idx)->data();
if (buff_data_ptr)
{
- ::write_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()),
- buff_data_ptr, ::hdf5_dims_cast(buff_data_ptr, tensor->shape()));
+ ::write_vector_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()),
+ buff_data_ptr,
+ ::hdf5_dims_cast(buff_data_ptr, tensor->shape()));
}
// write quantization parameters
@@ -306,17 +321,20 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
if (quant_param)
{
auto min = quant_param->min();
- ::write_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min,
- ::hdf5_dims_cast(min));
+ ::write_vector_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min,
+ ::hdf5_dims_cast(min));
auto max = quant_param->max();
- ::write_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max,
- ::hdf5_dims_cast(max));
+ ::write_vector_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max,
+ ::hdf5_dims_cast(max));
auto scale = quant_param->scale();
- ::write_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale,
- ::hdf5_dims_cast(scale));
+ ::write_vector_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale,
+ ::hdf5_dims_cast(scale));
auto zero_point = quant_param->zero_point();
- ::write_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, zero_point,
- ::hdf5_dims_cast(zero_point));
+ ::write_vector_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64,
+ zero_point, ::hdf5_dims_cast(zero_point));
+ auto quantized_dimension = quant_param->quantized_dimension();
+ ::write_scalar_data_to_hdf5(file, group_name, "quantized_dimension",
+ H5::PredType::NATIVE_INT32, quantized_dimension);
}
}
}
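For reference, the per-tensor scalar written by the new `write_scalar_data_to_hdf5` helper can be read back with h5py. A minimal sketch follows; the dump file name and the assumption that each top-level group corresponds to one tensor are illustrative, not part of the commit:

```python
# Sketch: read back the "quantized_dimension" scalar datasets with h5py.
import h5py

with h5py.File("dumped_tensors.h5", "r") as f:  # hypothetical dump file
    for group_name, group in f.items():
        if isinstance(group, h5py.Group) and "quantized_dimension" in group:
            # The dumper stores a single NATIVE_INT32 value per tensor group.
            print(group_name, int(group["quantized_dimension"][()]))
```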
diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp
index 1af31d986..7a44c65b9 100644
--- a/compiler/circle-verify/src/Driver.cpp
+++ b/compiler/circle-verify/src/Driver.cpp
@@ -35,7 +35,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
auto verifier = std::make_unique<VerifyFlatbuffers>();
diff --git a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
index 6663cb938..4bcaae347 100644
--- a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
+++ b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
@@ -1,25 +1,12 @@
nnas_include(TargetRequire)
unset(REQUIRED_TARGETS)
-list(APPEND REQUIRED_TARGETS circlechef)
list(APPEND REQUIRED_TARGETS circle-inspect)
list(APPEND REQUIRED_TARGETS circle-verify)
list(APPEND REQUIRED_TARGETS circle2circle)
list(APPEND REQUIRED_TARGETS dredd_rule_lib)
-list(APPEND REQUIRED_TARGETS tflchef)
-list(APPEND REQUIRED_TARGETS tflite2circle)
TargetRequire_Return(${REQUIRED_TARGETS})
-nncc_find_resource(TensorFlowLiteRecipes)
-nncc_find_resource(CircleRecipes)
-
-set(TFLITE_RECIPE_REPO "${TensorFlowLiteRecipes_DIR}")
-set(CIRCLE_RECIPE_REPO "${CircleRecipes_DIR}")
-unset(RECIPE_REPO)
-
-set(TEST_RECIPE_FILENAME "test.recipe")
-set(TEST_RULE_FILENAME "test.rule")
-
unset(TEST_DEPS)
unset(TEST_NAMES)
@@ -27,21 +14,9 @@ set(options "")
set(oneValueArgs "")
set(multiValueArgs PASS)
-macro(Add RECIPE)
- if(NOT EXISTS "${TFLITE_RECIPE_REPO}/${RECIPE}/test.recipe")
- if(NOT EXISTS "${CIRCLE_RECIPE_REPO}/${RECIPE}/test.recipe")
- message(FATAL_ERROR "Missing recipe of '${RECIPE}' test")
- else()
- set(RECIPE_REPO ${CIRCLE_RECIPE_REPO})
- endif()
- else()
- set(RECIPE_REPO ${TFLITE_RECIPE_REPO})
- endif()
-
- if(NOT EXISTS "${RECIPE_REPO}/${RECIPE}/test.rule")
- message(FATAL_ERROR "Missing rule of '${RECIPE}' test")
- endif()
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
+macro(Add RECIPE)
cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
unset(OPT_OPTIONS)
foreach(src ${ARG_PASS})
@@ -49,71 +24,20 @@ macro(Add RECIPE)
list(APPEND OPT_OPTIONS "--${src}")
endforeach(src ${ARG_PASS})
- set(RECIPE_FILE "${RECIPE}.recipe")
- set(RECIPE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RECIPE_FILENAME}")
- set(RECIPE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE_FILE}")
-
- set(RULE_FILE "${RECIPE}.rule")
- set(RULE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RULE_FILENAME}")
- set(RULE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RULE_FILE}")
-
- set(TFLITE_FILE "${RECIPE}.tflite")
- set(TFLITE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TFLITE_FILE}")
-
set(CIRCLE_FILE "${RECIPE}.circle")
- set(CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${CIRCLE_FILE}")
+ set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}")
set(OPT_CIRCLE_FILE "${RECIPE}.opt.circle")
set(OPT_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${OPT_CIRCLE_FILE}")
- # Copy .recipe
- add_custom_command(OUTPUT ${RECIPE_BINARY_PATH}
- COMMAND ${CMAKE_COMMAND} -E copy "${RECIPE_SOURCE_PATH}" "${RECIPE_BINARY_PATH}"
- DEPENDS ${RECIPE_SOURCE_PATH}
- COMMENT "Generate ${RECIPE_FILE}"
- )
-
- # Copy .rule
- add_custom_command(OUTPUT ${RULE_BINARY_PATH}
- COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}"
- DEPENDS ${RULE_SOURCE_PATH}
- COMMENT "Generate ${RULE_FILE}"
- )
-
- if(${RECIPE_REPO} STREQUAL ${TFLITE_RECIPE_REPO})
- # Generate .tflite
- add_custom_command(OUTPUT ${TFLITE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH} ${TFLITE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH}
- COMMENT "Generate ${TFLITE_FILE}"
- )
-
- # Generate .circle
- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH} ${CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH}
- COMMENT "Generate ${CIRCLE_FILE}"
- )
-
- list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH})
- else()
- # Generate .circle
- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH}
- COMMENT "Generate ${CIRCLE_FILE}"
- )
- endif()
-
# Generate optimized .circle
add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_OUTPUT_PATH}
+ COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
+ DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_PATH}
COMMENT "Generate ${OPT_CIRCLE_FILE}"
)
- list(APPEND TEST_DEPS ${RECIPE_BINARY_PATH} ${RULE_BINARY_PATH}
- ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH})
+ list(APPEND TEST_DEPS ${OPT_CIRCLE_OUTPUT_PATH})
list(APPEND TEST_NAMES ${RECIPE})
endmacro(Add)
@@ -174,12 +98,15 @@ list(APPEND TEST_DEPS "${RULE_LIB_BINARY_PATH}")
# Generate dependencies
add_custom_target(circle2circle_dredd_recipe_test ALL DEPENDS ${TEST_DEPS})
+add_dependencies(circle2circle_dredd_recipe_test common_artifacts_deps)
+
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
# Run tests
add_test(
NAME circle2circle_dredd_recipe_test
COMMAND "${TEST_RUNNER}"
"${TEST_CONFIG}"
- "${CMAKE_CURRENT_BINARY_DIR}"
+ "${ARTIFACTS_BIN_PATH}"
${TEST_NAMES}
)
diff --git a/compiler/circle2circle-dredd-recipe-test/requires.cmake b/compiler/circle2circle-dredd-recipe-test/requires.cmake
index e4a5b71a7..70e7c5295 100644
--- a/compiler/circle2circle-dredd-recipe-test/requires.cmake
+++ b/compiler/circle2circle-dredd-recipe-test/requires.cmake
@@ -1,7 +1,5 @@
-require("circlechef")
require("circle2circle")
require("circle-inspect")
require("circle-verify")
+require("common-artifacts")
require("dredd-rule-lib")
-require("tflchef")
-require("tflite2circle")
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 202f66938..6328a64db 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -11,9 +11,10 @@
## TFLITE RECIPE
Add(Net_InstanceNorm_001 PASS fuse_instnorm)
-# Add(Net_InstanceNorm_002 PASS fuse_instnorm)
+Add(Net_InstanceNorm_002 PASS fuse_instnorm)
Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
Add(MatMul_000 PASS resolve_customop_matmul)
+Add(DepthwiseConv2D_003 PASS)
## CIRCLE RECIPE
diff --git a/compiler/circle2circle-dredd-recipe-test/testall.sh b/compiler/circle2circle-dredd-recipe-test/testall.sh
index 33a2036bb..2899587ba 100755
--- a/compiler/circle2circle-dredd-recipe-test/testall.sh
+++ b/compiler/circle2circle-dredd-recipe-test/testall.sh
@@ -13,21 +13,22 @@ if [[ $# -lt 2 ]]; then
exit 255
fi
+WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
CONFIG_PATH="$1"; shift
-WORKDIR="$1"; shift
+RESOURCE_DIR="$1"; shift
source "${CONFIG_PATH}"
echo "-- Found circle-inspect: ${CIRCLE_INSPECT_PATH}"
echo "-- Found circle-verify: ${CIRCLE_VERIFY_PATH}"
echo "-- Found circle2circle: ${CIRCLE2CIRCLE_PATH}"
-echo "-- Found workdir: ${WORKDIR}"
+echo "-- Found common-artifacts: ${RESOURCE_DIR}"
TESTED=()
PASSED=()
FAILED=()
-pushd "${WORKDIR}"
+pushd ${WORKDIR}
while [[ $# -ne 0 ]]; do
PREFIX="$1"; shift
@@ -40,7 +41,7 @@ while [[ $# -ne 0 ]]; do
cat > "${PREFIX}.log" <(
exec 2>&1
- echo "-- Found tflite: ${PREFIX}.tflite"
+ echo "-- Found circle: ${PREFIX}.opt.circle"
# Exit immediately if any command fails
set -e
@@ -55,7 +56,7 @@ while [[ $# -ne 0 ]]; do
set +x
# (COMPILED_FILE, INSPECT_PROG_PATH, VERIFY_PROG_PATH, ERROR_LOG) must be set for rule-lib.sh
- COMPILED_FILE="${WORKDIR}/${PREFIX}.opt.circle"
+ COMPILED_FILE="${PREFIX}.opt.circle"
INSPECT_PROG_PATH=${CIRCLE_INSPECT_PATH}
VERIFY_PROG_PATH=${CIRCLE_VERIFY_PATH}
ERROR_LOG="${PREFIX}.error"
@@ -66,7 +67,7 @@ while [[ $# -ne 0 ]]; do
trap 'echo "** ERROR **" ; cat "${ERROR_LOG}"' ERR
source rule-lib.sh
- source "${PREFIX}.rule"
+ source "${RESOURCE_DIR}/${PREFIX}.rule"
# unset
trap - ERR
diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt
index 7b2bf9b02..f60c896d8 100644
--- a/compiler/circle2circle/CMakeLists.txt
+++ b/compiler/circle2circle/CMakeLists.txt
@@ -19,6 +19,7 @@ target_link_libraries(circle2circle luci_service)
target_link_libraries(circle2circle luci_pass)
target_link_libraries(circle2circle luci_export)
target_link_libraries(circle2circle arser)
+target_link_libraries(circle2circle vconone)
install(TARGETS circle2circle DESTINATION bin)
@@ -44,3 +45,4 @@ target_link_libraries(circle2circle_test luci_service)
target_link_libraries(circle2circle_test luci_pass)
target_link_libraries(circle2circle_test luci_export)
target_link_libraries(circle2circle_test arser)
+target_link_libraries(circle2circle_test vconone)
diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake
index 8cbb90dbf..36a9efd16 100644
--- a/compiler/circle2circle/requires.cmake
+++ b/compiler/circle2circle/requires.cmake
@@ -9,3 +9,4 @@ require("hermes")
require("hermes-std")
require("luci")
require("arser")
+require("vconone")
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 6888d26e3..849597b46 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -26,6 +26,7 @@
#include <oops/InternalExn.h>
#include <arser/arser.h>
+#include <vconone/vconone.h>
#include <functional>
#include <iostream>
@@ -34,6 +35,12 @@
using Algorithms = luci::CircleOptimizer::Options::Algorithm;
using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
+void print_version(void)
+{
+ std::cout << "circle2circle version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
+
int entry(int argc, char **argv)
{
// Simple argument parser (based on map)
@@ -44,6 +51,13 @@ int entry(int argc, char **argv)
arser::Arser arser("circle2circle provides circle model optimization and transformations");
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument("--all").nargs(0).required(false).default_value(false).help(
"Enable all optimize options");
diff --git a/compiler/circlechef/CMakeLists.txt b/compiler/circlechef/CMakeLists.txt
index cba7d0a4e..3e2ddcbb3 100644
--- a/compiler/circlechef/CMakeLists.txt
+++ b/compiler/circlechef/CMakeLists.txt
@@ -18,4 +18,6 @@ add_subdirectory(core)
add_subdirectory(circle)
# Tools
add_subdirectory(tools)
-add_subdirectory(tests)
+if(ENABLE_TEST)
+ add_subdirectory(tests)
+endif(ENABLE_TEST)
diff --git a/compiler/circlechef/circle/src/RecipeChef.cpp b/compiler/circlechef/circle/src/RecipeChef.cpp
index 17ef1be6e..51326c7f8 100644
--- a/compiler/circlechef/circle/src/RecipeChef.cpp
+++ b/compiler/circlechef/circle/src/RecipeChef.cpp
@@ -181,6 +181,8 @@ std::unique_ptr<ModelRecipe> generate_recipe(const circle::Model *model)
for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
chef_quant->add_zero_point(quant->zero_point()->Get(idx));
}
+ circlechef::TensorQuantization *chef_quant = operand->mutable_quant();
+ chef_quant->set_quantized_dimension(quant->quantized_dimension());
}
}
diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp
index 76aeacdd9..d81467d68 100644
--- a/compiler/circlechef/core/src/ModelChef.cpp
+++ b/compiler/circlechef/core/src/ModelChef.cpp
@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
quant_builder.add_min(quant_min);
quant_builder.add_scale(quant_scale);
quant_builder.add_zero_point(quant_zero_point);
+ quant_builder.add_quantized_dimension(quant.quantized_dimension());
// Update QuantizationParameters Index
quant_index = quant_builder.Finish();
diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto
index b8c009b38..3e5e6b168 100644
--- a/compiler/circlechef/proto/circlechef.proto
+++ b/compiler/circlechef/proto/circlechef.proto
@@ -35,6 +35,7 @@ message TensorQuantization {
repeated float max = 2;
repeated float scale = 3;
repeated int64 zero_point = 4;
+ optional int32 quantized_dimension = 5 [default = 0];
}
message Operand {
diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp
index a15da4002..bcc0c7ae9 100644
--- a/compiler/circlechef/tools/file/Driver.cpp
+++ b/compiler/circlechef/tools/file/Driver.cpp
@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
int32_t model_version = 1;
diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp
index 9c0b9ea24..8a2b85fc7 100644
--- a/compiler/circlechef/tools/reverse/Driver.cpp
+++ b/compiler/circlechef/tools/reverse/Driver.cpp
@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp
index b8f561fee..657f24fe0 100644
--- a/compiler/circledump/driver/Driver.cpp
+++ b/compiler/circledump/driver/Driver.cpp
@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << '\n';
std::cout << arser;
- return 0;
+ return 255;
}
std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index 2c0320396..5aa5d51c1 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -593,6 +593,20 @@ public:
}
};
+class UniquePrinter : public OpPrinter
+{
+public:
+ void options(const circle::Operator *op, std::ostream &os) const override
+ {
+ if (auto *params = op->builtin_options_as_UniqueOptions())
+ {
+ os << " ";
+ os << "idx_out_type(" << EnumNameTensorType(params->idx_out_type()) << ") ";
+ os << std::endl;
+ }
+ }
+};
+
class WhilePrinter : public OpPrinter
{
public:
@@ -744,6 +758,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_SUM] = make_unique<ReducerPrinter>();
_op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
// There is no Option for TOPK_V2
+ _op_map[circle::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
_op_map[circle::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
_op_map[circle::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index b614b7182..d3f560179 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -5,9 +5,12 @@
#[[ optimize : Exclude from circle optimization(circle2circle) ]]
## TensorFlowLiteRecipes
-optimize(ReLU6_000)
-optimize(Where_000)
-optimize(Where_001)
+optimize(Unique_000)
+optimize(Unique_001)
+optimize(Unique_002)
+optimize(Unique_003)
+optimize(Unique_U8_000)
+optimize(Unique_U8_001)
## CircleRecipes
@@ -46,6 +49,7 @@ tcgenerate(DepthToSpace_000)
tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation
tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation
tcgenerate(DepthwiseConv2D_U8_000)
+tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet
tcgenerate(Div_000)
tcgenerate(ELU_000)
tcgenerate(Equal_000)
@@ -96,7 +100,7 @@ tcgenerate(Neg_000)
tcgenerate(Net_Dangle_001)
tcgenerate(Net_InstanceNorm_001)
tcgenerate(Net_InstanceNorm_002)
-tcgenerate(Net_ZeroDim_001) # fix luci
+tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim
tcgenerate(NotEqual_000)
tcgenerate(OneHot_000)
tcgenerate(OneHot_001)
@@ -120,9 +124,9 @@ tcgenerate(ReduceProd_001)
tcgenerate(ReduceProd_002)
tcgenerate(ReduceProd_003)
tcgenerate(ReLU_000)
-tcgenerate(ReLU6_000) # luci NYI
+tcgenerate(ReLU6_000)
tcgenerate(ReLUN1To1_000)
-tcgenerate(Reshape_003) # fix luci
+tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option
tcgenerate(Reshape_U8_000)
tcgenerate(ResizeBilinear_000)
tcgenerate(ResizeNearestNeighbor_000)
@@ -148,7 +152,7 @@ tcgenerate(SpaceToBatchND_002)
tcgenerate(SpaceToBatchND_003)
tcgenerate(SpaceToDepth_000)
tcgenerate(SparseToDense_000)
-tcgenerate(SplitV_000) # fix luci
+tcgenerate(SplitV_000)
tcgenerate(Sqrt_000)
tcgenerate(Square_000)
tcgenerate(SquaredDifference_000)
@@ -164,22 +168,21 @@ tcgenerate(Sum_001)
tcgenerate(Tanh_000)
tcgenerate(Tile_000)
tcgenerate(Tile_U8_000)
-tcgenerate(TopKV2_000) # fix luci
-tcgenerate(TopKV2_001) # fix luci
-tcgenerate(TransposeConv_000) # fix interpreter
+tcgenerate(TopKV2_000)
+tcgenerate(TopKV2_001)
tcgenerate(Unique_000)
tcgenerate(Unique_001)
tcgenerate(Unique_002)
tcgenerate(Unique_003)
tcgenerate(Unique_U8_000)
tcgenerate(Unique_U8_001)
-tcgenerate(Where_000) # luci NYI
-tcgenerate(Where_001) # luci NYI
-tcgenerate(While_000) # fix luci
+tcgenerate(Where_000)
+tcgenerate(Where_001)
+tcgenerate(While_000)
tcgenerate(While_001)
tcgenerate(While_002)
tcgenerate(While_003)
-tcgenerate(YUV_TO_RGB_000) # fix luci
+tcgenerate(YUV_TO_RGB_000)
tcgenerate(YUV_TO_RGB_U8_000)
tcgenerate(ZerosLike_000)
diff --git a/compiler/hermes/src/hermes.test.cpp b/compiler/hermes/src/hermes.test.cpp
index 2cbc0939d..ea7ef65d8 100644
--- a/compiler/hermes/src/hermes.test.cpp
+++ b/compiler/hermes/src/hermes.test.cpp
@@ -18,7 +18,28 @@
#include <gtest/gtest.h>
-TEST(HermesTest, simple_usecase)
+namespace
{
- // TO BE FILLED
+
+class Logger final : public hermes::Source
+{
+public:
+ Logger(hermes::Context *ctx);
+ ~Logger();
+};
+
+Logger::Logger(hermes::Context *ctx) { activate(ctx->sources(), ctx->bus()); }
+Logger::~Logger() { deactivate(); }
+
+} // namespace
+
+TEST(HermesTest, logger_constructor_NEG)
+{
+ hermes::Context context;
+  // we expect a segmentation fault from nullptr->sources()
+ ASSERT_DEATH(Logger logger(&context), "");
+
+ SUCCEED();
}
+
+// TODO add HermesTest simple_usecase
diff --git a/compiler/locomotiv/src/Node/BiasEncode.test.cpp b/compiler/locomotiv/src/Node/BiasEncode.test.cpp
index cdb255ccb..4680f5c5a 100644
--- a/compiler/locomotiv/src/Node/BiasEncode.test.cpp
+++ b/compiler/locomotiv/src/Node/BiasEncode.test.cpp
@@ -90,6 +90,16 @@ template <typename T> void test()
}
} // namespace
-TEST(NodeExecution_BiasEncode, s32) { test<int32_t>(); }
+TEST(NodeExecution_BiasEncode, s32)
+{
+ test<int32_t>();
+
+ SUCCEED();
+}
-TEST(NodeExecution_BiasEncode, f32) { test<float>(); }
+TEST(NodeExecution_BiasEncode, f32)
+{
+ test<float>();
+
+ SUCCEED();
+}
diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp
index f1f3a52d3..7d942e1d0 100644
--- a/compiler/locomotiv/src/Node/MatMul.test.cpp
+++ b/compiler/locomotiv/src/Node/MatMul.test.cpp
@@ -142,6 +142,8 @@ TEST(NodeExecution_MatMul, f32_2x3_3x3)
};
run_test<float>(lhs, rhs, out, Shape{2, 3}, Shape{3, 3}, Shape{2, 3}, loco::DataType::FLOAT32);
+
+ SUCCEED();
}
/* from the code below:
@@ -183,6 +185,8 @@ TEST(NodeExecution_MatMul, s32_4x2_2x6)
};
run_test<int32_t>(lhs, rhs, out, Shape{4, 2}, Shape{2, 6}, Shape{4, 6}, loco::DataType::S32);
+
+ SUCCEED();
}
// clang-format on
diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp
index c9808d3a2..aff9ebe5f 100644
--- a/compiler/locop/src/FormattedGraph.test.cpp
+++ b/compiler/locop/src/FormattedGraph.test.cpp
@@ -28,6 +28,8 @@ TEST(LinearV1FormatterTest, simple)
// TODO Validate the output (when the implementation becomes stable)
std::cout << locop::fmt<locop::LinearV1>(g) << std::endl;
+
+ SUCCEED();
}
TEST(LinearV1FormatterTest, user_defined_node_summary_builder)
diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp
index 0f0017ab4..fc85df3a6 100644
--- a/compiler/locop/src/FormattedTensorShape.test.cpp
+++ b/compiler/locop/src/FormattedTensorShape.test.cpp
@@ -30,4 +30,6 @@ TEST(FormattedTensorShapeTest, BracketFormat)
tensor_shape->dim(0) = 4;
std::cout << fmt<TensorShapeFormat::Bracket>(tensor_shape.get()) << std::endl;
+
+ SUCCEED();
}
diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
index 998789882..4ac3d8660 100644
--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
+++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
@@ -79,12 +79,11 @@ private:
//
// Note that due to historical and performance reasons, per-tensor quantization uses unsigned
// integer types, while per-channel uses signed types assuming 'zero_point' == 0.
-//
-// TODO Add 'quantized_dimension' field for per-channel case when IR provides it.
struct AffineQuantization
{
std::vector<float> scale;
std::vector<int32_t> zero_point;
+ int32_t quantized_dimension;
};
class Tensor
@@ -108,6 +107,12 @@ public:
return _quantization.zero_point[0];
}
+ const std::vector<float> &scales() const { return _quantization.scale; }
+
+ const std::vector<int32_t> &zero_points() const { return _quantization.zero_point; }
+
+ int32_t quantized_dimension() const { return _quantization.quantized_dimension; }
+
template <typename T> const T *data() const { return reinterpret_cast<const T *>(_data.get()); }
template <typename T> T *data() { return reinterpret_cast<T *>(_data.get()); }
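The new quantized_dimension field is what separates per-channel from per-tensor quantization: a per-channel tensor carries one scale/zero-point pair per slice along that dimension. A minimal, self-contained sketch of the conventional dequantization rule real = scale[c] * (q - zero_point[c]), with c taken along the quantized dimension of a row-major buffer; this is illustrative math, not interpreter code:

    #include <cstdint>
    #include <vector>

    // Dequantize a per-channel int8 buffer laid out row-major with the given dims.
    // 'quantized_dimension' selects which axis the scale/zero_point vectors index.
    std::vector<float> dequantize_per_channel(const std::vector<int8_t> &q,
                                              const std::vector<int32_t> &dims,
                                              const std::vector<float> &scale,
                                              const std::vector<int32_t> &zero_point,
                                              int32_t quantized_dimension)
    {
      // Stride of one step along the quantized dimension (product of trailing dims).
      int32_t inner = 1;
      for (size_t d = quantized_dimension + 1; d < dims.size(); ++d)
        inner *= dims[d];
      const int32_t channels = dims[quantized_dimension];

      std::vector<float> result(q.size());
      for (size_t i = 0; i < q.size(); ++i)
      {
        // Index of element i along the quantized dimension.
        const int32_t c = static_cast<int32_t>(i / inner) % channels;
        result[i] = scale[c] * static_cast<float>(q[i] - zero_point[c]);
      }
      return result;
    }

Per the comment retained above, the per-channel path in this interpreter assumes signed types with zero_point == 0, in which case the zero_point term simply drops out.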
diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
index a32e0d4a5..65d119761 100644
--- a/compiler/luci-interpreter/src/core/KernelParams.h
+++ b/compiler/luci-interpreter/src/core/KernelParams.h
@@ -56,6 +56,11 @@ struct Conv2DParams
Activation activation;
};
+struct DepthToSpaceParams
+{
+ int block_size;
+};
+
struct DepthwiseConv2DParams
{
Padding padding;
diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
index fe3623135..a1fd1deaf 100644
--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
@@ -12,6 +12,8 @@ set(SOURCES
Concatenation.cpp
Conv2D.h
Conv2D.cpp
+ DepthToSpace.h
+ DepthToSpace.cpp
DepthwiseConv2D.h
DepthwiseConv2D.cpp
Elu.h
@@ -40,6 +42,10 @@ set(SOURCES
Pad.cpp
Reshape.h
Reshape.cpp
+ Reverse.h
+ Reverse.cpp
+ Slice.h
+ Slice.cpp
Softmax.h
Softmax.cpp
SpaceToDepth.h
@@ -77,6 +83,7 @@ set(TEST_SOURCES
AveragePool2D.test.cpp
Concatenation.test.cpp
Conv2D.test.cpp
+ DepthToSpace.test.cpp
DepthwiseConv2D.test.cpp
Elu.test.cpp
FullyConnected.test.cpp
@@ -91,6 +98,8 @@ set(TEST_SOURCES
Mul.test.cpp
Pad.test.cpp
Reshape.test.cpp
+ Reverse.test.cpp
+ Slice.test.cpp
Softmax.test.cpp
SpaceToDepth.test.cpp
Split.test.cpp
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
new file mode 100644
index 000000000..cab63e26d
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DepthToSpace.h"
+#include "Utils.h"
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params)
+ : KernelWithParams<DepthToSpaceParams>({input}, {output}, params)
+{
+}
+
+void DepthToSpace::configure()
+{
+ if (input()->shape().num_dims() != 4)
+ {
+ throw std::runtime_error("Invalid input num_dims.");
+ }
+ if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 &&
+ output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 &&
+ output()->element_type() != DataType::S64)
+ {
+ throw std::runtime_error("Invalid output type");
+ }
+ if (input()->element_type() != output()->element_type())
+ {
+ throw std::runtime_error("Type mismatch on input and output.");
+ }
+ const int block_size = params().block_size;
+ const int32_t input_height = input()->shape().dim(1);
+ const int32_t input_width = input()->shape().dim(2);
+ const int32_t input_channels = input()->shape().dim(3);
+ int32_t output_height = input_height * block_size;
+ int32_t output_width = input_width * block_size;
+ int32_t output_channels = input_channels / block_size / block_size;
+
+ assert(input_height == output_height / block_size);
+ assert(input_width == output_width / block_size);
+ assert(input_channels == output_channels * block_size * block_size);
+
+ Shape output_shape(4);
+ output_shape.dim(0) = input()->shape().dim(0);
+ output_shape.dim(1) = output_height;
+ output_shape.dim(2) = output_width;
+ output_shape.dim(3) = output_channels;
+
+ output()->resize(output_shape);
+}
+
+void DepthToSpace::execute() const
+{
+ tflite::DepthToSpaceParams op_params;
+ op_params.block_size = params().block_size;
+ switch (input()->element_type())
+ {
+ case DataType::FLOAT32:
+ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
+ getTensorData<float>(input()), getTensorShape(output()),
+ getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
+ getTensorData<uint8_t>(input()), getTensorShape(output()),
+ getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported Type.");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
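In NHWC terms, configure() above implements the standard DEPTH_TO_SPACE shape relation: (N, H, W, C) with block size b becomes (N, H*b, W*b, C/(b*b)), which is only well-formed when C is divisible by b*b. A small standalone sketch of just that shape computation, with the numbers from the test file below worked through in the comments (illustrative only, not the kernel's code):

    #include <array>
    #include <cstdint>
    #include <stdexcept>

    std::array<int32_t, 4> depth_to_space_shape(const std::array<int32_t, 4> &in, int block)
    {
      const int32_t n = in[0], h = in[1], w = in[2], c = in[3];
      if (block <= 0 || c % (block * block) != 0)
        throw std::runtime_error("channels must be divisible by block_size^2");
      // Example from DepthToSpace.test.cpp: {1, 1, 2, 4} with block 2 -> {1, 2, 4, 1}.
      return {n, h * block, w * block, c / (block * block)};
    }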
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-interpreter/src/kernels/DepthToSpace.h
new file mode 100644
index 000000000..63ce37610
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class DepthToSpace : public KernelWithParams<DepthToSpaceParams>
+{
+public:
+ DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params);
+
+ const Tensor *input() const { return _inputs[0]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
new file mode 100644
index 000000000..1b805702d
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/DepthToSpace.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class DepthToSpaceTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(DepthToSpaceTest, DataTypes);
+
+TYPED_TEST(DepthToSpaceTest, SimpleCase)
+{
+ std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+ Shape input_shape{1, 1, 2, 4};
+ std::vector<TypeParam> output_data{1, 2, 5, 6, 3, 4, 7, 8};
+ std::vector<int32_t> output_shape{1, 2, 4, 1};
+
+ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
+ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+
+ DepthToSpaceParams params{};
+ params.block_size = 2;
+
+ DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+ ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
index fad450d66..f53eaca94 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
@@ -45,12 +45,9 @@ TEST(L2NormalizeTest, Float)
ElementsAreArray(ArrayFloatNear(ref_output_data)));
}
-TEST(L2NormalizeTest, Uint8Quantized)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
-}
+// TODO Uint8Quantized
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
} // namespace
} // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
index b0c06e7a3..c79d3d6bc 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
@@ -61,15 +61,14 @@ TEST(LeakReluTest, FloatSimple)
1.0f, -0.5f, -1.0f, // Row 2
},
/*alpha=*/0.5f, getElementType<float>());
-}
-TEST(LeakReluTest, Uint8Simple)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
+ SUCCEED();
}
+// TODO Uint8Simple
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
+
} // namespace
} // namespace kernels
} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
index 17456a4a8..00feddf3d 100644
--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
@@ -49,10 +49,8 @@ TEST(LogisticTest, Float)
// TODO make a Shape checking of output_tensor.
}
-TEST(LogisticTest, Uint8)
-{
- // Need to Implement GetDequantizedOutput Function.
-}
+// TODO Uint8
+// Need to Implement GetDequantizedOutput Function.
} // namespace
} // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp
new file mode 100644
index 000000000..a46308412
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Reverse.h"
+#include "kernels/Utils.h"
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output)
+ : Kernel({input, axes}, {output})
+{
+}
+
+void Reverse::configure()
+{
+ assert(axes()->shape().num_dims() == 1);
+ assert(input()->shape().num_dims() >= axes()->shape().num_elements());
+ if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 &&
+ input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 &&
+ input()->element_type() != DataType::S64)
+ {
+ throw std::runtime_error("Unsupported input type.");
+ }
+ if (axes()->element_type() != DataType::S32)
+ {
+ throw std::runtime_error("Unsupported axes type.");
+ }
+ if (axes()->shape().num_elements() > 1)
+ {
+ throw std::runtime_error("Current implementation does not support more than 1 axis.");
+ }
+ int axis_value = getTensorData<int32_t>(axes())[0];
+ if (axis_value < 0 || axis_value >= input()->shape().num_dims())
+ {
+ throw std::runtime_error("Invalid axes value");
+ }
+ assert(input()->element_type() == output()->element_type());
+
+ output()->resize(input()->shape());
+}
+
+void Reverse::execute() const
+{
+ int axis_value = getTensorData<int32_t>(axes())[0];
+ switch (output()->element_type())
+ {
+ case DataType::FLOAT32:
+ tflite::reference_ops::Reverse<float>(axis_value, getTensorShape(input()),
+ getTensorData<float>(input()), getTensorShape(output()),
+ getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ tflite::reference_ops::Reverse<uint8_t>(
+ axis_value, getTensorShape(input()), getTensorData<uint8_t>(input()),
+ getTensorShape(output()), getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported output type");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
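Note that configure() restricts this kernel to a single reverse axis (exactly one element in the axes tensor) that must lie within the input rank. For reference, reversing a row-major buffer along axis k just mirrors the index along that axis; a compact sketch of that index arithmetic, independent of the TFLite reference op used above and purely illustrative:

    #include <cstdint>
    #include <vector>

    template <typename T>
    std::vector<T> reverse_axis(const std::vector<T> &in, const std::vector<int32_t> &dims,
                                int axis)
    {
      // Row-major stride of one step along 'axis' (product of trailing dims).
      int32_t inner = 1;
      for (size_t d = axis + 1; d < dims.size(); ++d)
        inner *= dims[d];
      const int32_t n = dims[axis];

      std::vector<T> out(in.size());
      for (size_t i = 0; i < in.size(); ++i)
      {
        // Replace the index along 'axis' by its mirror (n - 1 - idx).
        const int32_t idx = static_cast<int32_t>(i / inner) % n;
        const std::int64_t delta = static_cast<std::int64_t>(n - 1 - 2 * idx) * inner;
        out[static_cast<size_t>(static_cast<std::int64_t>(i) + delta)] = in[i];
      }
      return out;
    }

With the test data below (shape {4, 3, 2}, axis 1), this maps element 1 to position 4 and element 5 to position 0, matching the expected output {5, 6, 3, 4, 1, 2, ...}.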
diff --git a/compiler/luci-interpreter/src/kernels/Reverse.h b/compiler/luci-interpreter/src/kernels/Reverse.h
new file mode 100644
index 000000000..3489dae28
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Reverse.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H
+#define LUCI_INTERPRETER_KERNELS_REVERSE_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Reverse : public Kernel
+{
+public:
+ Reverse(const Tensor *input, const Tensor *axes, Tensor *output);
+
+ const Tensor *input() const { return _inputs[0]; }
+ const Tensor *axes() const { return _inputs[1]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H
diff --git a/compiler/luci-interpreter/src/kernels/Reverse.test.cpp b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp
new file mode 100644
index 000000000..5475a8bd3
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Reverse.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class ReverseTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(ReverseTest, DataTypes);
+
+TYPED_TEST(ReverseTest, MultiDimensions)
+{
+ // TypeParam
+ std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+ Shape input_shape{4, 3, 2};
+ std::vector<int32_t> axis_data{1};
+ Shape axis_shape{1};
+
+ std::vector<TypeParam> output_data{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8,
+ 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20};
+ std::vector<int32_t> output_shape{4, 3, 2};
+
+ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
+ Tensor axis_tensor = makeInputTensor<DataType::S32>(axis_shape, axis_data);
+
+ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+
+ Reverse kernel = Reverse(&input_tensor, &axis_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+ ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp
new file mode 100644
index 000000000..c4bc3c57c
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Slice.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Slice.h"
+#include "Utils.h"
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+#include <cassert>
+#include <cstring>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+const int max_dim = 4;
+
+Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output)
+ : Kernel({input, begin, size}, {output})
+{
+}
+
+template <typename T>
+Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size)
+{
+ Shape output_shape = Shape(input->shape().num_dims());
+ for (int idx = 0; idx < input->shape().num_dims(); idx++)
+ {
+ T size_value = getTensorData<T>(size)[idx];
+ if (size_value < 0)
+ {
+ if (size_value != -1)
+ {
+ throw std::runtime_error("Invalid size.");
+ }
+ size_value = input->shape().dim(idx) - getTensorData<T>(begin)[idx];
+ }
+ else
+ {
+ if (input->shape().dim(idx) < getTensorData<T>(begin)[idx] + size_value)
+ {
+ throw std::runtime_error("Invalid begin and size.");
+ }
+ }
+ output_shape.dim(idx) = static_cast<int>(size_value);
+ }
+ return output_shape;
+}
+
+template <typename T>
+void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size,
+ std::vector<int> *begins, std::vector<int> *sizes)
+{
+ for (int idx = dimensions - 1; idx >= 0; --idx)
+ {
+ begins->push_back(getTensorData<T>(begin)[idx]);
+ sizes->push_back(getTensorData<T>(size)[idx]);
+ }
+}
+
+void Slice::configure()
+{
+ assert(input()->element_type() == output()->element_type());
+ assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64);
+ assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64);
+ assert(begin()->shape().num_dims() == 1);
+ assert(size()->shape().num_dims() == 1);
+ assert(input()->shape().num_dims() <= max_dim);
+
+ if (begin()->element_type() == DataType::S32)
+ {
+ output()->resize(calculateOutputShape<int32_t>(input(), begin(), size()));
+ }
+ else if (begin()->element_type() == DataType::S64)
+ {
+ output()->resize(calculateOutputShape<int64_t>(input(), begin(), size()));
+ }
+ else
+ {
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+void Slice::execute() const
+{
+ std::vector<int> begins;
+ begins.reserve(max_dim);
+ std::vector<int> sizes;
+ sizes.reserve(max_dim);
+ if (begin()->element_type() == DataType::S32)
+ {
+ getBeginAndSizeVectors<int32_t>(input()->shape().num_dims(), begin(), size(), &begins, &sizes);
+ }
+ else if (begin()->element_type() == DataType::S64)
+ {
+ getBeginAndSizeVectors<int64_t>(input()->shape().num_dims(), begin(), size(), &begins, &sizes);
+ }
+ else
+ {
+ throw std::runtime_error("Unsupported begin type.");
+ }
+ for (int i = input()->shape().num_dims(); i < max_dim; ++i)
+ {
+ begins.push_back(0);
+ sizes.push_back(1);
+ }
+
+ assert(begins.size() == 4);
+ assert(sizes.size() == 4);
+ tflite::SliceParams op_params{};
+ op_params.begin_count = 4;
+ op_params.size_count = 4;
+ for (int i = 0; i < 4; i++)
+ {
+ op_params.begin[i] = begins[3 - i];
+ op_params.size[i] = sizes[3 - i];
+ }
+ switch (input()->element_type())
+ {
+ case DataType::FLOAT32:
+ tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
+ getTensorData<float>(input()), getTensorShape(output()),
+ getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
+ getTensorData<uint8_t>(input()), getTensorShape(output()),
+ getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported input type.");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
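The calculateOutputShape() helper above encodes the usual SLICE convention: size[i] == -1 means "everything from begin[i] to the end of that dimension", and any other value must keep begin[i] + size[i] within the input extent. A tiny worked example of that rule (illustrative; it mirrors the inputs used by the test further down):

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    std::vector<int32_t> slice_output_dims(const std::vector<int32_t> &in_dims,
                                           const std::vector<int32_t> &begin,
                                           const std::vector<int32_t> &size)
    {
      std::vector<int32_t> out(in_dims.size());
      for (size_t i = 0; i < in_dims.size(); ++i)
      {
        // -1 takes the remainder of the dimension; otherwise the range must fit.
        const int32_t extent = (size[i] == -1) ? in_dims[i] - begin[i] : size[i];
        if (extent < 0 || begin[i] + extent > in_dims[i])
          throw std::runtime_error("invalid begin/size");
        out[i] = extent;
      }
      return out;
    }

    // slice_output_dims({3, 2, 3, 1}, {1, 0, 0, 0}, {2, 1, -1, 1}) == {2, 1, 3, 1},
    // matching Slice.test.cpp below.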
diff --git a/compiler/luci-interpreter/src/kernels/Slice.h b/compiler/luci-interpreter/src/kernels/Slice.h
new file mode 100644
index 000000000..23c359608
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Slice.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H
+#define LUCI_INTERPRETER_KERNELS_SLICE_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Slice : public Kernel
+{
+public:
+ Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output);
+
+ const Tensor *input() const { return _inputs[0]; }
+ const Tensor *begin() const { return _inputs[1]; }
+ const Tensor *size() const { return _inputs[2]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SLICE_H
diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
new file mode 100644
index 000000000..a360a29cc
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Slice.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class SliceTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(SliceTest, DataTypes);
+
+TYPED_TEST(SliceTest, SimpleTest)
+{
+ std::vector<TypeParam> input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
+ Shape input_shape{3, 2, 3, 1};
+ std::vector<int32_t> begin_data{1, 0, 0, 0};
+ Shape begin_shape{4};
+ std::vector<int32_t> size_data{2, 1, -1, 1};
+ Shape size_shape{4};
+ std::vector<TypeParam> output_data{3, 3, 3, 5, 5, 5};
+ std::vector<int32_t> output_shape{2, 1, 3, 1};
+
+ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
+ Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
+ Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+
+ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+
+ Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+ ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
index 3386d3683..b8c0ac497 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
@@ -68,6 +68,8 @@ TEST(TransposeConvTest, FloatSimple)
/*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365},
/*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
getElementType<float>());
+
+ SUCCEED();
}
TEST(TransposeConvTest, FloatTwoFiltersTest)
@@ -82,21 +84,18 @@ TEST(TransposeConvTest, FloatTwoFiltersTest)
3352, 3652, 2760},
/*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
getElementType<float>());
-}
-TEST(TransposeConvTest, Uint8Simple)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
-}
-TEST(TransposeConvTest, Uint8FiltersTest)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
+ SUCCEED();
}
+// TODO Uint8Simple
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
+
+// TODO Uint8FiltersTest
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
+
} // namespace
} // namespace kernels
} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt
index fb36c4ab0..d99485d06 100644
--- a/compiler/luci-interpreter/src/loader/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt
@@ -1,3 +1,5 @@
+nnas_find_package(GTest REQUIRED)
+
set(SOURCES
GraphLoader.h
GraphLoader.cpp
@@ -13,3 +15,8 @@ target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SO
target_link_libraries(luci_interpreter_loader
PUBLIC luci_lang luci_interpreter_core
PRIVATE luci_interpreter_kernels nncc_common)
+
+set(TEST_SOURCES KernelBuilder.test.cpp)
+
+GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES})
+target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader)
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
index 779fa0647..6ebf979d3 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
@@ -16,7 +16,6 @@
#include "loader/GraphLoader.h"
-#include "loader/ModuleLoader.h"
#include "loader/KernelBuilder.h"
#include <loco/IR/Algorithm.h>
@@ -71,6 +70,7 @@ bool isExecutableNode(const luci::CircleNode *node)
{
// These nodes denote inputs / outputs of a graph.
case luci::CircleOpcode::CONST:
+ case luci::CircleOpcode::CIRCLECONST:
case luci::CircleOpcode::CIRCLEINPUT:
case luci::CircleOpcode::CIRCLEOUTPUT:
// The following nodes denote outputs of multiple-output nodes.
@@ -102,11 +102,12 @@ bool isTensorProducingNode(const luci::CircleNode *node)
} // namespace
-GraphLoader::GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph,
- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
- std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
- : _module_loader(module_loader), _graph(graph), _runtime_graph(runtime_graph),
- _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor)
+GraphLoader::GraphLoader(
+ const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+ std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+ : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir),
+ _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
{
}
@@ -136,6 +137,7 @@ void GraphLoader::loadTensors()
const luci::CircleQuantParam *params = node->quantparam();
quantization.scale.assign(params->scale.cbegin(), params->scale.cend());
quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend());
+ quantization.quantized_dimension = params->quantized_dimension;
}
auto tensor = std::make_unique<Tensor>(node->dtype(), std::move(shape), std::move(quantization),
@@ -178,7 +180,7 @@ void GraphLoader::initInputOutputTensors() const
void GraphLoader::loadOperators()
{
- KernelBuilder kernel_builder(_module_loader, *this);
+ KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor);
// Create kernels for executable nodes. This has to be done in execution order.
for (const loco::Node *loco_node :
@@ -195,11 +197,4 @@ void GraphLoader::loadOperators()
}
}
-void GraphLoader::load()
-{
- loadTensors();
- initInputOutputTensors();
- loadOperators();
-}
-
} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h
index e0adc0f6c..89c5bcad7 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.h
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.h
@@ -27,29 +27,23 @@
namespace luci_interpreter
{
-class ModuleLoader;
-
class GraphLoader
{
public:
- GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph,
- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+ GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor);
- void load();
-
- Tensor *getTensorForNode(const loco::Node *node) const { return _node_to_tensor.at(node); }
-
-private:
- void loadOperators();
- void initInputOutputTensors() const;
void loadTensors();
+ void initInputOutputTensors() const;
+ void loadOperators();
- const ModuleLoader &_module_loader;
+private:
const loco::Graph *_graph;
RuntimeGraph *_runtime_graph;
RuntimeToIR &_runtime_to_ir;
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
};
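With load() removed and the three phases made public, the caller (the module loader, per the constructor change above) now drives them explicitly, in the same order the deleted load() used. A minimal sketch of that call sequence, matching how KernelBuilder.test.cpp further down constructs a loader; it assumes the code sits inside namespace luci_interpreter with the surrounding setup already done:

    #include "loader/GraphLoader.h"

    #include <unordered_map>

    void load_one_graph(
        const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
        const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
        std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
    {
      GraphLoader loader(graph, runtime_graph, runtime_to_ir, graph_to_runtime_graph,
                         node_to_tensor);
      // The former GraphLoader::load() body, now driven by the caller:
      loader.loadTensors();
      loader.initInputOutputTensors();
      loader.loadOperators();
    }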
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
index 56da961dd..c19f8973f 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
@@ -21,6 +21,7 @@
#include "kernels/AveragePool2D.h"
#include "kernels/Concatenation.h"
#include "kernels/Conv2D.h"
+#include "kernels/DepthToSpace.h"
#include "kernels/DepthwiseConv2D.h"
#include "kernels/Elu.h"
#include "kernels/FullyConnected.h"
@@ -35,6 +36,8 @@
#include "kernels/Mul.h"
#include "kernels/Pad.h"
#include "kernels/Reshape.h"
+#include "kernels/Reverse.h"
+#include "kernels/Slice.h"
#include "kernels/Softmax.h"
#include "kernels/SpaceToDepth.h"
#include "kernels/Split.h"
@@ -43,8 +46,6 @@
#include "kernels/Unpack.h"
#include "kernels/Transpose.h"
#include "kernels/TransposeConv.h"
-#include "loader/GraphLoader.h"
-#include "loader/ModuleLoader.h"
#include <stdexcept>
@@ -68,7 +69,7 @@ static std::vector<const loco::Node *> collectOutputNodes(const luci::CircleNode
const Tensor *KernelBuilder::getInputTensor(const loco::Node *node) const
{
- const Tensor *tensor = _graph_loader.getTensorForNode(node);
+ const Tensor *tensor = _node_to_tensor.at(node);
assert(tensor != nullptr);
return tensor;
}
@@ -81,7 +82,7 @@ const Tensor *KernelBuilder::getOptionalInputTensor(const loco::Node *node) cons
Tensor *KernelBuilder::getOutputTensor(const loco::Node *node) const
{
- Tensor *tensor = _graph_loader.getTensorForNode(node);
+ Tensor *tensor = _node_to_tensor.at(node);
assert(tensor != nullptr);
return tensor;
}
@@ -98,7 +99,7 @@ KernelBuilder::getOutputTensors(const std::vector<const loco::Node *> &nodes) co
RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const
{
- RuntimeGraph *runtime_graph = _module_loader.getRuntimeGraph(graph);
+ RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph);
assert(runtime_graph != nullptr);
return runtime_graph;
}
@@ -120,14 +121,14 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAdd *node)
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleArgMax *node)
{
assert(node->arity() == 2);
- const Tensor *input1 = getInputTensor(node->input());
- const Tensor *input2 = getInputTensor(node->dimension());
+ const Tensor *input = getInputTensor(node->input());
+ const Tensor *axis = getInputTensor(node->dimension());
Tensor *output = getOutputTensor(node);
ArgMaxParams params{};
params.output_type = node->output_type();
- return std::make_unique<kernels::ArgMax>(input1, input2, output, params);
+ return std::make_unique<kernels::ArgMax>(input, axis, output, params);
}
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAveragePool2D *node)
@@ -188,6 +189,19 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleConv2D *node)
return std::make_unique<kernels::Conv2D>(input, filter, bias, output, params);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDepthToSpace *node)
+{
+ assert(node->arity() == 1);
+
+ const Tensor *input = getInputTensor(node->input());
+ Tensor *output = getOutputTensor(node);
+
+ DepthToSpaceParams params{};
+ params.block_size = node->block_size();
+
+ return std::make_unique<kernels::DepthToSpace>(input, output, params);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDepthwiseConv2D *node)
{
assert(node->arity() == 3);
@@ -224,14 +238,14 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleFullyConnected *n
assert(node->arity() == 3);
const Tensor *input = getInputTensor(node->input());
- const Tensor *filter = getInputTensor(node->weights());
+ const Tensor *weights = getInputTensor(node->weights());
const Tensor *bias = getOptionalInputTensor(node->bias());
Tensor *output = getOutputTensor(node);
FullyConnectedParams params{};
params.activation = node->fusedActivationFunction();
- return std::make_unique<kernels::FullyConnected>(input, filter, bias, output, params);
+ return std::make_unique<kernels::FullyConnected>(input, weights, bias, output, params);
}
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleIf *node)
@@ -255,6 +269,11 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleIf *node)
else_graph);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleInput *)
+{
+ throw std::runtime_error("Input node cannot be executed.");
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleL2Normalize *node)
{
assert(node->arity() == 1);
@@ -323,11 +342,6 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLogistic *node)
return std::make_unique<kernels::Logistic>(input, output);
}
-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleInput *)
-{
- throw std::runtime_error("Input node cannot be executed.");
-}
-
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMaxPool2D *node)
{
assert(node->arity() == 1);
@@ -402,6 +416,30 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReshape *node)
return std::make_unique<kernels::Reshape>(input, shape, output);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReverseV2 *node)
+{
+ assert(node->arity() == 2);
+
+ const Tensor *input = getInputTensor(node->tensor());
+ const Tensor *axes = getInputTensor(node->axis());
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::Reverse>(input, axes, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSlice *node)
+{
+ assert(node->arity() == 3);
+
+ const Tensor *input = getInputTensor(node->input());
+ const Tensor *begin = getInputTensor(node->begin());
+ const Tensor *size = getInputTensor(node->size());
+
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::Slice>(input, begin, size, output);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSoftmax *node)
{
assert(node->arity() == 1);
@@ -442,6 +480,19 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSplit *node)
return std::make_unique<kernels::Split>(axis, input, std::move(outputs));
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqueeze *node)
+{
+ assert(node->arity() == 1);
+
+ const Tensor *input = getInputTensor(node->input());
+ Tensor *output = getOutputTensor(node);
+
+ SqueezeParams params{};
+ params.squeeze_dims = node->squeeze_dims();
+
+ return std::make_unique<kernels::Squeeze>(input, output, params);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleStridedSlice *node)
{
assert(node->arity() == 4);
@@ -463,21 +514,15 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleStridedSlice *nod
return std::make_unique<kernels::StridedSlice>(input, begin, end, strides, output, params);
}
-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqueeze *node)
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTranspose *node)
{
- assert(node->arity() == 1);
+ assert(node->arity() == 2);
- const Tensor *input = getInputTensor(node->input());
+ const Tensor *input = getInputTensor(node->a());
+ const Tensor *perm = getInputTensor(node->perm());
Tensor *output = getOutputTensor(node);
- SqueezeParams params{};
- assert(node->squeeze_dims().size() <= 4);
- for (size_t i = 0; i < node->squeeze_dims().size(); i++)
- {
- params.squeeze_dims.push_back(node->squeeze_dims().at(i));
- }
-
- return std::make_unique<kernels::Squeeze>(input, output, params);
+ return std::make_unique<kernels::Transpose>(input, perm, output);
}
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTransposeConv *node)
@@ -515,15 +560,4 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleUnpack *node)
return std::make_unique<kernels::Unpack>(input, std::move(outputs), params);
}
-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTranspose *node)
-{
- assert(node->arity() == 2);
-
- const Tensor *input = getInputTensor(node->a());
- const Tensor *perm = getInputTensor(node->perm());
- Tensor *output = getOutputTensor(node);
-
- return std::make_unique<kernels::Transpose>(input, perm, output);
-}
-
} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h
index 7e30d395b..d5c5a4b56 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h
@@ -24,18 +24,18 @@
#include <memory>
#include <vector>
+#include <unordered_map>
namespace luci_interpreter
{
-class GraphLoader;
-class ModuleLoader;
-
class KernelBuilder : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>
{
public:
- KernelBuilder(const ModuleLoader &module_loader, const GraphLoader &graph_loader)
- : _module_loader(module_loader), _graph_loader(graph_loader)
+ KernelBuilder(
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+ const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+ : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
{
}
@@ -45,6 +45,7 @@ public:
std::unique_ptr<Kernel> visit(const luci::CircleConcatenation *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleConv2D *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleConst *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleDepthToSpace *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleDepthwiseConv2D *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleElu *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleFullyConnected *node) override;
@@ -61,6 +62,8 @@ public:
std::unique_ptr<Kernel> visit(const luci::CircleOutput *node) override;
std::unique_ptr<Kernel> visit(const luci::CirclePad *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleReshape *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleReverseV2 *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSlice *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSoftmax *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSpaceToDepth *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSplit *node) override;
@@ -82,8 +85,8 @@ private:
RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const;
private:
- const ModuleLoader &_module_loader;
- const GraphLoader &_graph_loader;
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
+ const std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
};
} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
new file mode 100644
index 000000000..33bc8ec9b
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "loader/GraphLoader.h"
+#include "loader/KernelBuilder.h"
+
+#include <kernels/Add.h>
+#include <kernels/ArgMax.h>
+#include <kernels/AveragePool2D.h>
+#include <kernels/Concatenation.h>
+#include <kernels/Conv2D.h>
+#include <kernels/DepthToSpace.h>
+#include <kernels/DepthwiseConv2D.h>
+#include <kernels/Elu.h>
+#include <kernels/FullyConnected.h>
+#include <kernels/L2Normalize.h>
+#include <kernels/L2Pool2D.h>
+#include <kernels/LeakyRelu.h>
+#include <kernels/LocalResponseNormalization.h>
+#include <kernels/Logistic.h>
+#include <kernels/MaxPool2D.h>
+#include <kernels/Mean.h>
+#include <kernels/Mul.h>
+#include <kernels/Pad.h>
+#include <kernels/Reshape.h>
+#include <kernels/Reverse.h>
+#include <kernels/Slice.h>
+#include <kernels/Softmax.h>
+#include <kernels/SpaceToDepth.h>
+#include <kernels/Split.h>
+#include <kernels/Squeeze.h>
+#include <kernels/StridedSlice.h>
+#include <kernels/Transpose.h>
+#include <kernels/TransposeConv.h>
+#include <kernels/Unpack.h>
+
+#include <gmock/gmock.h>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+using namespace testing;
+
+class KernelBuilderTest : public Test
+{
+protected:
+ luci::CircleInput *createInputNode() { return createNode<luci::CircleInput>(); }
+
+ template <typename NodeT, typename... Args> NodeT *createNode(Args &&... args)
+ {
+ auto *node = _graph.nodes()->create<NodeT>(std::forward<Args>(args)...);
+ // The actual type does not matter for the purpose of the tests.
+ // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry
+ // actual output types).
+ node->dtype(loco::DataType::FLOAT32);
+ return node;
+ }
+
+ template <typename NodeOutT> NodeOutT *createNodeOut(loco::Node *node, int index)
+ {
+ auto *node_out = createNode<NodeOutT>();
+ node_out->input(node);
+ node_out->index(index);
+ return node_out;
+ }
+
+ template <typename KernelT> std::unique_ptr<KernelT> buildKernel(const luci::CircleNode *op)
+ {
+ std::unordered_map<const loco::Graph *, RuntimeGraph *> graph_to_runtime_graph;
+
+ RuntimeGraph runtime_graph(nullptr);
+ RuntimeToIR runtime_to_ir;
+ GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph,
+ _node_to_tensor);
+ graph_loader.loadTensors();
+
+ KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor);
+
+ auto kernel = op->accept(&kernel_builder);
+ return std::unique_ptr<KernelT>(dynamic_cast<KernelT *>(kernel.release()));
+ }
+
+ void checkTensor(const Tensor *tensor, const loco::Node *node)
+ {
+ EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node)));
+ }
+
+private:
+ loco::Graph _graph;
+ std::unordered_map<const loco::Node *, Tensor *> _node_to_tensor;
+};
+
+TEST_F(KernelBuilderTest, Add)
+{
+ auto *input1 = createInputNode();
+ auto *input2 = createInputNode();
+
+ auto *op = createNode<luci::CircleAdd>();
+ op->x(input1);
+ op->y(input2);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::Add>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input1(), input1);
+ checkTensor(kernel->input2(), input2);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, ArgMax)
+{
+ auto *input = createInputNode();
+ auto *axis = createInputNode();
+
+ auto *op = createNode<luci::CircleArgMax>();
+ op->input(input);
+ op->dimension(axis);
+
+ op->output_type(loco::DataType::FLOAT32);
+
+ auto kernel = buildKernel<kernels::ArgMax>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->axis(), axis);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().output_type, Eq(op->output_type()));
+}
+
+TEST_F(KernelBuilderTest, AveragePool2D)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleAveragePool2D>();
+ op->value(input);
+
+ op->padding(luci::Padding::SAME);
+ op->filter()->h(11);
+ op->filter()->w(13);
+ op->stride()->h(17);
+ op->stride()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::AveragePool2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Concatenation)
+{
+ auto *input1 = createInputNode();
+ auto *input2 = createInputNode();
+
+ auto *op = createNode<luci::CircleConcatenation>(2);
+ op->values(0, input1);
+ op->values(1, input2);
+ op->axis(11);
+
+ auto kernel = buildKernel<kernels::Concatenation>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(0), input1);
+ checkTensor(kernel->input(1), input2);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
+}
+
+TEST_F(KernelBuilderTest, Conv2D)
+{
+ auto *input = createInputNode();
+ auto *filter = createInputNode();
+ auto *bias = createInputNode();
+
+ auto *op = createNode<luci::CircleConv2D>();
+ op->input(input);
+ op->filter(filter);
+ op->bias(bias);
+
+ op->padding(luci::Padding::SAME);
+ op->stride()->h(11);
+ op->stride()->w(13);
+ op->dilation()->h(17);
+ op->dilation()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::Conv2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->filter(), filter);
+ checkTensor(kernel->bias(), bias);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h()));
+ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, DepthToSpace)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleDepthToSpace>();
+ op->input(input);
+
+ op->block_size(11);
+
+ auto kernel = buildKernel<kernels::DepthToSpace>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().block_size, Eq(op->block_size()));
+}
+
+TEST_F(KernelBuilderTest, DepthwiseConv2D)
+{
+ auto *input = createInputNode();
+ auto *filter = createInputNode();
+ auto *bias = createInputNode();
+
+ auto *op = createNode<luci::CircleDepthwiseConv2D>();
+ op->input(input);
+ op->filter(filter);
+ op->bias(bias);
+
+ op->padding(luci::Padding::SAME);
+ op->depthMultiplier(11);
+ op->stride()->h(13);
+ op->stride()->w(17);
+ op->dilation()->h(19);
+ op->dilation()->w(23);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::DepthwiseConv2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->filter(), filter);
+ checkTensor(kernel->bias(), bias);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().depth_multiplier, Eq(op->depthMultiplier()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h()));
+ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Elu)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleElu>();
+ op->features(input);
+
+ auto kernel = buildKernel<kernels::Elu>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, FullyConnected)
+{
+ auto *input = createInputNode();
+ auto *weights = createInputNode();
+ auto *bias = createInputNode();
+
+ auto *op = createNode<luci::CircleFullyConnected>();
+ op->input(input);
+ op->weights(weights);
+ op->bias(bias);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::FullyConnected>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->weights(), weights);
+ checkTensor(kernel->bias(), bias);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, L2Normalize)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleL2Normalize>();
+ op->x(input);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::L2Normalize>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, L2Pool2D)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleL2Pool2D>();
+ op->value(input);
+
+ op->padding(luci::Padding::SAME);
+ op->filter()->h(11);
+ op->filter()->w(13);
+ op->stride()->h(17);
+ op->stride()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::L2Pool2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, LeakyRelu)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleLeakyRelu>();
+ op->features(input);
+
+ op->alpha(11.0f);
+
+ auto kernel = buildKernel<kernels::LeakyRelu>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
+}
+
+TEST_F(KernelBuilderTest, LocalResponseNormalization)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleLocalResponseNormalization>();
+ op->input(input);
+
+ op->radius(11);
+ op->bias(13.0f);
+ op->alpha(15.0f);
+ op->beta(17.0f);
+
+ auto kernel = buildKernel<kernels::LocalResponseNormalization>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().radius, Eq(op->radius()));
+ EXPECT_THAT(kernel->params().bias, Eq(op->bias()));
+ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
+ EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
+}
+
+TEST_F(KernelBuilderTest, Logistic)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleLogistic>();
+ op->x(input);
+
+ auto kernel = buildKernel<kernels::Logistic>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, MaxPool2D)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleMaxPool2D>();
+ op->value(input);
+
+ op->padding(luci::Padding::SAME);
+ op->filter()->h(11);
+ op->filter()->w(13);
+ op->stride()->h(17);
+ op->stride()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::MaxPool2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Mean)
+{
+ auto *input = createInputNode();
+ auto *axes = createInputNode();
+
+ auto *op = createNode<luci::CircleMean>();
+ op->input(input);
+ op->reduction_indices(axes);
+
+ op->keep_dims(true);
+
+ auto kernel = buildKernel<kernels::Mean>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->axes(), axes);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims()));
+}
+
+TEST_F(KernelBuilderTest, Mul)
+{
+ auto *input1 = createInputNode();
+ auto *input2 = createInputNode();
+
+ auto *op = createNode<luci::CircleMul>();
+ op->x(input1);
+ op->y(input2);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::Mul>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input1(), input1);
+ checkTensor(kernel->input2(), input2);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Pad)
+{
+ auto *input = createInputNode();
+ auto *paddings = createInputNode();
+
+ auto *op = createNode<luci::CirclePad>();
+ op->input(input);
+ op->paddings(paddings);
+
+ auto kernel = buildKernel<kernels::Pad>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->paddings(), paddings);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Reshape)
+{
+ auto *input = createInputNode();
+ auto *shape = createInputNode();
+
+ auto *op = createNode<luci::CircleReshape>();
+ op->tensor(input);
+ op->shape(shape);
+
+ auto kernel = buildKernel<kernels::Reshape>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->shape(), shape);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, ReverseV2)
+{
+ auto *input = createInputNode();
+ auto *axes = createInputNode();
+
+ auto *op = createNode<luci::CircleReverseV2>();
+ op->tensor(input);
+ op->axis(axes);
+
+ auto kernel = buildKernel<kernels::Reverse>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->axes(), axes);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Slice)
+{
+ auto *input = createInputNode();
+ auto *begin = createInputNode();
+ auto *size = createInputNode();
+
+ auto *op = createNode<luci::CircleSlice>();
+ op->input(input);
+ op->begin(begin);
+ op->size(size);
+
+ auto kernel = buildKernel<kernels::Slice>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->begin(), begin);
+ checkTensor(kernel->size(), size);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Softmax)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleSoftmax>();
+ op->logits(input);
+
+ op->beta(11.0f);
+
+ auto kernel = buildKernel<kernels::Softmax>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
+}
+
+TEST_F(KernelBuilderTest, SpaceToDepth)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleSpaceToDepth>();
+ op->input(input);
+
+ op->block_size(11);
+
+ auto kernel = buildKernel<kernels::SpaceToDepth>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().block_size, Eq(op->block_size()));
+}
+
+TEST_F(KernelBuilderTest, Split)
+{
+ auto *axis = createInputNode();
+ auto *input = createInputNode();
+ auto *op = createNode<luci::CircleSplit>();
+ auto *output1 = createNodeOut<luci::CircleSplitOut>(op, 0);
+ auto *output2 = createNodeOut<luci::CircleSplitOut>(op, 1);
+
+ op->split_dim(axis);
+ op->input(input);
+
+ op->num_split(2);
+
+ auto kernel = buildKernel<kernels::Split>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->axis(), axis);
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(0), output1);
+ checkTensor(kernel->output(1), output2);
+}
+
+TEST_F(KernelBuilderTest, Squeeze)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleSqueeze>();
+ op->input(input);
+
+ op->squeeze_dims({11, 13});
+
+ auto kernel = buildKernel<kernels::Squeeze>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims()));
+}
+
+TEST_F(KernelBuilderTest, StridedSlice)
+{
+ auto *input = createInputNode();
+ auto *begin = createInputNode();
+ auto *end = createInputNode();
+ auto *strides = createInputNode();
+
+ auto *op = createNode<luci::CircleStridedSlice>();
+ op->input(input);
+ op->begin(begin);
+ op->end(end);
+ op->strides(strides);
+
+ op->begin_mask(11);
+ op->ellipsis_mask(13);
+ op->end_mask(17);
+ op->new_axis_mask(19);
+ op->shrink_axis_mask(23);
+
+ auto kernel = buildKernel<kernels::StridedSlice>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->begin(), begin);
+ checkTensor(kernel->end(), end);
+ checkTensor(kernel->strides(), strides);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask()));
+ EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask()));
+ EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask()));
+ EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask()));
+ EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask()));
+}
+
+TEST_F(KernelBuilderTest, Transpose)
+{
+ auto *input = createInputNode();
+ auto *perm = createInputNode();
+
+ auto *op = createNode<luci::CircleTranspose>();
+ op->a(input);
+ op->perm(perm);
+
+ auto kernel = buildKernel<kernels::Transpose>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->perm(), perm);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, TransposeConv)
+{
+ auto *output_shape = createInputNode();
+ auto *filter = createInputNode();
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleTransposeConv>();
+ op->inputSizes(output_shape);
+ op->filter(filter);
+ op->outBackprop(input);
+
+ op->padding(luci::Padding::SAME);
+ op->stride()->h(11);
+ op->stride()->w(13);
+
+ auto kernel = buildKernel<kernels::TransposeConv>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->output_shape(), output_shape);
+ checkTensor(kernel->filter(), filter);
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+}
+
+TEST_F(KernelBuilderTest, Unpack)
+{
+ auto *input = createInputNode();
+ auto *op = createNode<luci::CircleUnpack>();
+ auto *output1 = createNodeOut<luci::CircleUnpackOut>(op, 0);
+ auto *output2 = createNodeOut<luci::CircleUnpackOut>(op, 1);
+
+ op->value(input);
+
+ op->num(2);
+ op->axis(11);
+
+ auto kernel = buildKernel<kernels::Unpack>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(0), output1);
+ checkTensor(kernel->output(1), output2);
+ EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
+}
+
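+// Nodes without a corresponding kernel: building them is expected to throw.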
+TEST_F(KernelBuilderTest, NonExisting1_NEG)
+{
+ auto *op = createNode<luci::CircleConst>();
+ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
+}
+
+TEST_F(KernelBuilderTest, NonExisting2_NEG)
+{
+ auto *op = createNode<luci::CircleInput>();
+ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
+}
+
+TEST_F(KernelBuilderTest, NonExisting3_NEG)
+{
+ auto *op = createNode<luci::CircleOutput>();
+ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
index 7780a61b6..b9a2ae0a9 100644
--- a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
@@ -41,8 +41,11 @@ void ModuleLoader::load()
{
const loco::Graph *graph = _module->graph(i);
RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph);
- GraphLoader loader(*this, graph, runtime_graph, _runtime_to_ir, _node_to_tensor);
- loader.load();
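+    // Load in three phases: create tensors, wire graph inputs/outputs, then build operator kernels.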
+ GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph,
+ _node_to_tensor);
+ loader.loadTensors();
+ loader.initInputOutputTensors();
+ loader.loadOperators();
}
}
diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h
index 954dbfb61..1af0ed747 100644
--- a/compiler/luci-interpreter/src/loader/ModuleLoader.h
+++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h
@@ -36,11 +36,6 @@ public:
void load();
- RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const
- {
- return _graph_to_runtime_graph.at(graph);
- }
-
private:
const luci::Module *_module;
RuntimeModule *_runtime_module;
diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh
index dfd55a691..12c9a459a 100755
--- a/compiler/luci-value-test/evalverify.sh
+++ b/compiler/luci-value-test/evalverify.sh
@@ -4,8 +4,10 @@
#
# HOW TO USE
#
-# ./evalverify.sh <path/to/work_dir> <TEST 1> <TEST 2> ...
-# work_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
+# ./evalverify.sh <path/to/bin_dir> <path/to/work_dir> <path/to/venv_dir> <TEST 1> <TEST 2> ...
+# bin_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
+# work_dir : artifacts directory where test materials exist
+# venv_dir : python virtual environment home directory
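+# (ex: ./evalverify.sh build/compiler/luci-value-test <work_dir> <venv_dir> Add_000 Mul_000)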
VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py"
diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst
index 6a332f92c..364d8819d 100644
--- a/compiler/luci-value-test/test.lst
+++ b/compiler/luci-value-test/test.lst
@@ -1,6 +1,8 @@
#addeval(Abs_000)
addeval(Add_000)
+#addeval(Add_001)
addeval(Add_U8_000)
+#addeval(AddN_000)
#addeval(ArgMax_000)
#addeval(ArgMax_001)
#addeval(ArgMax_002)
@@ -9,73 +11,173 @@ addeval(Add_U8_000)
#addeval(ArgMax_U8_001)
#addeval(ArgMax_U8_002)
#addeval(ArgMax_U8_003)
+#addeval(ArgMin_000)
+#addeval(ArgMin_001)
+#addeval(ArgMin_002)
+#addeval(ArgMin_003)
+#addeval(ArgMin_U8_000)
+#addeval(ArgMin_U8_001)
+#addeval(ArgMin_U8_002)
+#addeval(ArgMin_U8_003)
addeval(AveragePool2D_000)
+#addeval(BatchMatMul_000)
#addeval(BatchMatMulV2_000)
#addeval(BatchMatMulV2_001)
#addeval(BatchToSpaceND_000)
#addeval(Cast_000)
+#addeval(Cast_001)
+#addeval(Ceil_000)
addeval(Concatenation_000)
addeval(Concatenation_U8_000)
addeval(Conv2D_000)
addeval(Conv2D_001)
addeval(Conv2D_002)
+#addeval(Conv2D_003)
addeval(Conv2D_U8_000)
addeval(Conv2D_U8_001)
#addeval(Cos_000)
+#addeval(DepthToSpace_000)
addeval(DepthwiseConv2D_000)
addeval(DepthwiseConv2D_U8_000)
+#addeval(DepthwiseConv2D_U8_001)
+addeval(DepthwiseConv2D_001)
#addeval(Div_000)
+#addeval(ELU_000)
#addeval(Equal_000)
#addeval(Exp_000)
+#addeval(ExpandDims_000)
+#addeval(ExpandDims_001)
+#addeval(ExpandDims_002)
+#addeval(ExpandDims_003)
+#addeval(Fill_000)
+#addeval(Fill_001)
+#addeval(Floor_000)
+#addeval(FloorDiv_000)
+#addeval(FloorDiv_001)
+#addeval(FloorMod_000)
+#addeval(FloorMod_001)
addeval(FullyConnected_000)
addeval(FullyConnected_001)
#addeval(FullyConnected_002)
#addeval(FullyConnected_U8_000)
#addeval(Gather_000)
+#addeval(GatherNd_000)
+#addeval(Greater_000)
+#addeval(GreaterEqual_000)
#addeval(If_000)
#addeval(If_001)
+addeval(L2Normalize_000)
+addeval(L2Pool2D_000)
+#addeval(L2Pool2D_U8_000)
+#addeval(LeakyRelu_000)
+#addeval(Less_000)
+#addeval(LessEqual_000)
+#addeval(LocalResponseNormalization_000)
+#addeval(Log_000)
+#addeval(LogicalAnd_000)
#addeval(LogicalNot_000)
#addeval(LogicalOr_000)
-#addeval(Logistic_000)
+addeval(Logistic_000)
+#addeval(LogSoftmax_000)
+#addeval(MatMul_000)
+#addeval(MatrixDiag_000)
+#addeval(MatrixSetDiag_000)
+#addeval(Maximum_000)
addeval(MaxPool2D_000)
addeval(MaxPool2D_U8_000)
addeval(Mean_000)
addeval(Mean_001)
addeval(Mean_U8_000)
+#addeval(Minimum_000)
+#addeval(MirrorPad_000)
addeval(Mul_000)
#addeval(Mul_U8_000)
+#addeval(Neg_000)
+#addeval(NotEqual_000)
+#addeval(OneHot_000)
+#addeval(OneHot_001)
+#addeval(OneHot_002)
+#addeval(OneHot_003)
#addeval(Pack_000)
#addeval(Pack_U8_000)
addeval(Pad_000)
addeval(Pad_U8_000)
+#addeval(Pow_000)
+#addeval(PRelu_000)
+#addeval(Range_000)
+#addeval(Rank_000)
+#addeval(ReduceAny_000)
+#addeval(ReduceAny_001)
+#addeval(ReduceAny_002)
+#addeval(ReduceAny_003)
+#addeval(ReduceMax_000)
+#addeval(ReduceMin_000)
#addeval(ReduceProd_000)
#addeval(ReduceProd_001)
#addeval(ReduceProd_002)
#addeval(ReduceProd_003)
#addeval(ReLU_000)
+#addeval(ReLU6_000)
+#addeval(ReLUN1To1_000)
addeval(Reshape_000)
addeval(Reshape_001)
addeval(Reshape_002)
#addeval(Reshape_003)
addeval(Reshape_U8_000)
+#addeval(ResizeBilinear_000)
+#addeval(ResizeNearestNeighbor_000)
+#addeval(ReverseSequence_000)
+#addeval(ReverseV2_000)
+#addeval(Round_000)
#addeval(Rsqrt_000)
+#addeval(ScatterNd_000)
+#addeval(SegmentSum_000)
+#addeval(Select_000)
+#addeval(Select_001)
+#addeval(Select_002)
+#addeval(SelectV2_000)
+#addeval(SelectV2_001)
+#addeval(SelectV2_002)
+#addeval(Shape_000)
#addeval(Sin_000)
+addeval(Slice_000)
addeval(Softmax_000)
#addeval(Softmax_U8_000)
#addeval(SpaceToBatchND_000)
#addeval(SpaceToBatchND_001)
#addeval(SpaceToBatchND_002)
#addeval(SpaceToBatchND_003)
-#addeval(StridedSlice_000)
-#addeval(StridedSlice_001)
+#addeval(SpaceToDepth_000)
+#addeval(SparseToDense_000)
+#addeval(Split_000)
+#addeval(SplitV_000)
+#addeval(Sqrt_000)
+#addeval(Square_000)
+#addeval(SquaredDifference_000)
+addeval(Squeeze_000)
+addeval(StridedSlice_000)
+addeval(StridedSlice_001)
+addeval(StridedSlice_002)
#addeval(Sub_000)
#addeval(Sub_U8_000)
+#addeval(Sum_000)
+#addeval(Sum_001)
#addeval(Tanh_000)
#addeval(Tile_000)
#addeval(Tile_U8_000)
-#addeval(Transpose_000)
+#addeval(TopKV2_000)
+#addeval(TopKV2_001)
+addeval(Transpose_000)
+#addeval(TransposeConv_000)
#addeval(Unpack_000)
#addeval(Unpack_001)
#addeval(Unpack_002)
+addeval(Unpack_003)
+#addeval(Where_000)
+#addeval(Where_001)
#addeval(While_000)
#addeval(While_001)
+#addeval(While_002)
+#addeval(While_003)
+#addeval(YUV_TO_RGB_U8_000)
+#addeval(ZerosLike_000)
diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp
index 3c01b676f..344c99ff5 100644
--- a/compiler/luci/export/src/CircleOperationExporter.cpp
+++ b/compiler/luci/export/src/CircleOperationExporter.cpp
@@ -890,7 +890,7 @@ void OperationExporter::visit(luci::CircleSpaceToDepth *node)
{
export_simple(node, circle::BuiltinOperator_SPACE_TO_DEPTH,
circle::BuiltinOptions_SpaceToDepthOptions,
- CreateSpaceToDepthOptions(builder).Union());
+ CreateSpaceToDepthOptions(builder, node->block_size()).Union());
}
void OperationExporter::visit(luci::CircleSparseToDense *node)
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index 5cad3920b..dc8c2fbc9 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -302,7 +302,10 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam
scale = builder.CreateVector(quantparam->scale);
zero_point = builder.CreateVector(quantparam->zerop);
}
- return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point);
+ // Note: QuantizationDetails is not supported
+ return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point,
+ circle::QuantizationDetails::QuantizationDetails_NONE,
+ 0, quantparam->quantized_dimension);
}
void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder,
diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp
index 81e945dd1..bc7f39762 100644
--- a/compiler/luci/import/src/CircleReader.cpp
+++ b/compiler/luci/import/src/CircleReader.cpp
@@ -156,6 +156,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
const auto &max = quantization->max;
const auto &scale = quantization->scale;
const auto &zero_point = quantization->zero_point;
+ const auto &quantized_dimension = quantization->quantized_dimension;
if ((!min.empty() && !max.empty()) || (!scale.empty() && !zero_point.empty()))
{
@@ -165,6 +166,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
quantparam->max = max;
quantparam->scale = scale;
quantparam->zerop = zero_point;
+ quantparam->quantized_dimension = quantized_dimension;
return quantparam;
}
diff --git a/compiler/luci/import/src/Importer.test.cpp b/compiler/luci/import/src/Importer.test.cpp
index 4426e15fd..8366546f0 100644
--- a/compiler/luci/import/src/Importer.test.cpp
+++ b/compiler/luci/import/src/Importer.test.cpp
@@ -20,4 +20,9 @@
#include <gtest/gtest.h>
-TEST(TensorFlowLiteImport, Dummy) { luci::Importer import; }
+TEST(TensorFlowLiteImport, Dummy)
+{
+ luci::Importer import;
+
+ SUCCEED();
+}
diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
index 85e7e55b2..c77c55eef 100644
--- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
@@ -32,21 +32,7 @@ bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const
if (outputs.size() != 1)
return false;
- // Must be one of the following types
- // float16, float32, float64, complex64, or complex128
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
- switch (tensor->type)
- {
- case circle::TensorType_FLOAT16:
- case circle::TensorType_FLOAT32:
- case circle::TensorType_FLOAT64:
- case circle::TensorType_COMPLEX64:
- break;
- default:
- return false;
- }
-
if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type)
return false;
diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
index 7bdf46daa..eb0956c4f 100644
--- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
@@ -30,6 +30,24 @@ bool CircleTransposeConvGraphBuilder::validate(const ValidateArgs &args) const
if (args.op.inputs.size() != 3)
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &tensors = args.reader.tensors();
+ const auto &filter_tensor = tensors.at(inputs[1]);
+ const auto &filter_shape = filter_tensor.get()->shape;
+ const auto &ifm_tensor = tensors.at(inputs[2]);
+ const auto &ifm_shape = ifm_tensor.get()->shape;
+
+  // ifm and filter must be 4-D tensors
+ if (ifm_shape.size() != 4)
+ return false;
+ if (filter_shape.size() != 4)
+ return false;
+
+ // input shape : [batch, height, width, in_channels]
+  // filter shape : [output_channels, height, width, in_channels]
+ if (ifm_tensor.get()->shape.at(3) != filter_tensor.get()->shape.at(3))
+ return false;
+
return true;
}
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
index 488dcfb89..acd79210a 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
@@ -120,6 +120,7 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected)
CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather)
CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm)
// Virtual node(s)
+CIRCLE_NODE(CIRCLECONST, void)
CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput)
CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput)
CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy)
diff --git a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
index 7253e657b..694437303 100644
--- a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
+++ b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
@@ -29,6 +29,7 @@ struct CircleQuantParam
std::vector<float> max;
std::vector<float> scale;
std::vector<int64_t> zerop;
+ int32_t quantized_dimension{0};
};
} // namespace luci
diff --git a/compiler/luci/lang/src/Module.test.cpp b/compiler/luci/lang/src/Module.test.cpp
index 26bf073be..a5973e52d 100644
--- a/compiler/luci/lang/src/Module.test.cpp
+++ b/compiler/luci/lang/src/Module.test.cpp
@@ -22,7 +22,7 @@ TEST(ModuleTest, consturctor)
{
auto gs = luci::make_module();
- GTEST_SUCCEED();
+ SUCCEED();
}
TEST(ModuleTest, add)
diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
index 74ea82c6c..c07268cbf 100644
--- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
@@ -35,7 +35,12 @@ TEST(CircleCustomTest, constructor)
ASSERT_EQ(0, custom_node.custom_code().size());
}
-TEST(CircleCustomTest, constructor_NEG) { ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); }
+TEST(CircleCustomTest, constructor_NEG)
+{
+ ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, "");
+
+ SUCCEED();
+}
TEST(CircleCustomTest, invalidIndex_NEG)
{
diff --git a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
index e3c8c9f60..35f28e9ac 100644
--- a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
@@ -41,11 +41,15 @@ TEST(CircleIfTest, constructor)
TEST(CircleIfTestDeath, invalid_arity_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleIf very_long_name_if_node(0, 1), "");
+
+ SUCCEED();
}
TEST(CircleIfTestDeath, invalid_output_count_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleIf if_node(2, 0), "");
+
+ SUCCEED();
}
TEST(CircleIfTestDeath, invalid_input_get_index_NEG)
diff --git a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
index 19290c0a2..913686fbd 100644
--- a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
@@ -41,11 +41,15 @@ TEST(CircleWhileTest, constructor)
TEST(CircleWhileTestDeath, invalid_arity_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleWhile very_long_name_while_node(0, 1), "");
+
+ SUCCEED();
}
TEST(CircleWhileTestDeath, invalid_output_count_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleWhile while_node(2, 0), "");
+
+ SUCCEED();
}
TEST(CircleWhileTestDeath, invalid_input_get_index_NEG)
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index 90fbe9009..2edf7a9c6 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -145,7 +145,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
{
static const std::vector<std::string> fakeq_supported_input_dtype{"float32"};
static const std::vector<std::string> fakeq_supported_output_dtype{"uint8"};
- static const std::vector<std::string> fakeq_supported_granularity{"layer"};
+ static const std::vector<std::string> fakeq_supported_granularity{"layer", "channel"};
auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
@@ -173,7 +173,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
{
static const std::vector<std::string> qwmm_supported_input_dtype{"float32"};
static const std::vector<std::string> qwmm_supported_output_dtype{"uint8"};
- static const std::vector<std::string> qwmm_supported_granularity{"layer"};
+ static const std::vector<std::string> qwmm_supported_granularity{"layer", "channel"};
auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp
index b81db8827..edbaefa3d 100644
--- a/compiler/luci/pass/src/FuseBCQPass.cpp
+++ b/compiler/luci/pass/src/FuseBCQPass.cpp
@@ -67,14 +67,190 @@ const std::string node_name_prefix(luci::NodeName node_name)
return prefix;
}
+/**
+ * @brief Create a CircleOutputExclude operation that has the same shape and dtype as the
+ *        original circle_node.
+ */
+luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node)
+{
+ auto graph = circle_node->graph();
+ auto noOp = graph->nodes()->create<luci::CircleOutputExclude>();
+
+ if (circle_node->shape_status() == luci::ShapeStatus::VALID)
+ {
+ noOp->dtype(circle_node->dtype());
+ noOp->rank(circle_node->rank());
+ for (uint32_t i = 0; i < circle_node->rank(); ++i)
+ noOp->dim(i) = circle_node->dim(i);
+ }
+ else
+ {
+ // For type inference
+ noOp->dtype(loco::DataType::FLOAT32);
+ }
+
+ return noOp;
+}
+
} // namespace
namespace
{
-class BCQConverter final
+// V means the version of BCQ.
+template <int32_t V> class BCQFuser;
+
+template <> class BCQFuser<1>
{
public:
+ bool fuseBCQ(loco::Graph *g)
+ {
+ bool changed = false;
+
+ for (auto node : loco::all_nodes(g))
+ {
+ if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
+ {
+ add_BCQ_info_node(circle_const);
+ }
+ }
+
+ if (!is_bcqinfo_valid())
+ return false;
+
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto gather = dynamic_cast<luci::CircleGather *>(node))
+ {
+ auto params = dynamic_cast<luci::CircleConst *>(gather->params());
+ if (params != nullptr && has_BCQ_info(params))
+ {
+ auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
+
+ bcq_gather->op_version(1);
+ bcq_gather->input_scales(get_alpha(params));
+ bcq_gather->input_binary(get_packed_binary_code(params));
+ bcq_gather->indices(gather->indices());
+ bcq_gather->input_clusters(packed_clusters(params));
+
+ // input_binary shape : [output_size, hidden_size]
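+          // (dim(1) counts 32-bit packed words, hence the "* 32")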
+ const auto binary_hidden_size =
+ loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32;
+ bcq_gather->input_hidden_size(binary_hidden_size);
+
+ if (do_w_x(params))
+ {
+ bcq_gather->axis(gather->axis());
+ }
+ else
+ {
+ const auto axis_transpose = (gather->axis() == 0) ? 1 : 0;
+ bcq_gather->axis(axis_transpose);
+ }
+
+ loco::replace(gather).with(bcq_gather);
+
+ changed = true;
+ }
+ }
+ else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
+ {
+ auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights());
+ if (weights != nullptr && has_BCQ_info(weights))
+ {
+ auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
+
+ bcq_fc->op_version(1);
+ bcq_fc->weights_scales(get_alpha(weights));
+ bcq_fc->weights_binary(get_packed_binary_code(weights));
+ bcq_fc->bias(fully_connected->bias());
+ bcq_fc->weights_clusters(packed_clusters(weights));
+ bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
+
+ loco::Node *bcq_input = fully_connected->input();
+ int32_t batch_rank = 0;
+
+          // If the input of BCQFullyConnected has rank greater than 2, reshape it to rank 2
+ const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input());
+ if (original_input->shape_status() == luci::ShapeStatus::VALID &&
+ original_input->rank() > 2)
+ {
+ auto new_shape = g->nodes()->create<luci::CircleConst>();
+ new_shape->dtype(loco::DataType::S32);
+ new_shape->size<loco::DataType::S32>(2);
+ new_shape->rank(1);
+ new_shape->dim(0) = 2;
+
+ auto batch_size = 1;
+ for (uint32_t i = 0; i < original_input->rank() - 1; ++i)
+ batch_size *= original_input->dim(i).value();
+
+ new_shape->at<loco::DataType::S32>(0) = batch_size;
+ new_shape->at<loco::DataType::S32>(1) =
+ original_input->dim(original_input->rank() - 1).value();
+ new_shape->shape_status(luci::ShapeStatus::VALID);
+
+ auto reshape = g->nodes()->create<luci::CircleReshape>();
+ reshape->tensor(original_input);
+ reshape->shape(new_shape);
+
+ bcq_input = reshape;
+ batch_rank = original_input->rank() - 2;
+ }
+
+          // If the x_w formation is used, we should insert Transpose in front of and behind BCQFullyConnected
+ if (do_w_x(weights))
+ {
+ const auto binary_hidden_size =
+ loco::must_cast<luci::CircleNode *>(fully_connected->input())
+ ->dim(batch_rank)
+ .value();
+ bcq_fc->weights_hidden_size(binary_hidden_size);
+ bcq_fc->input(bcq_input);
+ loco::replace(fully_connected).with(bcq_fc);
+ }
+ else
+ {
+ const auto binary_hidden_size =
+ loco::must_cast<luci::CircleNode *>(fully_connected->input())
+ ->dim(1 + batch_rank)
+ .value();
+ bcq_fc->weights_hidden_size(binary_hidden_size);
+
+ auto perm = g->nodes()->create<luci::CircleConst>();
+ perm->dtype(loco::DataType::S32);
+ perm->size<loco::DataType::S32>(2);
+ perm->rank(1);
+ perm->dim(0) = 2;
+ perm->at<loco::DataType::S32>(0) = 1;
+ perm->at<loco::DataType::S32>(1) = 0;
+ perm->shape_status(luci::ShapeStatus::VALID);
+
+ auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
+ input_transpose->a(bcq_input);
+ input_transpose->perm(perm);
+
+ bcq_fc->input(input_transpose);
+
+ auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
+ output_transpose->a(bcq_fc);
+ output_transpose->perm(perm);
+
+ loco::replace(fully_connected).with(output_transpose);
+ }
+
+ changed = true;
+ }
+ }
+ }
+
+ if (changed)
+ clear_BCQ_nodes();
+
+ return changed;
+ }
+
+private:
void add_BCQ_info_node(luci::CircleConst *node)
{
const auto node_name = node->name();
@@ -119,16 +295,65 @@ public:
return has_info;
}
+ /**
+   * @brief Exclude the BCQ information nodes, which are used only for fusing BCQ operations,
+   *        from the graph outputs by using CircleOutputExclude
+ */
+ void clear_BCQ_nodes()
+ {
+ auto clear_nodes = [](std::map<std::string, luci::CircleConst *> &nodes) {
+ for (auto &n : nodes)
+ {
+ auto node = n.second;
+
+ for (auto s : loco::succs(node))
+ {
+ if (auto outnode = dynamic_cast<luci::CircleOutput *>(s))
+ {
+ outnode->from(createNoOp(node));
+ }
+ else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s))
+ {
+ for (auto o : loco::succs(reshape_node))
+ {
+ auto circle_output = loco::must_cast<luci::CircleOutput *>(o);
+ circle_output->from(createNoOp(reshape_node));
+ }
+ }
+ }
+ }
+ };
+
+ clear_nodes(_do_w_x);
+ clear_nodes(_alpha);
+ clear_nodes(_packed_binary_code);
+ clear_nodes(_number_of_clusters);
+ clear_nodes(_size_of_clusters);
+ clear_nodes(_qbits_of_clusters);
+ clear_nodes(_dequant_weight);
+ }
+
+ bool is_bcqinfo_valid()
+ {
+ // do_w_x should be int32 or bool type
+ for (auto n : _do_w_x)
+ {
+ if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32)
+ return false;
+ }
+
+ return true;
+ }
+
+private:
bool do_w_x(luci::CircleConst *node)
{
const auto prefix = node_name_prefix(node->name());
if (_do_w_x[prefix]->dtype() == loco::DataType::S32)
return _do_w_x[prefix]->at<loco::DataType::S32>(0) == 1;
- else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL)
- return _do_w_x[prefix]->at<loco::DataType::BOOL>(0);
else
- throw std::runtime_error("do_w_x should be int or bool");
+ return _do_w_x[prefix]->at<loco::DataType::BOOL>(0);
}
luci::CircleConst *get_alpha(luci::CircleConst *node)
@@ -187,64 +412,6 @@ public:
return packed_clusters;
}
- /**
- * @brief Exclude BCQ information nodes which are used for fusing BCQ operations
- * from graph output by using CircleOutputExclude
- */
- void clear_BCQ_nodes()
- {
- auto createNoOp = [](luci::CircleNode *circle_node) {
- auto graph = circle_node->graph();
- auto noOp = graph->nodes()->create<luci::CircleOutputExclude>();
-
- if (circle_node->shape_status() == luci::ShapeStatus::VALID)
- {
- noOp->dtype(circle_node->dtype());
- noOp->rank(circle_node->rank());
- for (uint32_t i = 0; i < circle_node->rank(); ++i)
- noOp->dim(i) = circle_node->dim(i);
- }
- else
- {
- // For type inference
- noOp->dtype(loco::DataType::FLOAT32);
- }
-
- return noOp;
- };
-
- auto clear_nodes = [createNoOp](std::map<std::string, luci::CircleConst *> &nodes) {
- for (auto &n : nodes)
- {
- auto node = n.second;
-
- for (auto s : loco::succs(node))
- {
- if (auto outnode = dynamic_cast<luci::CircleOutput *>(s))
- {
- outnode->from(createNoOp(node));
- }
- else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s))
- {
- for (auto o : loco::succs(reshape_node))
- {
- auto circle_output = loco::must_cast<luci::CircleOutput *>(o);
- circle_output->from(createNoOp(reshape_node));
- }
- }
- }
- }
- };
-
- clear_nodes(_do_w_x);
- clear_nodes(_alpha);
- clear_nodes(_packed_binary_code);
- clear_nodes(_number_of_clusters);
- clear_nodes(_size_of_clusters);
- clear_nodes(_qbits_of_clusters);
- clear_nodes(_dequant_weight);
- }
-
private:
std::map<std::string, luci::CircleConst *> _do_w_x;
std::map<std::string, luci::CircleConst *> _alpha;
@@ -262,142 +429,9 @@ namespace luci
bool FuseBCQPass::run(loco::Graph *g)
{
- BCQConverter converter;
-
bool changed = false;
- for (auto node : loco::all_nodes(g))
- {
- if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
- {
- converter.add_BCQ_info_node(circle_const);
- }
- }
-
- for (auto node : loco::active_nodes(loco::output_nodes(g)))
- {
- if (auto gather = dynamic_cast<luci::CircleGather *>(node))
- {
- auto params = dynamic_cast<luci::CircleConst *>(gather->params());
- if (params != nullptr && converter.has_BCQ_info(params))
- {
- auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
-
- bcq_gather->input_scales(converter.get_alpha(params));
- bcq_gather->input_binary(converter.get_packed_binary_code(params));
- bcq_gather->indices(gather->indices());
- bcq_gather->input_clusters(converter.packed_clusters(params));
-
- const auto binary_hidden_size =
- loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32;
- bcq_gather->input_hidden_size(binary_hidden_size);
-
- if (converter.do_w_x(params))
- {
- bcq_gather->axis(gather->axis());
- }
- else
- {
- const auto axis_transpose = (gather->axis() == 0) ? 1 : 0;
- bcq_gather->axis(axis_transpose);
- }
-
- loco::replace(gather).with(bcq_gather);
-
- changed = true;
- }
- }
- else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
- {
- auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights());
- if (weights != nullptr && converter.has_BCQ_info(weights))
- {
- auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
-
- bcq_fc->weights_scales(converter.get_alpha(weights));
- bcq_fc->weights_binary(converter.get_packed_binary_code(weights));
- bcq_fc->bias(fully_connected->bias());
- bcq_fc->weights_clusters(converter.packed_clusters(weights));
- bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
-
- loco::Node *bcq_input = fully_connected->input();
- int32_t batch_rank = 0;
-
- // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2
- const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input());
- if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2)
- {
- auto new_shape = g->nodes()->create<luci::CircleConst>();
- new_shape->dtype(loco::DataType::S32);
- new_shape->size<loco::DataType::S32>(2);
- new_shape->rank(1);
- new_shape->dim(0) = 2;
-
- auto batch_size = 1;
- for (uint32_t i = 0; i < original_input->rank() - 1; ++i)
- batch_size *= original_input->dim(i).value();
-
- new_shape->at<loco::DataType::S32>(0) = batch_size;
- new_shape->at<loco::DataType::S32>(1) =
- original_input->dim(original_input->rank() - 1).value();
- new_shape->shape_status(ShapeStatus::VALID);
-
- auto reshape = g->nodes()->create<luci::CircleReshape>();
- reshape->tensor(original_input);
- reshape->shape(new_shape);
-
- bcq_input = reshape;
- batch_rank = original_input->rank() - 2;
- }
-
- // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected
- if (converter.do_w_x(weights))
- {
- const auto binary_hidden_size =
- loco::must_cast<luci::CircleNode *>(fully_connected->input())
- ->dim(batch_rank)
- .value();
- bcq_fc->weights_hidden_size(binary_hidden_size);
- bcq_fc->input(bcq_input);
- loco::replace(fully_connected).with(bcq_fc);
- }
- else
- {
- const auto binary_hidden_size =
- loco::must_cast<luci::CircleNode *>(fully_connected->input())
- ->dim(1 + batch_rank)
- .value();
- bcq_fc->weights_hidden_size(binary_hidden_size);
-
- auto perm = g->nodes()->create<luci::CircleConst>();
- perm->dtype(loco::DataType::S32);
- perm->size<loco::DataType::S32>(2);
- perm->rank(1);
- perm->dim(0) = 2;
- perm->at<loco::DataType::S32>(0) = 1;
- perm->at<loco::DataType::S32>(1) = 0;
- perm->shape_status(ShapeStatus::VALID);
-
- auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
- input_transpose->a(bcq_input);
- input_transpose->perm(perm);
-
- bcq_fc->input(input_transpose);
-
- auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
- output_transpose->a(bcq_fc);
- output_transpose->perm(perm);
-
- loco::replace(fully_connected).with(output_transpose);
- }
-
- changed = true;
- }
- }
- }
-
- if (changed)
- converter.clear_BCQ_nodes();
+ changed = BCQFuser<1>().fuseBCQ(g);
return changed;
}
diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp
index 6726ce746..9c9e74100 100644
--- a/compiler/luci/pass/src/QuantizationUtils.cpp
+++ b/compiler/luci/pass/src/QuantizationUtils.cpp
@@ -99,6 +99,13 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t
nudged_zero_point = static_cast<uint8_t>(std::round(zero_point_double));
}
+ // protect scale from being very low due to overflow
+ if (scale < 1e-5)
+ {
+ scale = 1e-5;
+ nudged_zero_point = static_cast<uint8_t>(std::round(qmin_double - rmin / scale));
+ }
+
nudged_min = static_cast<float>((qmin_double - nudged_zero_point) * scale);
nudged_max = static_cast<float>((qmax_double - nudged_zero_point) * scale);
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index f8abee751..2264bd770 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -138,7 +138,8 @@ bool is_quantized(const CircleNode *node)
node->dtype() == loco::DataType::S32; // bias
}
-void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor)
+void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor,
+ int32_t &channel_dim_index)
{
assert(node->dtype() == loco::DataType::FLOAT32);
@@ -153,7 +154,6 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_facto
uint32_t indices[4] = {
0,
};
- int channel_dim_index{0};
if (!get_channel_dim_index(node, dimension, channel_dim_index))
{
@@ -189,7 +189,7 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_facto
}
void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
- std::vector<float> &scaling_factor)
+ std::vector<float> &scaling_factor, int32_t &channel_dim_index)
{
assert(node->dtype() == loco::DataType::FLOAT32);
@@ -204,7 +204,6 @@ void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
uint32_t indices[4] = {
0,
};
- int channel_dim_index{0};
if (!get_channel_dim_index(node, dimension, channel_dim_index))
{
@@ -350,8 +349,8 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool>
circle_node->dtype(loco::DataType::S16);
}
- circle_node->quantparam()->max[0] = nudged_max;
- circle_node->quantparam()->min[0] = nudged_min;
+ circle_node->quantparam()->min.clear();
+ circle_node->quantparam()->max.clear();
circle_node->quantparam()->scale.push_back(scaling_factor);
circle_node->quantparam()->zerop.push_back(zp);
}
@@ -472,15 +471,19 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
assert(quantparam != nullptr);
auto min = quantparam->min;
auto scaling_factor = quantparam->scale;
+ int32_t channel_dim_index = 0;
if (output_type == loco::DataType::U8)
{
- asym_wquant_per_channel(circle_const, min, scaling_factor);
+ asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index);
}
else
{
- sym_wquant_per_channel(circle_const, scaling_factor);
+ sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index);
}
+ quantparam->min.clear();
+ quantparam->max.clear();
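+      // record the channel dimension used for per-channel quantization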
+ quantparam->quantized_dimension = channel_dim_index;
}
// Find min/max per layer-wise
else
@@ -493,6 +496,8 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
auto min = quantparam->min[0];
auto scaling_factor = quantparam->scale[0];
asym_wquant_per_layer(circle_const, min, scaling_factor);
+ quantparam->min.clear();
+ quantparam->max.clear();
}
}
}
diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst
index 188e29828..3da3437cc 100644
--- a/compiler/luci/tests/test.lst
+++ b/compiler/luci/tests/test.lst
@@ -30,13 +30,16 @@ addread(Ceil_000)
addread(Concatenation_000)
addread(Concatenation_U8_000)
addread(Conv2D_000)
+addread(Conv2D_001)
addread(Conv2D_002)
addread(Conv2D_003)
addread(Conv2D_U8_000)
+addread(Conv2D_U8_001)
addread(Cos_000)
addread(DepthToSpace_000)
addread(DepthwiseConv2D_000)
addread(DepthwiseConv2D_U8_000)
+addread(DepthwiseConv2D_U8_001)
addread(DepthwiseConv2D_001)
addread(Div_000)
addread(ELU_000)
@@ -84,6 +87,7 @@ addread(MaxPool2D_000)
addread(MaxPool2D_U8_000)
addread(Mean_000)
addread(Mean_001)
+addread(Mean_U8_000)
addread(Minimum_000)
addread(MirrorPad_000)
addread(Mul_000)
@@ -97,6 +101,7 @@ addread(OneHot_003)
addread(Pack_000)
addread(Pack_U8_000)
addread(Pad_000)
+addread(Pad_U8_000)
addread(Pow_000)
addread(PRelu_000)
addread(Range_000)
@@ -222,13 +227,16 @@ addwrite(Ceil_000)
addwrite(Concatenation_000)
addwrite(Concatenation_U8_000)
addwrite(Conv2D_000)
+addwrite(Conv2D_001)
addwrite(Conv2D_002)
addwrite(Conv2D_003)
addwrite(Conv2D_U8_000)
+addwrite(Conv2D_U8_001)
addwrite(Cos_000)
addwrite(DepthToSpace_000)
addwrite(DepthwiseConv2D_000)
addwrite(DepthwiseConv2D_U8_000)
+addwrite(DepthwiseConv2D_U8_001)
addwrite(DepthwiseConv2D_001)
addwrite(Div_000)
addwrite(ELU_000)
@@ -276,6 +284,7 @@ addwrite(MaxPool2D_000)
addwrite(MaxPool2D_U8_000)
addwrite(Mean_000)
addwrite(Mean_001)
+addwrite(Mean_U8_000)
addwrite(Minimum_000)
addwrite(MirrorPad_000)
addwrite(Mul_000)
diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen
index 2c80664e2..820b6d8a3 100644
--- a/compiler/one-cmds/one-codegen
+++ b/compiler/one-cmds/one-codegen
@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
function Usage()
{
- echo "Usage: $0 [BACKEND] ..."
+ echo "Usage: one-codegen [BACKEND] ..."
echo "Available BACKEND drivers:"
backend_exist=0
for file in `find $DRIVER_PATH -name *-compile -type f`;
@@ -33,23 +33,34 @@ function Usage()
if [ $backend_exist == 0 ]; then
    echo " (There are no available backend drivers)"
fi
+
+ exit 255
}
-# Get command from command-line
-BACKEND=$1; shift
-BACKEND_DRIVER="$BACKEND-compile"
+function version()
+{
+ $DRIVER_PATH/one-version one-codegen
+ exit 255
+}
-if [[ -z "${BACKEND_DRIVER}" ]]; then
+# Get command from command-line
+BACKEND=$1
+if [[ -z ${BACKEND} ]]; then
Usage
- exit 255
fi
+shift
+
+if [[ "${BACKEND}" == "--version" ]]; then
+ version
+fi
+
+BACKEND_DRIVER="${BACKEND}-compile"
BACKEND_DRIVER_CMD="${DRIVER_PATH}/${BACKEND_DRIVER}"
if [[ ! -f "${BACKEND_DRIVER_CMD}" ]]; then
echo "ERROR: '${BACKEND_DRIVER}' is not supported"
Usage
- exit 255
fi
"${BACKEND_DRIVER_CMD}" "$@"
diff --git a/compiler/one-cmds/one-import b/compiler/one-cmds/one-import
index dbf4af534..b1dd8f4c3 100644
--- a/compiler/one-cmds/one-import
+++ b/compiler/one-cmds/one-import
@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
function Usage()
{
- echo "Usage: $0 [FRAMEWORK] ..."
+ echo "Usage: one-import [FRAMEWORK] ..."
echo "Available FRAMEWORK drivers:"
framework_exist=0
for file in "$DRIVER_PATH"/one-import-*;
@@ -31,23 +31,34 @@ function Usage()
if [ $framework_exist == 0 ]; then
    echo " (There are no available import drivers)"
fi
+
+ exit 255
}
-# Get command from command-line
-FRAMEWORK=$1; shift
-FRAMEWORK_DRIVER="one-import-$FRAMEWORK"
+function version()
+{
+ $DRIVER_PATH/one-version one-import-tf
+ exit 255
+}
-if [[ -z "${FRAMEWORK_DRIVER}" ]]; then
+# Get command from command-line
+FRAMEWORK=$1
+if [[ -z ${FRAMEWORK} ]]; then
Usage
- exit 255
+fi
+shift
+
+if [ ${FRAMEWORK} = "--version" ]; then
+ version
fi
+FRAMEWORK_DRIVER="one-import-$FRAMEWORK"
+
FRAMEWORK_DRIVER_CMD="${DRIVER_PATH}/${FRAMEWORK_DRIVER}"
if [[ ! -f "${FRAMEWORK_DRIVER_CMD}" ]]; then
echo "ERROR: '${FRAMEWORK_DRIVER}' is not supported"
Usage
- exit 255
fi
"${FRAMEWORK_DRIVER_CMD}" "$@"
diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf
index c048a4e0c..d59e1c529 100644
--- a/compiler/one-cmds/one-import-tf
+++ b/compiler/one-cmds/one-import-tf
@@ -22,14 +22,24 @@ usage()
{
echo "Convert TensorFlow model to circle."
echo "Usage: one-import-tf"
+ echo " --version Show version information and exit"
echo " --input_path <path/to/tfmodel>"
echo " --output_path <path/to/circle>"
echo " --input_arrays <names of the input arrays, comma-separated>"
echo " --input_shapes <input shapes, colon-separated>"
echo " --output_arrays <names of the output arrays, comma-separated>"
- exit 0
+ echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)"
+ exit 255
}
+version()
+{
+ $DRIVER_PATH/one-version one-import-tf
+ exit 255
+}
+
+TF_INTERFACE="--v1"
+
# Parse command-line arguments
#
while [ "$#" -ne 0 ]; do
@@ -39,6 +49,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--input_path')
export INPUT_PATH="$2"
shift 2
@@ -59,6 +72,10 @@ while [ "$#" -ne 0 ]; do
export OUTPUT_ARRAYS="$2"
shift 2
;;
+ '--v2')
+ TF_INTERFACE="--v2"
+ shift
+ ;;
*)
echo "Unknown parameter: ${CUR}"
shift
@@ -92,14 +109,21 @@ fi
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
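+# print the log file when any subsequent command fails (ERR trap)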
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# generate temporary tflite file
-echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \
+echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \
--input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \
--output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
--output_arrays ${OUTPUT_ARRAYS} > "${OUTPUT_PATH}.log"
echo " " >> "${OUTPUT_PATH}.log"
-python "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \
+python "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \
--input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \
--output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
--output_arrays ${OUTPUT_ARRAYS} >> "${OUTPUT_PATH}.log" 2>&1
diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite
index 31ed5af85..053489c92 100644
--- a/compiler/one-cmds/one-import-tflite
+++ b/compiler/one-cmds/one-import-tflite
@@ -22,9 +22,16 @@ usage()
{
echo "Convert TensorFlow lite model to circle."
echo "Usage: one-import-tflite"
+ echo " --version Show version information and exit"
echo " --input_path <path/to/tflitemodel>"
echo " --output_path <path/to/circle>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-import-tflite
+ exit 255
}
# Parse command-line arguments
@@ -36,6 +43,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--input_path')
export INPUT_PATH="$2"
shift 2
@@ -55,12 +65,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# convert .tflite to .circle
echo "${DRIVER_PATH}/tflite2circle" "${INPUT_PATH}" "${OUTPUT_PATH}" > "${OUTPUT_PATH}.log"
diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize
index 95384c10d..17b6b980e 100644
--- a/compiler/one-cmds/one-optimize
+++ b/compiler/one-cmds/one-optimize
@@ -22,6 +22,7 @@ usage()
{
echo "Optimize circle model."
echo "Usage: one-optimize"
+ echo " --version Show version information and exit"
echo " --all Enable all optimization algorithms"
echo " --fuse_bcq Enable FuseBCQ Pass"
echo " --fuse_instnorm Enable FuseInstanceNormalization Pass"
@@ -33,7 +34,13 @@ usage()
echo " Enable ResolveCustomOpMatMulPass Pass"
echo " --input_path <path/to/input/circle>"
echo " --output_path <path/to/output/circle>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-optimize
+ exit 255
}
OPTIMIZE_all=0
@@ -52,6 +59,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--all')
OPTIMIZE_all=1
shift
@@ -96,7 +106,6 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
OPTIMIZE_OPTIONS=""
@@ -123,6 +132,13 @@ fi
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# NOTE do not wrap ${OPTIMIZE_OPTIONS} with ""
# optimize circle
echo "${DRIVER_PATH}/circle2circle" ${OPTIMIZE_OPTIONS} \
diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack
index 2bc4c601d..9224b2cd9 100644
--- a/compiler/one-cmds/one-pack
+++ b/compiler/one-cmds/one-pack
@@ -22,9 +22,16 @@ usage()
{
echo "Package circle to nnpkg"
echo "Usage: one-pack"
+ echo " -v, --version Show version information and exit"
echo " -i <path/to/circle>"
echo " -o <path/to/nnpackage/folder>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-pack
+ exit 255
}
# Parse command-line arguments
@@ -36,6 +43,12 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '-v')
+ version
+ ;;
+ '--version')
+ version
+ ;;
'-i')
export INPUT_PATH="$2"
shift 2
@@ -55,12 +68,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# Package circle model file to nnpkg
echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${OUTPUT_PATH}.log"
diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize
index ff9e26672..c74b2c2d2 100644
--- a/compiler/one-cmds/one-quantize
+++ b/compiler/one-cmds/one-quantize
@@ -22,16 +22,23 @@ usage()
{
echo "Quantize circle model."
echo "Usage: one-quantize"
+ echo " --version Show version information and exit"
echo " --input_dtype Input data type (supported: float32, default=float32)"
echo " --quantized_dtype Output quantized data type (supported: uint8, default=uint8)"
- echo " --granularity Quantize granularity (supported: layer, default=layer)"
+ echo " --granularity Quantize granularity (supported: layer, channel, default=layer)"
echo " --min_percentile Minimum percentile (0.0~100.0, default=1.0)"
echo " --max_percentile Maximum percentile (0.0~100.0, default=99.0)"
echo " --mode Record mode (supported: percentile/moving_average, default=percentile)"
echo " --input_path <path/to/input/circle>"
echo " --input_data <path/to/input/data>"
echo " --output_path <path/to/output/circle>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-quantize
+ exit 255
}
INPUT_DTYPE=float32
@@ -50,6 +57,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--input_dtype')
INPUT_DTYPE="$2"
@@ -100,13 +110,11 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then
echo "Error: input data not found"
echo ""
usage
- exit 2
fi
FILE_BASE=$(basename ${OUTPUT_PATH})
@@ -118,6 +126,13 @@ trap "{ rm -rf $TMPDIR; }" EXIT
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# quantize circle
echo "${DRIVER_PATH}/circle-quantizer" \
--quantize_dequantize_weights ${INPUT_DTYPE} ${QUANTIZED_DTYPE} ${GRANULARITY} \
diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake
index 9b858ad90..812149c37 100644
--- a/compiler/one-cmds/requires.cmake
+++ b/compiler/one-cmds/requires.cmake
@@ -3,3 +3,4 @@ require("tflite2circle")
require("circle2circle")
require("circle-quantizer")
require("record-minmax")
+require("vconone")
diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt
index 862660e06..f8a165bd3 100644
--- a/compiler/record-minmax/CMakeLists.txt
+++ b/compiler/record-minmax/CMakeLists.txt
@@ -19,9 +19,14 @@ target_link_libraries(record-minmax safemain)
target_link_libraries(record-minmax luci_import)
target_link_libraries(record-minmax luci_export)
target_link_libraries(record-minmax luci_interpreter)
+target_link_libraries(record-minmax vconone)
install(TARGETS record-minmax DESTINATION bin)
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
nnas_find_package(GTest REQUIRED)
GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp")
target_include_directories(record_minmax_function_test PRIVATE include)
diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp
index ae4fcb7c7..8b09498c3 100644
--- a/compiler/record-minmax/driver/Driver.cpp
+++ b/compiler/record-minmax/driver/Driver.cpp
@@ -17,6 +17,13 @@
#include "RecordMinMax.h"
#include <arser/arser.h>
+#include <vconone/vconone.h>
+
+void print_version(void)
+{
+ std::cout << "record-minmax version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
int entry(const int argc, char **argv)
{
@@ -25,6 +32,13 @@ int entry(const int argc, char **argv)
arser::Arser arser(
"Embedding min/max values of activations to the circle model for post-training quantization");
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument("--input_model")
.nargs(1)
.type(arser::DataType::STR)
@@ -66,7 +80,7 @@ int entry(const int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
auto input_model_path = arser.get<std::string>("--input_model");
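The `--version` support added to record-minmax above (and to tflite2circle later in this patch) follows a single pattern. The sketch below condenses it into a minimal stand-alone driver for illustration only: `example-tool` is a placeholder name, only arser and vconone calls already visible in this patch are used, and the entry point is `entry()` because `main()` is supplied by safemain in these drivers.

```cpp
#include <arser/arser.h>
#include <vconone/vconone.h>

#include <iostream>
#include <stdexcept>
#include <string>

// Callback attached via exit_with(); per the help text, arser runs it for
// "--version" and then exits without parsing the remaining arguments.
void print_version(void)
{
  std::cout << "example-tool version " << vconone::get_string() << std::endl;
  std::cout << vconone::get_copyright() << std::endl;
}

int entry(const int argc, char **argv)
{
  arser::Arser arser("example-tool");

  // Zero-argument, optional flag with the version callback attached.
  arser.add_argument("--version")
      .nargs(0)
      .required(false)
      .default_value(false)
      .help("Show version information and exit")
      .exit_with(print_version);

  try
  {
    arser.parse(argc, argv);
  }
  catch (const std::runtime_error &err)
  {
    // Argument errors now return 255 instead of 0, as in the patched drivers.
    std::cout << err.what() << std::endl;
    std::cout << arser;
    return 255;
  }

  // ... tool-specific work would go here ...
  return 0;
}
```

Running such a driver with `--version` is expected to print the version and copyright through the `exit_with` callback, while malformed arguments now yield exit code 255 rather than 0.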
diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake
index 054503539..f6804cef1 100644
--- a/compiler/record-minmax/requires.cmake
+++ b/compiler/record-minmax/requires.cmake
@@ -1,3 +1,4 @@
require("luci")
require("safemain")
require("arser")
+require("vconone")
diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp
index cf30cd863..a0e65eeb7 100644
--- a/compiler/record-minmax/src/HDF5Importer.cpp
+++ b/compiler/record-minmax/src/HDF5Importer.cpp
@@ -20,6 +20,7 @@
#include <string>
#include <cassert>
+#include <stdexcept>
using Shape = luci_interpreter::Shape;
using DataType = luci_interpreter::DataType;
diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp
index 45f0197c8..410ce3d69 100644
--- a/compiler/record-minmax/src/MinMaxObserver.cpp
+++ b/compiler/record-minmax/src/MinMaxObserver.cpp
@@ -38,7 +38,8 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node,
assert(node->opcode() != luci::CircleOpcode::UNPACK);
assert(node->opcode() != luci::CircleOpcode::WHILE);
- if (node->opcode() == luci::CircleOpcode::CONST)
+ if (node->opcode() == luci::CircleOpcode::CONST ||
+ node->opcode() == luci::CircleOpcode::CIRCLECONST)
{
// node is not activation. Do nothing.
return;
diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp
index d12a0d3ae..17c6aa6ff 100644
--- a/compiler/record-minmax/src/RecordMinMax.cpp
+++ b/compiler/record-minmax/src/RecordMinMax.cpp
@@ -158,7 +158,7 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input
auto node = iter->first;
auto minmax = iter->second;
- float min, max;
+ float min{0.0f}, max{0.0f};
if (mode == "percentile")
{
min = getNthPercentile(minmax.min_vector, min_percentile);
diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp
index 13b464db9..e2f135a4e 100644
--- a/compiler/record-minmax/tests/RecordFunction.test.cpp
+++ b/compiler/record-minmax/tests/RecordFunction.test.cpp
@@ -32,6 +32,8 @@ TEST(GetNthPercentileTest, Edge)
EXPECT_FLOAT_NEAR(0, getNthPercentile(input, 0));
EXPECT_FLOAT_NEAR(9, getNthPercentile(input, 100));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, Simple)
@@ -47,6 +49,8 @@ TEST(GetNthPercentileTest, Simple)
{
EXPECT_FLOAT_NEAR(0.09 * std::floor(i) + 0.045, getNthPercentile(input, i));
}
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, Float)
@@ -61,6 +65,8 @@ TEST(GetNthPercentileTest, Float)
EXPECT_FLOAT_NEAR(2.799942346802177, getNthPercentile(input, 1));
EXPECT_FLOAT_NEAR(7.768503955476342, getNthPercentile(input, 3.14));
EXPECT_FLOAT_NEAR(99.40456084968194, getNthPercentile(input, 99));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, FloatWithNegative)
@@ -75,6 +81,8 @@ TEST(GetNthPercentileTest, FloatWithNegative)
EXPECT_FLOAT_NEAR(-47.20005765319782, getNthPercentile(input, 1));
EXPECT_FLOAT_NEAR(-42.23149604452366, getNthPercentile(input, 3.14));
EXPECT_FLOAT_NEAR(49.40456084968194, getNthPercentile(input, 99));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, SigleElement)
@@ -84,6 +92,8 @@ TEST(GetNthPercentileTest, SigleElement)
EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 0));
EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 50));
EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 100));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, OutOfBoundary_NEG)
@@ -92,6 +102,8 @@ TEST(GetNthPercentileTest, OutOfBoundary_NEG)
EXPECT_THROW(getNthPercentile(input, -1), std::runtime_error);
EXPECT_THROW(getNthPercentile(input, 101), std::runtime_error);
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, EmptyVector_NEG)
@@ -99,6 +111,8 @@ TEST(GetNthPercentileTest, EmptyVector_NEG)
std::vector<float> input;
EXPECT_THROW(getNthPercentile(input, 10), std::runtime_error);
+
+ SUCCEED();
}
} // namespace record_minmax
diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt
index d33059fde..4421a4660 100644
--- a/compiler/tfl-verify/CMakeLists.txt
+++ b/compiler/tfl-verify/CMakeLists.txt
@@ -6,6 +6,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
add_executable(tfl-verify ${SOURCES})
target_include_directories(tfl-verify PRIVATE src)
+target_link_libraries(tfl-verify arser)
target_link_libraries(tfl-verify foder)
target_link_libraries(tfl-verify mio_tflite)
target_link_libraries(tfl-verify safemain)
diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake
index ed6b84db5..79503f325 100644
--- a/compiler/tfl-verify/requires.cmake
+++ b/compiler/tfl-verify/requires.cmake
@@ -1,3 +1,4 @@
+require("arser")
require("foder")
require("mio-tflite")
require("safemain")
diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp
index 81f6d5489..6d1897607 100644
--- a/compiler/tfl-verify/src/Driver.cpp
+++ b/compiler/tfl-verify/src/Driver.cpp
@@ -16,22 +16,31 @@
#include "VerifyFlatBuffers.h"
+#include <arser/arser.h>
+
#include <iostream>
#include <memory>
#include <string>
int entry(int argc, char **argv)
{
- if (argc != 2)
+ arser::Arser arser;
+ arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify");
+
+ try
{
- std::cerr << "ERROR: Failed to parse arguments" << std::endl;
- std::cerr << std::endl;
- std::cerr << "USAGE: " << argv[0] << " [tflite]" << std::endl;
+ arser.parse(argc, argv);
+ }
+ catch (const std::runtime_error &err)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
return 255;
}
+
auto verifier = std::make_unique<VerifyFlatbuffers>();
- std::string model_file = argv[argc - 1];
+ std::string model_file = arser.get<std::string>("tflite");
std::cout << "[ RUN ] Check " << model_file << std::endl;
diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp
index 932a649c5..692ce48c1 100644
--- a/compiler/tflchef/core/src/ModelChef.cpp
+++ b/compiler/tflchef/core/src/ModelChef.cpp
@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
quant_builder.add_min(quant_min);
quant_builder.add_scale(quant_scale);
quant_builder.add_zero_point(quant_zero_point);
+ quant_builder.add_quantized_dimension(quant.quantized_dimension());
// Update QuantizationParameters Index
quant_index = quant_builder.Finish();
diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto
index 792503bc9..55785c35d 100644
--- a/compiler/tflchef/proto/tflchef.proto
+++ b/compiler/tflchef/proto/tflchef.proto
@@ -35,6 +35,7 @@ message TensorQuantization {
repeated float max = 2;
repeated float scale = 3;
repeated int64 zero_point = 4;
+ optional int32 quantized_dimension = 5 [default = 0];
}
message Operand {
diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp
index db62d0e40..088961c1c 100644
--- a/compiler/tflchef/tflite/src/RecipeChef.cpp
+++ b/compiler/tflchef/tflite/src/RecipeChef.cpp
@@ -184,6 +184,8 @@ std::unique_ptr<ModelRecipe> generate_recipe(const tflite::Model *model)
for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
chef_quant->add_zero_point(quant->zero_point()->Get(idx));
}
+ tflchef::TensorQuantization *chef_quant = operand->mutable_quant();
+ chef_quant->set_quantized_dimension(quant->quantized_dimension());
}
}
diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp
index cecfeeb3e..46e5b5583 100644
--- a/compiler/tflchef/tools/file/Driver.cpp
+++ b/compiler/tflchef/tools/file/Driver.cpp
@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
int32_t model_version = 1;
diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp
index 1116dec34..4d795a3d0 100644
--- a/compiler/tflchef/tools/reverse/Driver.cpp
+++ b/compiler/tflchef/tools/reverse/Driver.cpp
@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
std::string tflite_path = arser.get<std::string>("tflite");
diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp
index 3961d2f17..38c9c062f 100644
--- a/compiler/tfldump/driver/Driver.cpp
+++ b/compiler/tfldump/driver/Driver.cpp
@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << '\n';
std::cout << arser;
- return 0;
+ return 255;
}
std::string tflite_path = arser.get<std::string>("tflite");
diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt
index a0a2e026b..b1d1f6149 100644
--- a/compiler/tflite2circle/CMakeLists.txt
+++ b/compiler/tflite2circle/CMakeLists.txt
@@ -14,5 +14,6 @@ target_link_libraries(tflite2circle arser)
target_link_libraries(tflite2circle safemain)
target_link_libraries(tflite2circle mio_tflite)
target_link_libraries(tflite2circle mio_circle)
+target_link_libraries(tflite2circle vconone)
install(TARGETS tflite2circle DESTINATION bin)
diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp
index 67b8e33bc..2f11e0a13 100644
--- a/compiler/tflite2circle/driver/Driver.cpp
+++ b/compiler/tflite2circle/driver/Driver.cpp
@@ -24,10 +24,25 @@
#include "CircleModel.h"
#include "TFLModel.h"
+#include <vconone/vconone.h>
+
+void print_version(void)
+{
+ std::cout << "tflite2circle version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
+
int entry(int argc, char **argv)
{
arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"};
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument("tflite")
.nargs(1)
.type(arser::DataType::STR)
@@ -42,7 +57,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
std::string tfl_path = arser.get<std::string>("tflite");
diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake
index ff19b7491..837c287b6 100644
--- a/compiler/tflite2circle/requires.cmake
+++ b/compiler/tflite2circle/requires.cmake
@@ -2,3 +2,4 @@ require("arser")
require("mio-tflite")
require("mio-circle")
require("safemain")
+require("vconone")
diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt
new file mode 100644
index 000000000..b8cb79331
--- /dev/null
+++ b/compiler/vconone/CMakeLists.txt
@@ -0,0 +1,31 @@
+if (NOT VCONONE_VERSION)
+ set(VCONONE_VERSION 0x0000000000080001)
+ # NOTE order is [build patch minor major]
+  # if VCONONE_VERSION is set with the -D option, it will be cached;
+  # you may have to remove the cache file if you remove the -D option
+endif()
+
+configure_file(version_cfg.h.in version_cfg.h @ONLY)
+
+set(DRIVER "driver/driver.cpp")
+
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_library(vconone STATIC ${SOURCES})
+target_include_directories(vconone PUBLIC include)
+target_include_directories(vconone PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+add_executable(one-version ${DRIVER})
+target_link_libraries(one-version vconone)
+install(TARGETS one-version DESTINATION bin)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(vconone_test ${TESTS})
+target_link_libraries(vconone_test vconone)
diff --git a/compiler/vconone/README.md b/compiler/vconone/README.md
new file mode 100644
index 000000000..c08dd63d3
--- /dev/null
+++ b/compiler/vconone/README.md
@@ -0,0 +1,14 @@
+# vconone
+
+_vconone_ provides the version number and version strings for one-* commands and
+command line tools.
+
+# Revise version number
+
+To revise the version number, update `VCONONE_VERSION` in `CMakeLists.txt`
+or pass `-DVCONONE_VERSION=0x0000000100080001` at the cmake configure step.
+
+The number packs four 16-bit integers, `build`, `patch`, `minor` and `major`, in that
+order. `build` is not used for now.
+
+For example, version `0x0000000100080001` is interpreted as `1.8.1`.
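As a quick check of the encoding described in the README above, the following stand-alone sketch (not part of the patch) unpacks the example value with plain bit shifts; it avoids the `union version` that vconone itself defines, whose field order corresponds to this layout on little-endian hosts.

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  // Example value from the README above (interpreted as 1.8.1).
  const uint64_t v = 0x0000000100080001ULL;

  // Reading 16-bit fields from the least significant word upward yields
  // major, minor, patch, build (the layout vconone's union exposes).
  const uint16_t major = static_cast<uint16_t>(v & 0xFFFFu);
  const uint16_t minor = static_cast<uint16_t>((v >> 16) & 0xFFFFu);
  const uint16_t patch = static_cast<uint16_t>((v >> 32) & 0xFFFFu);
  const uint16_t build = static_cast<uint16_t>((v >> 48) & 0xFFFFu);

  std::cout << major << "." << minor << "." << patch << std::endl; // prints "1.8.1"
  std::cout << "build: " << build << std::endl;                    // build is unused for now

  return 0;
}
```

Compiling and running this prints `1.8.1`, matching the interpretation given in the README.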
diff --git a/compiler/vconone/driver/driver.cpp b/compiler/vconone/driver/driver.cpp
new file mode 100644
index 000000000..12bd0eef2
--- /dev/null
+++ b/compiler/vconone/driver/driver.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vconone/vconone.h>
+
+#include <string>
+#include <iostream>
+
+int main(int argc, char *argv[])
+{
+ auto str = vconone::get_string();
+ if (argc >= 2)
+ {
+ for (int c = 1; c < argc; ++c)
+ std::cout << argv[c] << " ";
+ std::cout << "version " << str << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+ }
+ else
+ std::cout << str;
+
+ return 0;
+}
diff --git a/compiler/vconone/include/vconone/vconone.h b/compiler/vconone/include/vconone/vconone.h
new file mode 100644
index 000000000..a6a1998a5
--- /dev/null
+++ b/compiler/vconone/include/vconone/vconone.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __VCON_ONE_H__
+#define __VCON_ONE_H__
+
+#include <cstdint>
+#include <string>
+
+namespace vconone
+{
+
+struct four
+{
+ uint16_t major;
+ uint16_t minor;
+ uint16_t patch;
+ uint16_t build; // build is not used for now
+};
+
+union version {
+ uint64_t v;
+ four f;
+};
+
+/**
+ * @brief get_number will return the version union structure
+ */
+version get_number(void);
+
+/**
+ * @brief get_string will return a string of major.minor.patch (without build)
+ */
+std::string get_string(void);
+
+/**
+ * @brief get_string4 will return a string of major.minor.patch.build
+ */
+std::string get_string4(void);
+
+/**
+ * @brief get_copyright will return the copyright string
+ */
+std::string get_copyright(void);
+
+} // namespace vconone
+
+#endif // __VCON_ONE_H__
diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp
new file mode 100644
index 000000000..9b693c621
--- /dev/null
+++ b/compiler/vconone/src/version.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "vconone/vconone.h"
+
+#include "version_cfg.h"
+
+#include <sstream>
+
+namespace vconone
+{
+
+version get_number(void)
+{
+ version v;
+ v.v = VCONONE_VERSION;
+ return v;
+}
+
+std::string get_string4(void)
+{
+ std::ostringstream ss;
+
+ auto v = get_number();
+ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch) << "."
+ << unsigned(v.f.build);
+
+ return ss.str();
+}
+
+std::string get_string(void)
+{
+ std::ostringstream ss;
+
+ auto v = get_number();
+ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch);
+
+ return ss.str();
+}
+
+std::string get_copyright(void)
+{
+ std::string str;
+ str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n";
+ str += "Licensed under the Apache License, Version 2.0\r\n";
+ str += "https://github.com/Samsung/ONE";
+ return str;
+}
+
+} // namespace vconone
diff --git a/compiler/vconone/src/version.test.cpp b/compiler/vconone/src/version.test.cpp
new file mode 100644
index 000000000..35a0647c1
--- /dev/null
+++ b/compiler/vconone/src/version.test.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vconone/vconone.h>
+
+#include <gtest/gtest.h>
+
+TEST(vconone, version_number)
+{
+ auto v = vconone::get_number();
+
+ ASSERT_NE(0x0000000000000000ULL, v.v);
+}
+
+TEST(vconone, version_string)
+{
+ auto str = vconone::get_string();
+
+ ASSERT_NE("..", str);
+ ASSERT_NE("", str);
+}
+
+TEST(vconone, version_string4)
+{
+ auto str = vconone::get_string4();
+
+ ASSERT_NE("...", str);
+ ASSERT_NE("", str);
+}
+
+TEST(vconone, copyright)
+{
+ auto str = vconone::get_copyright();
+
+ ASSERT_NE("", str);
+}
diff --git a/compiler/vconone/version_cfg.h.in b/compiler/vconone/version_cfg.h.in
new file mode 100644
index 000000000..aa3ad9e70
--- /dev/null
+++ b/compiler/vconone/version_cfg.h.in
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __VCON_ONE_VERSION_CFG_H__
+#define __VCON_ONE_VERSION_CFG_H__
+
+#define VCONONE_VERSION @VCONONE_VERSION@ULL
+
+#endif // __VCON_ONE_VERSION_CFG_H__
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
deleted file mode 100644
index 9699b5c00..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLArgOperationKernel.h
- * @brief This file defines CLArgOperationKernel
- * @ingroup COM_AI_RUNTIME
- */
-
-#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
-#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to define interface for the argop kernel.
- */
-class CLArgOperationKernel : public ICLKernel
-{
-public:
- /**
- * @brief Default constructor.
- */
- CLArgOperationKernel();
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers).
- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
- */
- CLArgOperationKernel(const CLArgOperationKernel &) = delete;
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers).
- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
- * @return Reference of this instance
- */
- CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete;
- /**
- * @brief Allow instances of this class to be moved
- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
- */
- CLArgOperationKernel(CLArgOperationKernel &&) = default;
- /**
- * @brief Allow instances of this class to be moved
- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
- * @return Reference of this instance
- */
- CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default;
- /**
- * @brief Initialise the kernel's input, output and border mode.
- * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
- * @param[out] output The output tensor, Data types supported: S32.
- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
- * @param[in] op Arg operation to perform.
- * return N/A
- */
- void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op);
- /**
- * @brief Static function to check if given info will lead to a valid configuration of @ref
- * CLArgOperationKernel
- * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
- * @param[in] output The output tensor info, Data types supported: S32.
- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
- * @param[in] op Arg operation to perform.
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
- ArgOperation op);
-
- /*
- * @brief Run CLArgOperationKernel op
- * @param[in] window Window to be used for in_slice
- * @param[in] queue cl::CommandQueue
- * @return N/A
- */
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- uint32_t _axis;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
deleted file mode 100644
index b0357fe99..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLCastKernel.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file defines CLCastKernel class
- */
-
-#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
-#define __ARM_COMPUTE_CLCASTKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to define OpenCL kernel for cast operation
- */
-class CLCastKernel : public ICLKernel
-{
-public:
- /**
- * @brief Construct CLCastKernel object
- */
- CLCastKernel();
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLCastKernel(const CLCastKernel &) = delete;
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLCastKernel &operator=(const CLCastKernel &) = delete;
-
- /**
- * @brief Construct CLCastKernel object using default move constructor
- * @param[in] CLCastKernel object to move
- */
- CLCastKernel(CLCastKernel &&) = default;
-
- /**
- * @brief Allow instances of this class to be moved
- * @param[in] CLCastKernel object to move
- */
- CLCastKernel &operator=(CLCastKernel &&) = default;
-
- /**
- * @brief Destruct this CLCastKernel object
- */
- ~CLCastKernel() = default;
-
- /**
- * @brief Initialise the kernel's input and output.
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] input_subtype Sub data type of input.
- * @return N/A
- */
- void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
-
- /**
- * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
- * queue.
- * @note The queue is *not* flushed by this method, and therefore the kernel will not have
- * been executed by the time this method returns.
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of
- * the window returned by window()).
- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A
- * @return N/A
- */
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
deleted file mode 100644
index 8615cf120..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
-#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform depthTospace operation */
-class CLDepthToSpaceKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthToSpaceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default;
- /** Default destructor */
- ~CLDepthToSpaceKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h
deleted file mode 100644
index 9321c3677..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__
-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * This kernel performs the following computation:
- *
- * -# Convert a values from int8 to int32
- * -# Convert b values from int8 to int32
- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class CLGEMMLowpMatrixMultiplyKernelEx : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernelEx(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernelEx &operator=(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernelEx(CLGEMMLowpMatrixMultiplyKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernelEx &operator=(CLGEMMLowpMatrixMultiplyKernelEx &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p
- * input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type
- * supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of
- * the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLGEMMLowpMatrixMultiplyKernelEx
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p
- * input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type
- * supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of
- * the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1,
- const ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
deleted file mode 100644
index dd2dbf6a4..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
-#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to calculate PReLU*/
-class CLPReLUKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLPReLUKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLPReLUKernel(const CLPReLUKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLPReLUKernel(CLPReLUKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
- /** Initialize the kernel's input, output.
- *
- * @param[in] input Source tensor1.
- * @param[in] alpha Source tensor2.
- * @param[out] output Output tensor.
- */
- void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_alpha;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
deleted file mode 100644
index 4c0a82ce1..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
-#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform spaceTodepth operation */
-class CLSpaceToDepthKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLSpaceToDepthKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
- /** Default destructor */
- ~CLSpaceToDepthKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
deleted file mode 100644
index 9d174deb5..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL.
- */
-class CLTransposeConvLayerUpsampleKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLTransposeConvLayerUpsampleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsampleKernel &
- operator=(const CLTransposeConvLayerUpsampleKernel &) = delete;
- /** Default Move Constructor. */
- CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default;
- /** Default move assignment operator */
- CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default;
- /** Default destructor */
- ~CLTransposeConvLayerUpsampleKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input. All but
- * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only
- * performed within the XY-plane.
- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be
- * filled with zero.
- * @param[in] info Contains padding and stride information described in @ref
- * PadStrideInfo.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
- const PadStrideInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayerUpsample
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
- * @param[in] output Destination tensor info. Data types supported: same as @p input. All
- * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is
- * only performed within the XY-plane.
- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled
- * with zero.
- * @param[in] info Contains padding and stride information described in @ref
- * PadStrideInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const BorderSize &inner_border, const PadStrideInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- BorderSize _inner_border;
- PadStrideInfo _info;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
deleted file mode 100644
index d4c9c610a..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
-#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** CPP kernel to perform tensor upsample.
- *
- */
-class CPPUpsampleKernelEx : public ICPPKernel
-{
-public:
- const char *name() const override { return "CPPUpsampleKernelEx"; }
- /** Default constructor */
- CPPUpsampleKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default;
- /** Default destructor */
- ~CPPUpsampleKernelEx() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] info Padding info.
- */
- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- PadStrideInfo _info;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
deleted file mode 100644
index 4e9f097c2..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NECASTKERNEL_H__
-#define __ARM_COMPUTE_NECASTKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the cast layer kernel. */
-class NECastKernel : public INEKernel
-{
-public:
- const char *name() const override { return "NECastKernel"; }
- /** Default constructor */
- NECastKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECastKernel(const NECastKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECastKernel &operator=(const NECastKernel &) = delete;
- /** Default Move Constructor. */
- NECastKernel(NECastKernel &&) = default;
- /** Default move assignment operator */
- NECastKernel &operator=(NECastKernel &&) = default;
- /** Default destructor */
- ~NECastKernel() = default;
- /** Set input, output tensors.
- *
- * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[out] output Destination tensor with the same dimensions of input. Data type supported:
- * U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- */
- void configure(const ITensor *input, ITensor *output, SubDataType input_subtype);
- /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel
- *
- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- SubDataType _input_subtype;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
deleted file mode 100644
index b62897e68..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the depth to space kernel */
-class NEDepthToSpaceLayerKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; }
- /** Default constructor */
- NEDepthToSpaceLayerKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default;
- /** Default destructor */
- ~NEDepthToSpaceLayerKernelEx() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape x value.
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEDepthToSpaceLayerKernelEx.
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */
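For reference, the block-shape rearrangement this removed kernel documents can be written as a minimal standalone sketch, assuming NHWC layout and TensorFlow-style channel ordering (an assumption; the kernel also supports other layouts and data types, and this sketch does not use the arm_compute API):

#include <cstddef>
#include <cstdint>
#include <vector>

// input  shape: [N, H, W, C], row-major, with C divisible by block_shape * block_shape
// output shape: [N, H * block_shape, W * block_shape, C / (block_shape * block_shape)]
std::vector<float> depth_to_space_nhwc(const std::vector<float> &in, int32_t N, int32_t H,
                                       int32_t W, int32_t C, int32_t block_shape)
{
  const int32_t bs = block_shape;
  const int32_t Co = C / (bs * bs);
  std::vector<float> out(in.size());
  for (int32_t n = 0; n < N; ++n)
    for (int32_t h = 0; h < H; ++h)
      for (int32_t w = 0; w < W; ++w)
        for (int32_t c = 0; c < C; ++c)
        {
          const int32_t bi = (c / Co) / bs; // vertical position inside the block
          const int32_t bj = (c / Co) % bs; // horizontal position inside the block
          const int32_t co = c % Co;        // output channel
          const std::size_t src = ((std::size_t(n) * H + h) * W + w) * C + c;
          const std::size_t dst =
              ((std::size_t(n) * H * bs + h * bs + bi) * W * bs + w * bs + bj) * Co + co;
          out[dst] = in[src];
        }
  return out;
}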
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
deleted file mode 100644
index 57de78dd8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
-#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for an element-wise unary operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x) = OP(input(x))@f]
- *
- */
-class NEElementwiseUnaryKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NEElementwiseUnaryKernelEx"; }
- /** Default constructor */
- NEElementwiseUnaryKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default;
- /** Default destructor */
- ~NEElementwiseUnaryKernelEx() = default;
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEElementwiseUnaryKernelEx
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input First tensor input. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEElementwiseUnaryKernelEx
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a Status
- */
- static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input,
- const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Common signature for all the specialised arithmetic functions
- *
- * @param[in] input An input tensor. Data types supported: F16/F32/S32.
- * @param[out] output The output tensor. Data types supported: Same as @p input.
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output,
- const Window &window);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output);
-
- /** Function to use for the particular tensor types passed to configure() */
- std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function;
-
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
deleted file mode 100644
index 722efd3d0..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__
-#define __ARM_COMPUTE_NEPRELUKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform Parametric Rectified Linear Unit
- *
- * Result is computed by:
- * @f[ output(x) = alpha * x for x < 0, output(x) = x for x >= 0 @f]
- */
-class NEPReLUKernel : public INEKernel
-{
-public:
- const char *name() const override { return "NEPReLUKernel"; }
- /** Default constructor */
- NEPReLUKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPReLUKernel(const NEPReLUKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPReLUKernel &operator=(const NEPReLUKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPReLUKernel(NEPReLUKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPReLUKernel &operator=(NEPReLUKernel &&) = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/F32
- * @param[in] alpha Alpha tensor. Data types supported: Same as @p input
- * @param[out] output Output tensor. Data types supported: Same as @p input
- */
- void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEPReLUKernel.h
- *
- * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32.
- * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha,
- const ITensorInfo *output);
- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
- const ITensorInfo &output);
-
-private:
- const ITensor *_input; /**< Source tensor */
- const ITensor *_alpha; /**< Alpha tensor */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */
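As an illustration of the formula documented above (output(x) = alpha * x for x < 0, output(x) = x for x >= 0), here is a minimal standalone sketch assuming an element-wise alpha tensor of the same size as the input; it does not use the arm_compute API and ignores broadcasting and quantized types:

#include <cstddef>
#include <vector>

// Applies PReLU element-wise: negative inputs are scaled by the matching alpha value,
// non-negative inputs pass through unchanged.
std::vector<float> prelu_reference(const std::vector<float> &input,
                                   const std::vector<float> &alpha)
{
  std::vector<float> output(input.size());
  for (std::size_t i = 0; i < input.size(); ++i)
    output[i] = (input[i] >= 0.0f) ? input[i] : alpha[i] * input[i];
  return output;
}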
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
deleted file mode 100644
index 0ffcf6be8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
-#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the space to depth kernel */
-class NESpaceToDepthLayerKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NESpaceToDepthLayerKernelEx"; }
- /** Default constructor */
- NESpaceToDepthLayerKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default;
- /** Default destructor */
- ~NESpaceToDepthLayerKernelEx() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToDepthLayerKernelEx
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index 97bc4cea5..cfbd13436 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -16,25 +16,14 @@
#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
#define __ARM_COMPUTE_CLFUNCTIONSEX_H__
-#include <arm_compute/runtime/CL/functions/CLArgOperation.h>
-#include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h>
#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
-#include <arm_compute/runtime/CL/functions/CLCast.h>
-#include <arm_compute/runtime/CL/functions/CLDepthToSpace.h>
#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
#include <arm_compute/runtime/CL/functions/CLGatherEx.h>
#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
-#include <arm_compute/runtime/CL/functions/CLLogicalNot.h>
#include <arm_compute/runtime/CL/functions/CLNeg.h>
-#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h>
-#include <arm_compute/runtime/CL/functions/CLPReLU.h>
#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
-#include <arm_compute/runtime/CL/functions/CLRNNLayerEx.h>
-#include <arm_compute/runtime/CL/functions/CLSpaceToDepth.h>
-#include <arm_compute/runtime/CL/functions/CLSplit.h>
-#include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h>
#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
deleted file mode 100644
index c37096f7c..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLArgOperation.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLArgOperation class
- */
-
-#ifndef __ARM_COMPUTE_CLARGOPERATION_H__
-#define __ARM_COMPUTE_CLARGOPERATION_H__
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to execute CLArgOperation operation
- */
-class CLArgOperation : public IFunction
-{
-public:
- /**
- * @brief Construct a new CLArgOperation object
- */
- CLArgOperation();
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLArgOperation(const CLArgOperation &) = delete;
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLArgOperation &operator=(const CLArgOperation &) = delete;
-
- /**
- * @brief Construct a new CLArgOperation object by using copy constructor
- * @param[in] CLArgOperation object to move
- */
- CLArgOperation(CLArgOperation &&) = default;
-
- /**
- * @brief Assign a CLArgOperation object.
- * @param[in] CLArgOperation object to assign. This object will be moved.
- */
- CLArgOperation &operator=(CLArgOperation &&) = default;
-
- /**
- * @brief Initialise the kernel's inputs and outputs.
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
- * @param[out] output The result of arg operation. Data types supported: S32.
- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
- * @param[in] op Arg operation to perform.
- * @return N/A
- */
- void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, ArgOperation op);
-
- /**
- * @brief Static function to check if given info will lead to a valid configuration
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
- * @param[out] output The result of arg operation. Data types supported: S32.
- * @param[in] op Arg operation to perform.
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
- const ITensorInfo *output, ArgOperation op);
- /**
- * @brief Run the OpenCL kernel for this operation
- * @return N/A
- */
- void run() override;
-
-private:
- ICLTensor *_input{nullptr};
- ICLTensor *_output{nullptr};
- std::vector<uint32_t> _axis{};
- ArgOperation _arg_op{ArgOperation::MAX};
-
- std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
- std::unique_ptr<CLArgOperationKernel[]> _argop_kernels{nullptr};
- size_t _num_of_kernels{0};
-};
-}
-#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */
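As an illustration of the MAX variant of the removed operation, a minimal standalone sketch that reduces along the last axis of a row-major [rows, cols] float matrix and produces S32 indices, matching the S32 output type documented above (single axis and float only here; the removed class also handled multiple axes and other input types):

#include <cstdint>
#include <vector>

// For each row, returns the column index of the maximum element.
std::vector<int32_t> argmax_last_axis(const std::vector<float> &in, int32_t rows, int32_t cols)
{
  std::vector<int32_t> out(rows, 0);
  for (int32_t r = 0; r < rows; ++r)
  {
    int32_t best = 0;
    for (int32_t c = 1; c < cols; ++c)
      if (in[r * cols + c] > in[r * cols + best])
        best = c;
    out[r] = best;
  }
  return out;
}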
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
deleted file mode 100644
index eed5cb8a4..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
-#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLBatchToSpaceNDKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the tensor of the output tensor's type.
- */
-class CLBatchToSpaceND : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] block_size A pointer to an array of integer values specifying block sizes
- * for spatial dimension.
- */
- void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size);
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
deleted file mode 100644
index ebe0d8a1c..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLCast.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLCast class
- */
-
-#ifndef __ARM_COMPUTE_CLCAST_H__
-#define __ARM_COMPUTE_CLCAST_H__
-
-#include "arm_compute/core/TypesEx.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLCastKernel.
- * This converts the input tensor to the tensor of the output tensor's type.
- */
-class CLCast : public ICLSimpleFunction
-{
-public:
- /**
- * @brief Initialise the kernel's input and output
- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * The input tensor is [in, out] because its TensorInfo might be
- * modified inside the kernel.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] input_subtype Sub data type of input.
- */
- void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
-};
-}
-#endif /* __ARM_COMPUTE_CLCAST_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
deleted file mode 100644
index d52a538df..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__
-#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLDepthToSpaceKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the tensor of the output tensor's type.
- */
-class CLDepthToSpace : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[block_size] block size integer only
- */
- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
-};
-} // namesace arm_compute
-
-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
new file mode 100644
index 000000000..409eaf593
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Function to run the deconvolution layer.
+ *
+ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input and pad is the amount of padding.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where:
+ * width_input is the size of the first input dimension.
+ * height_input is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimensions.
+ *
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLDeconvolutionLayerUpsample
+ * -# @ref CLConvolutionLayer
+ *
+ * And the following CPP kernels:
+ * -# @ref CLReverse
+ *
+ */
+class CLDirectTransposeConvLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete;
+ /** Default move constructor */
+ CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete;
+ /** Default move assignment operator */
+ CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for
+ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[out] output Output tensor. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this
+ * is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
+ * an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for
+ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[out] output Output tensor. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLDirectTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for input
+ * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ CLDeconvolutionLayerUpsample _scale_f;
+ CLConvolutionLayer _conv_f;
+ CLReverse _flip_weights;
+
+ CLTensor _scaled_output;
+ ICLTensor *_original_weights;
+ CLTensor _weights_flipped;
+ CLTensor _flip_axis;
+
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */
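The output-size relation documented in this new header can be checked with a small standalone sketch (illustration only; it ignores the invalid_right/invalid_bottom adjustments and the weights flip):

#include <cstdio>

// Computes one spatial output dimension of the transpose convolution from the relation
// width_output = (width_input - 1) * stride - 2 * padding + kernel documented above.
static int transpose_conv_out_dim(int in_dim, int stride, int padding, int kernel)
{
  return (in_dim - 1) * stride - 2 * padding + kernel;
}

int main()
{
  // Example: a 16x16 input with a 3x3 kernel, stride 2 and no padding gives a 33x33 output.
  const int w_out = transpose_conv_out_dim(16, 2, 0, 3);
  const int h_out = transpose_conv_out_dim(16, 2, 0, 3);
  std::printf("output: %dx%d\n", w_out, h_out);
  return 0;
}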
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
index 1a0284a3e..f3266f688 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
@@ -50,7 +50,7 @@
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
namespace arm_compute
{
@@ -168,7 +168,7 @@ private:
CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel;
CLScaleFactorSymm8Kernel _scale_factor_kernel;
CLQuantizationSymmetricKernel _quant_input_kernel;
- CLGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
CLMultiplyScaleFactorKernel _multiply_scale_kernel;
CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to
// add bias in
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h
deleted file mode 100644
index 68aba74ab..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-class IMemoryManager;
-class ICLTensor;
-
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the
- * following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of
- * GEMMInfo is FALSE)
- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
- *
-*/
-class CLGEMMLowpMatrixMultiplyCoreEx : public IFunction
-{
-public:
- /** Constructor */
- CLGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyCoreEx(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move constructor */
- CLGEMMLowpMatrixMultiplyCoreEx(CLGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyCoreEx &operator=(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move assignment operator */
- CLGEMMLowpMatrixMultiplyCoreEx &operator=(CLGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Initialise the kernel's inputs, output
- *
- * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
- * This kernel performs the following computations:
- *
- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
- * -# Compute the matrix product of the resulting a * b in int32.
- * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
- *
- * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
- * S32
- * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if
- * gemm_info.gemmlowp_output_stage != NONE
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and
- * if the reshape of matrix B should be executed only for the first run
- */
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output,
- const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLGEMMLowpMatrixMultiplyCoreEx
- *
- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8.
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
- * supported: S32
- * @param[in] output Output tensor info. Data type supported: S32 or QASYMM8 if
- * gemm_info.gemmlowp_output_stage != NONE
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and
- * if the reshape of matrix B should be executed only for the first run
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
-
- // Kernels used
- CLGEMMLowpMatrixMultiplyKernelEx _mm_midgard_kernel;
- CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
- CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
-
- // Temporary tensors
- CLTensor _vector_sum_col;
- CLTensor _vector_sum_row;
-
- int32_t _a_offset;
- int32_t _b_offset;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
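
With CLGEMMLowpMatrixMultiplyCoreEx gone, CLFullyConnectedHybridLayer above now drives the stock CLGEMMLowpMatrixMultiplyCore, whose configure() matches the shape documented in the deleted header (matrix C may be nullptr; the output stays S32 when no output stage is requested). A minimal sketch of such a call site follows; the shapes, quantization parameters and surrounding setup are illustrative assumptions, not values taken from this patch.

// Sketch only: exercises the stock CL GEMMLowp core with the same configure()
// shape as the removed Ex variant documented above.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

void gemmlowp_cl_sketch()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  // A: M x K, B: K x N, output: M x N; TensorShape is (width, height) = (cols, rows).
  CLTensor a, b, dst;
  a.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
  b.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
  dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::S32));

  CLGEMMLowpMatrixMultiplyCore gemmlowp;
  gemmlowp.configure(&a, &b, nullptr, &dst); // c == nullptr, S32 accumulators out

  a.allocator()->allocate();
  b.allocator()->allocate();
  dst.allocator()->allocate();

  gemmlowp.run();
  CLScheduler::get().sync();
}
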
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
deleted file mode 100644
index 51216715f..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__
-#define __ARM_COMPUTE_CLLOGICALNOT_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-class CLLogicalNot : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source and destination.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8.
- * @param[out] output Output tensor. Data types supported: QASYMM8.
- */
- void configure(ICLTensor *input, ICLTensor *output);
-};
-
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */
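
For reference, a call site of the removed CLLogicalNot looked like the sketch below. Only the configure(input, output) shape and the QASYMM8 data type come from the deleted declaration; the surrounding tensor setup is assumed.

// Sketch of a pre-removal call site, per the deleted declaration above.
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLLogicalNot.h" // removed by this patch

void logical_not_sketch(arm_compute::CLTensor &in, arm_compute::CLTensor &out)
{
  arm_compute::CLLogicalNot logical_not;
  logical_not.configure(&in, &out); // both tensors QASYMM8, same shape
  logical_not.run();
}
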
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
deleted file mode 100644
index 7fbe558ff..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLPRELU_H__
-#define __ARM_COMPUTE_CLPRELU_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-class CLPReLU : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source and destination.
- *
- * @param[in] input. Data types supported:
- * QASYMM8/F16/F32.
- * @param[in] alpha. Data types supported:
- * QASYMM8/F16/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLPRELU_H__*/
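
The removed CLPReLU took a separate alpha tensor holding the negative-slope values alongside the input. A hedged sketch of a pre-removal call site follows; shapes and types are illustrative, and the upstream CLPReluLayer presumably covers this case now, though that mapping is not part of this diff.

// Sketch of a pre-removal CLPReLU call site (shapes/types are illustrative).
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPReLU.h" // removed by this patch

void prelu_sketch(arm_compute::CLTensor &input, arm_compute::CLTensor &alpha,
                  arm_compute::CLTensor &output)
{
  arm_compute::CLPReLU prelu;
  prelu.configure(&input, &alpha, &output); // QASYMM8/F16/F32; output same type as input
  prelu.run();
}
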
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
deleted file mode 100644
index e83fb01cd..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLPixelWiseDivision.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLPixelWiseDivision class
- */
-#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
-#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLPixelWiseDivisionKernel.
- */
-class CLPixelWiseDivision : public ICLSimpleFunction
-{
-public:
- /**
- * @brief Initialise the kernel's inputs, output and convertion policy.
- * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
- * The input tensor is [in, out] because its TensorInfo might be
- * modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
- * The input tensor is [in, out] because its TensorInfo might be
- * modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
- * Note: U8 requires both inputs to be U8.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or
- * 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
- * even.
- * @return N/A
- */
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
- ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
-
- /**
- * @brief Static function to check if given info will lead to a valid configuration of @ref
- * CLPixelWiseDivision
- * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
- * @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 requires both inputs to be U8.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output, float scale = 1.f,
- ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
-};
-}
-#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
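
The removed CLPixelWiseDivision carried a post-division scale restricted to 1/255 or 1/2^n (0 <= n <= 15), plus an overflow ConvertPolicy and a RoundingPolicy, exactly as documented above. A minimal sketch of how it was validated and configured; the tensors are assumed, and the saturate / round-to-nearest-even policies are picked purely for illustration.

// Sketch of a pre-removal call site, grounded in the deleted declaration above.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h" // removed by this patch

void pixelwise_div_sketch(arm_compute::CLTensor &x, arm_compute::CLTensor &y,
                          arm_compute::CLTensor &out)
{
  using namespace arm_compute;
  // scale = 1 (i.e. 1/2^0), saturate on overflow, round to nearest even.
  ARM_COMPUTE_ERROR_THROW_ON(CLPixelWiseDivision::validate(
      x.info(), y.info(), out.info(), 1.f, ConvertPolicy::SATURATE,
      RoundingPolicy::TO_NEAREST_EVEN));

  CLPixelWiseDivision div;
  div.configure(&x, &y, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
  div.run();
}
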
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
deleted file mode 100644
index b49cbd873..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__
-#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__
-
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLRNNLayerEx */
-class CLRNNLayerEx : public IFunction
-{
-public:
- /** Default constructor */
- CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialize the function
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that
- * multiplies the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
- * the current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
- * as @p input
- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- */
- void configure(const ICLTensor *input, const ICLTensor *weights,
- const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info);
- /** Initialize the function
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
- * the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
- * current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
- * input
- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- CLGEMM _gemm_state_f;
- CLSaturatedArithmeticOperationKernel _add_kernel;
- CLActivationLayerKernel _activation_kernel;
- CLFullyConnectedLayer _fully_connected_kernel;
- CLCopyKernel _copy_kernel;
- CLTensor _fully_connected_out;
- CLTensor _gemm_output;
- CLTensor _add_output;
- bool _is_prepared;
-};
-}
-#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */
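
The deleted CLRNNLayerEx documents its tensor layout precisely: input [input_size, batch], weights [input_size, num_units], recurrent weights [num_units, num_units], bias [num_units], and hidden state/output [num_units, batch]. The sketch below wires those shapes together; the concrete sizes, F32 type and TANH activation are assumptions for illustration.

// Sketch of a pre-removal CLRNNLayerEx call site.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h" // removed by this patch

void rnn_sketch()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  const unsigned int input_size = 8, num_units = 16, batch = 2;

  CLTensor input, weights, recurrent, bias, hidden, output;
  input.allocator()->init(TensorInfo(TensorShape(input_size, batch), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(input_size, num_units), 1, DataType::F32));
  recurrent.allocator()->init(TensorInfo(TensorShape(num_units, num_units), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(num_units), 1, DataType::F32));
  hidden.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));

  CLRNNLayerEx rnn;
  ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
  rnn.configure(&input, &weights, &recurrent, &bias, &hidden, &output, act);
  // ... allocate and fill the tensors, then rnn.prepare() / rnn.run() as usual.
}
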
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
deleted file mode 100644
index 2090b46fa..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
-#define __ARM_COMPUTE_CLSPACETODEPTH_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLSpaceToDepthKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the tensor of the output tensor's type.
- */
-class CLSpaceToDepth : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[block_size] block size integer only
- */
- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
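
A pre-removal CLSpaceToDepth call site reduced to configure(input, output, block_size) plus run(), as sketched below with an assumed block size of 2; the stock CLSpaceToDepthLayer presumably takes over this role, though that substitution is not shown in this diff.

// Sketch of a pre-removal call site, per the deleted declaration above.
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" // removed by this patch

void space_to_depth_sketch(arm_compute::CLTensor &in, arm_compute::CLTensor &out)
{
  arm_compute::CLSpaceToDepth s2d;
  s2d.configure(&in, &out, 2 /* block_size */);
  s2d.run();
}
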
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
deleted file mode 100644
index 03edd15e6..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLStridedSlice.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class
- */
-
-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
-#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLStridedSliceKernel
- */
-class CLStridedSliceEx : public ICLSimpleFunction
-{
-public:
- /**
- * @brief Initialise the kernel's inputs and outputs
- * @param[in] input Tensor input. Data type supported:
- * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] beginData 'begin' vector of strided slice operation
- * @param[in] endData 'end' vector of strided slice operation
- * @param[in] stridesData 'strides' vector of strided slice operation
- * @param[in] beginMask If the ith bit is set, begin[i] is ignored
- * @param[in] endMask If the ith bit is set, end[i] is ignored
- * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
- * dimensionality by 1, taking on the value at index begin[i]
- * @return N/A
- */
- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-};
-}
-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
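
The removed CLStridedSliceEx took its begin/end/strides as tensors plus the three TensorFlow-style bit masks documented above. A signature-level sketch follows; tensor creation and data filling are omitted, and the zero mask values are assumptions.

// Sketch of a pre-removal call site, grounded in the deleted declaration above.
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h" // removed by this patch

void strided_slice_sketch(arm_compute::CLTensor &in, arm_compute::CLTensor &out,
                          arm_compute::CLTensor &begin, arm_compute::CLTensor &end,
                          arm_compute::CLTensor &strides)
{
  arm_compute::CLStridedSliceEx slice;
  // A set bit in a mask ignores the corresponding begin/end entry; a set bit in
  // shrinkAxisMask drops that dimension, as documented above.
  slice.configure(&in, &out, &begin, &end, &strides,
                  0 /* beginMask */, 0 /* endMask */, 0 /* shrinkAxisMask */);
  slice.run();
}
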
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
index 54a697e69..5fb102e47 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,16 +37,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
-
-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
-
-#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -54,119 +49,102 @@
namespace arm_compute
{
-class ICLTensor;
-/** Function to run the transpose convolution layer.
- *
- * @note This layer was copied in order to fix a bug computing to wrong output dimensions.
- *
- * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input
- * depending on the stride and pad info and then perform a 1x1
- * convolution pass. Input stride defines how many zeroes we should put between each element of the
- * input, pad is the amount of padding and finally a is a user
- * specified value where a < stride - 1, that increases the padding top and right of the input
- * image.
- *
- * The relation between input to output is as follows:
- * \f[
- * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x
- * \f]
- * \f[
- * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y
- * \f]
- *
- * where:
- * width_input is the size of the first input dimension.
- * height_input is the size of the second input dimension.
- * width_output is the size of the first output dimension.
- * height_output is the size of the second output dimension.
- * kernel_x and kernel_y are the convolution sizes in x and y.
- * stride_x and stride_y is the input stride of the first and second dimension.
- *
- * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
- * Therefore, it will be necessary to use the weights in the
- * reverse order to perform an actual convolution. This is achieved by using the @ref
- * CPPFlipWeightsKernel.
- *
- * This function calls the following OpenCL kernels/functions:
- *
- * -# @ref CLTransposeConvLayerUpsample
- * -# @ref CLConvolutionLayer
+/** Basic function to compute the deconvolution layer. This function calls the following OpenCL
+ * kernels/functions:
*
+ * -# @ref CLGEMMDeconvolutionLayer
+ * -# @ref CLDirectTransposeConvLayer
*/
class CLTransposeConvLayer : public IFunction
{
public:
- /** Constructor */
+ /** Default constructor */
CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayer(const CLTransposeConvLayer &) = delete;
- /** Default move constructor */
- CLTransposeConvLayer(CLTransposeConvLayer &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete;
- /** Default move assignment operator */
- CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default;
+
/** Set the input, weights, biases and output tensors.
*
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
- * and an optional 4th dimension for batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
- * Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported:
- * Same as @p input.
- * @param[out] output Output tensor. The output has the same number of dimensions
- * as the @p input.
- * @param[in] info Contains padding and policies to be used in the
- * transpose convolution, this is decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref
- * CLConvolutionLayer, specifies if the weights tensor has been
- * reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same
+ * as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this
+ * is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
*/
void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo());
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
+ * an optional 4th dimension for batch of inputs. Data types supported:
+ * QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+ * Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayer
+ * CLTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as
+ * @p input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
*
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
- * and an optional 4th dimension for batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
- * Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported:
- * Same as @p input.
- * @param[in] output Output tensor info. The output has the same number of dimensions
- * as the @p input.
- * @param[in] info Contains padding and policies to be used in the
- * transpose convolution, this is decribed in @ref PadStrideInfo.
- * @param[in] innvalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref
- * CLWeightsReshapeKernel.
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- unsigned int innvalid_right, unsigned int invalid_bottom,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
const WeightsInfo &weights_info = WeightsInfo());
+ static DeconvolutionMethod
+ get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- CLTransposeConvLayerUpsample _scale_f;
- CLConvolutionLayer _conv_f;
- CPPFlipWeightsKernel _flip_weights;
- CLTensor _scaled_output;
- ICLTensor *_original_weights;
- CLTensor _weights_flipped;
- bool _is_prepared;
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ std::unique_ptr<IFunction> _function;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
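
After this rewrite CLTransposeConvLayer no longer runs its own upsample-plus-convolution pipeline; it is a thin dispatcher that owns a std::unique_ptr<IFunction> and chooses between CLDirectTransposeConvLayer and CLGEMMDeconvolutionLayer through get_deconvolution_method() (the old upsample helper is deleted just below). Call sites keep the same configure() shape. A sketch with assumed shapes: input 4x4x2, 3x3 kernels, stride 2, so the 9x9x3 output follows from (in - 1) * stride + kernel.

// Sketch of a caller of the rewritten CLTransposeConvLayer (shapes assumed).
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"

void transpose_conv_sketch()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor input, weights, bias, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 3U), 1, DataType::F32)); // [w, h, IFM, OFM]
  bias.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(9U, 9U, 3U), 1, DataType::F32));

  CLTransposeConvLayer deconv;
  const PadStrideInfo deconv_info(2, 2, 0, 0); // stride 2, no padding
  deconv.configure(&input, &weights, &bias, &output, deconv_info,
                   0 /* invalid_right */, 0 /* invalid_bottom */);
  // Internally either CLDirectTransposeConvLayer or CLGEMMDeconvolutionLayer is
  // instantiated, as selected by get_deconvolution_method().
  // ... allocate and fill the tensors, then deconv.run().
}
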
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
deleted file mode 100644
index 7570fe76d..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */
-class CLTransposeConvLayerUpsample : public IFunction
-{
-public:
- /** Default constructor */
- CLTransposeConvLayerUpsample();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete;
- /** Allow instances of this class to be moved */
- CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default;
- /** Allow instances of this class to be moved */
- CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default;
- /** Default destructor */
- virtual ~CLTransposeConvLayerUpsample() = default;
-
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] inner_border The number of zeros added to right and top edges of the input.
- * @param[in] info Contains padding and policies to be used in the deconvolution.
- */
- void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
- const PadStrideInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayerUpsample
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] inner_border The number of zeros added to right and top edges of the input.
- * @param[in] info Contains padding and policies to be used in the deconvolution.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const BorderSize &inner_border, const PadStrideInfo &info);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLTransposeConvLayerUpsampleKernel _upsample;
- ICLTensor *_output;
-};
-}
-#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
deleted file mode 100644
index 666afef4b..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
-#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
-
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref CPPUpsample */
-class CPPUpsampleEx : public ICPPSimpleFunction
-{
-public:
- /** Configure the upsample CPP kernel
- *
- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] info Padding information
- */
- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
-};
-}
-#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */
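
The deleted CPPUpsampleEx was a thin wrapper documented as running CPPUpsample; its configure() takes the two tensors plus a PadStrideInfo describing the zero-insertion stride. A pre-removal sketch, with the stride-2, zero-padding values chosen only for illustration:

// Sketch of a pre-removal call site, per the deleted declaration above.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" // removed by this patch

void upsample_sketch(arm_compute::ITensor *in, arm_compute::ITensor *out)
{
  arm_compute::CPPUpsampleEx upsample;
  upsample.configure(in, out, arm_compute::PadStrideInfo(2, 2, 0, 0)); // stride-2 zero insertion
  upsample.run();
}
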
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
index 49504fde3..3fad230f1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -18,20 +18,13 @@
#include <arm_compute/runtime/NEON/functions/NEActivationLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
-#include <arm_compute/runtime/NEON/functions/NECast.h>
-#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
-#include <arm_compute/runtime/NEON/functions/NEPReLU.h>
-#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h>
#include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
-#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
-#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h>
-#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__
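
With these includes gone, anything that picked up the NEON Ex functions through this umbrella header needs the corresponding stock ARM Compute Library headers instead. The sketch below lists plausible replacements; the stock header paths are assumptions based on upstream ACL and are not named anywhere in this patch.

// Hypothetical migration for a caller that relied on NEFunctionsEx.h
// (stock header paths assumed from upstream ACL, not taken from this patch):
#include <arm_compute/runtime/NEON/functions/NECast.h>              // was the Ex NECast
#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h> // was NEDepthToSpaceLayerEx
#include <arm_compute/runtime/NEON/functions/NEPReluLayer.h>        // was NEPReLU
#include <arm_compute/runtime/NEON/functions/NERNNLayer.h>          // was NERNNLayerEx
#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h> // was NESpaceToDepthLayerEx
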
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
deleted file mode 100644
index f0f0d8114..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NECAST_H__
-#define __ARM_COMPUTE_NECAST_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */
-class NECast : public INESimpleFunctionNoBorder
-{
-public:
- /** Configure the kernel.
- *
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[out] output Destination tensor with the same dimensions of input. Data type supported:
- * U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- */
- void configure(const ITensor *input, ITensor *output,
- SubDataType input_subtype = SubDataType::NONE);
- /** Static function to check if given info will lead to a valid configuration of @ref NECast
- *
- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype = SubDataType::NONE);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NECAST_H__ */
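
The removed Ex NECast derived its behaviour from the output tensor's data type, with an optional SubDataType hint for the input; upstream's NECast (which takes a ConvertPolicy instead) is presumably the replacement, though that is not shown here. A pre-removal sketch, grounded only in the deleted declaration:

// Sketch of a pre-removal call site of the Ex NECast declared above.
#include "arm_compute/runtime/NEON/functions/NECast.h" // the Ex variant removed by this patch

void cast_sketch(arm_compute::ITensor *in, arm_compute::ITensor *out)
{
  arm_compute::NECast cast;
  cast.configure(in, out, arm_compute::SubDataType::NONE); // target type taken from out's TensorInfo
  cast.run();
}
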
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
deleted file mode 100644
index 005d85add..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */
-class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEDepthToSpaceLayerEx.
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape x value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */
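
A pre-removal NEDepthToSpaceLayerEx call site reduced to a single configure(input, output, block_shape) on a rank-4 tensor, as sketched below with an assumed block shape of 2:

// Sketch of a pre-removal call site, per the deleted declaration above.
#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" // removed by this patch

void depth_to_space_sketch(arm_compute::ITensor *in, arm_compute::ITensor *out)
{
  arm_compute::NEDepthToSpaceLayerEx d2s;
  d2s.configure(in, out, 2 /* block_shape */); // rank-4 input, see the deleted docs above
  d2s.run();
}
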
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
deleted file mode 100644
index 27a38e982..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
-#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to perform negative on an input tensor. */
-class NENegLayer : public INESimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input Input tensor. Data types supported: F16/F32/S32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */
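
The wrapper deleted above duplicates the negation function that upstream Compute Library already exposes. A minimal usage sketch, assuming a stock build that ships NENegLayer in NEElementwiseUnaryLayer.h; the tensor shape and the fill step are placeholders:

```cpp
#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
  // Output type matches the input type, as the deleted header also required.
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));

  NENegLayer neg;
  neg.configure(&input, &output); // validate() takes the two ITensorInfo pointers instead

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  neg.run();
  return 0;
}
```
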
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
index 39c57eb70..56548a479 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -46,7 +46,7 @@
#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
#include "arm_compute/runtime/Tensor.h"
@@ -164,7 +164,7 @@ private:
MemoryGroup _memory_group;
NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
NEQuantizationSymmetricKernel _quant_input_kernel;
- NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
+ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
NEMultiplyScaleFactorKernel _multiply_scale_kernel;
NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
Tensor _reshape_weights_output;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
deleted file mode 100644
index d844513c9..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following
- * NEON kernels if the DOT product instruction is not available:
- *
- * -# @ref NEGEMMInterleave4x4Kernel
- * -# @ref NEGEMMTranspose1xWKernel
- * -# @ref NEGEMMLowpMatrixMultiplyKernel
- * -# @ref NEGEMMLowpOffsetContributionKernel
- * -# @ref NEActivationLayer
- *
- * otherwise if the DOT product instruction is available:
- *
- * -# @ref NEGEMMLowpOffsetContributionKernel
- *
-*/
-class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction
-{
-public:
- /** Constructor */
- NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move constructor */
- NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move assignment operator */
- NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Initialise the kernel's inputs, output
- *
- * @note GEMM_LOWP: low precision GEMM kernel
- * This kernel performs the following computations:
- *
- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
- * -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
- * -# Compute the matrix product of the resulting a * b in int32.
- *
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
- * QASYMM8/QASYMM8_SIGNED otherwise
- *
- * @param[in] a First input tensor (Matrix A). Data type supported:
- * QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
- * S32
- * @param[out] output Output tensor. Data type supported: Data type supported:
- * S32/QASYMM8/QASYMM8_SIGNED
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and
- * if the reshape of matrix B should be executed only for the first run
- */
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output,
- const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEGEMMLowpMatrixMultiplyCoreEx
- *
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
- * QASYMM8/QASYMM8_SIGNED otherwise
- *
- * @param[in] a First input tensor info (Matrix A). Data type supported:
- * QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
- * supported: S32
- * @param[in] output Output tensor info. Data type supported: Data type supported:
- * S32/QASYMM8/QASYMM8_SIGNED
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and
- * if the reshape of matrix B should be executed only for the first run
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- NEGEMMAssemblyDispatch _asm_glue;
- std::unique_ptr<INEKernel> _mm_kernel;
- std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
- std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
- NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
- NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
- NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
- NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
-
- Tensor _vector_sum_col;
- Tensor _vector_sum_row;
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _mm_result_s32;
- Tensor _signed_a;
- Tensor _signed_output;
- const ITensor *_original_b;
- int32_t _a_offset;
- int32_t _b_offset;
-
- bool _run_vector_matrix_multiplication;
- bool _assembly_path;
- bool _fused_assembly_path;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _fuse_output_stage;
- bool _flip_signedness;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
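
The computation the deleted header describes (lift QASYMM8 values to int32, add the per-matrix offsets, accumulate the product in int32) written out as an unoptimized reference; the output-stage requantization is omitted, and the row-major layout and function name are illustrative, not the library implementation:

```cpp
#include <cstdint>
#include <vector>

// a is M x K, b is K x N, both row-major QASYMM8; the result is the raw S32
// accumulator described in the removed doxygen.
std::vector<int32_t> gemmlowp_s32(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                  int M, int K, int N, int32_t a_offset, int32_t b_offset)
{
  std::vector<int32_t> out(static_cast<size_t>(M) * N, 0);
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n)
    {
      int32_t acc = 0;
      for (int k = 0; k < K; ++k)
        acc += (static_cast<int32_t>(a[m * K + k]) + a_offset) *
               (static_cast<int32_t>(b[k * N + n]) + b_offset);
      out[static_cast<size_t>(m) * N + n] = acc;
    }
  return out;
}
```
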
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
deleted file mode 100644
index ca8413352..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEPRELU_H__
-#define __ARM_COMPUTE_NEPRELU_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEPReLUKernel */
-class NEPReLU : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input. Data types supported: QASYMM8/F32.
- * @param[in] alpha. Data types supported: Same as @p input.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEPRELU_H__ */
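
What the deleted NEPReLU computes per element, as a standalone reference; the modulo broadcast of alpha is a simplification of the kernel's NHWC broadcasting rules, and the function name is illustrative:

```cpp
#include <cstddef>
#include <vector>

// Element-wise PReLU: out = x when x >= 0, alpha * x otherwise. alpha is
// cycled via modulo here to stand in for per-channel broadcasting.
std::vector<float> prelu(const std::vector<float> &x, const std::vector<float> &alpha)
{
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    out[i] = x[i] >= 0.f ? x[i] : alpha[i % alpha.size()] * x[i];
  return out;
}
```
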
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
deleted file mode 100644
index 8a7b17946..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__
-#define __ARM_COMPUTE_NERNNLAYER_EX_H__
-
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to run @ref NERNNLayerEx */
-class NERNNLayerEx : public IFunction
-{
-public:
- /** Default constructor */
- NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERNNLayerEx(const NERNNLayerEx &) = delete;
- /** Default move constructor */
- NERNNLayerEx(NERNNLayerEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERNNLayerEx &operator=(const NERNNLayerEx &) = delete;
- /** Default move assignment operator */
- NERNNLayerEx &operator=(NERNNLayerEx &&) = default;
- /** Initialize the function
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that
- * multiplies the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
- * the current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
- * as @p input
- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- */
- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights,
- const ITensor *bias, ITensor *hidden_state, ITensor *output,
- ActivationLayerInfo &info);
- /** Initialize the function
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
- * the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
- * current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
- * input
- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- NEGEMM _gemm_state_f;
- NEArithmeticAdditionKernel _add_kernel;
- NEActivationLayerKernel _activation_kernel;
- NEFullyConnectedLayer _fully_connected_kernel;
- NECopyKernel _copy_kernel;
- Tensor _fully_connected_out;
- Tensor _gemm_output;
- Tensor _add_output;
- bool _is_prepared;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */
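
The recurrence that NERNNLayerEx wires out of NEGEMM, NEArithmeticAddition, activation and copy kernels, written as one plain C++ step. tanh stands in for the configurable ActivationLayerInfo, and the row-major weight layout is an assumption for illustration:

```cpp
#include <cmath>
#include <vector>

// One vanilla RNN step: state_new = act(W * x + R * state + b); the layer also
// copies state_new into the output tensor. Shapes follow the removed doxygen:
// x is [input_size], state is [num_units], W is [num_units x input_size]
// row-major, R is [num_units x num_units] row-major, b is [num_units].
std::vector<float> rnn_step(const std::vector<float> &x, const std::vector<float> &state,
                            const std::vector<float> &W, const std::vector<float> &R,
                            const std::vector<float> &b, int input_size, int num_units)
{
  std::vector<float> next(num_units);
  for (int u = 0; u < num_units; ++u)
  {
    float acc = b[u];
    for (int i = 0; i < input_size; ++i)
      acc += W[u * input_size + i] * x[i];
    for (int j = 0; j < num_units; ++j)
      acc += R[u * num_units + j] * state[j];
    next[u] = std::tanh(acc); // the real layer applies the user-supplied activation
  }
  return next;
}
```
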
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
deleted file mode 100644
index 03ac45798..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
-#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to perform reduce operation */
-class NEReduceMeanEx : public IFunction
-{
-public:
- /** Constructor */
- NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
- * @param[in] reduction_axis Reduction axis vector.
- * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- */
- void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
- ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEReduceMeanEx
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
- * @param[in] reduction_axis Reduction axis vector.
- * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
- * @param[in] output Destination tensor. Data type supported: Same as @p input
- *
- * @return A status
- */
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
- bool keep_dims, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr};
- std::unique_ptr<Tensor[]> _reduced_outs{nullptr};
- NEReshapeLayer _reshape;
- unsigned int _reduction_ops;
- bool _keep_dims;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */
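
A reference for the reduction the deleted NEReduceMeanEx performs: the mean along one axis of a tensor viewed as [outer, axis, inner]. Reducing several axes, as the removed function allows, is repeated application; keep_dims only changes the reported shape, not the data. Plain C++, names illustrative:

```cpp
#include <cstddef>
#include <vector>

// Mean along the middle dimension of a tensor flattened as [outer, axis, inner].
std::vector<float> reduce_mean_axis(const std::vector<float> &in, std::size_t outer,
                                    std::size_t axis, std::size_t inner)
{
  std::vector<float> out(outer * inner, 0.f);
  for (std::size_t o = 0; o < outer; ++o)
    for (std::size_t a = 0; a < axis; ++a)
      for (std::size_t i = 0; i < inner; ++i)
        out[o * inner + i] += in[(o * axis + a) * inner + i];
  for (float &v : out)
    v /= static_cast<float>(axis);
  return out;
}
```
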
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
deleted file mode 100644
index 3b695fbc0..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
-#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to spatial divide a tensor. This function calls the following NEON
- * kernels/functions:
- *
- * -# @ref NEMemsetKernel
- * -# @ref NESpaceToBatchLayerKernel
- */
-class NESpaceToBatchLayerEx : public IFunction
-{
-public:
- /** Default constructor */
- NESpaceToBatchLayerEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete;
- /** Allow instances of this class to be moved */
- NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default;
- /** Allow instances of this class to be moved */
- NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default;
- /** Default destructor */
- virtual ~NESpaceToBatchLayerEx() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings,
- ITensor *output);
- /** Set the input and output tensors. (Static block shape and paddings)
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y,
- const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToBatchLayerEx
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32
- * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32
- * @param[in] output Tensor output info. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape,
- const ITensorInfo *paddings, const ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToBatchLayerEx (Static block shape and paddings)
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y,
- const Size2D &padding_left, const Size2D &padding_right,
- const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
- NEMemsetKernel _memset_kernel; /**< Memset kernel to run */
- bool _has_padding; /**< Flag to check if the output has padding */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */
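
Output-shape arithmetic for the static-shape overload documented above (block_shape_x/y plus left/right padding), as a plain sketch. The Shape4D struct and NHWC interpretation are illustrative, not the library's TensorShape handling:

```cpp
#include <cassert>

struct Shape4D
{
  int n, h, w, c; // NHWC
};

// SpaceToBatch grows the batch by block_x * block_y and shrinks the padded
// spatial dims by the block size; channels are untouched.
Shape4D space_to_batch_shape(Shape4D in, int block_x, int block_y, int pad_left_x, int pad_right_x,
                             int pad_top_y, int pad_bottom_y)
{
  const int padded_w = in.w + pad_left_x + pad_right_x;
  const int padded_h = in.h + pad_top_y + pad_bottom_y;
  assert(padded_w % block_x == 0 && padded_h % block_y == 0);
  return Shape4D{in.n * block_x * block_y, padded_h / block_y, padded_w / block_x, in.c};
}
```
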
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
deleted file mode 100644
index 9f32616f3..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
-#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** This function calls the following NEON kernels/functions:
- *
- * -# @ref NESpaceToDepthLayerKernelEx
- */
-class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToDepthLayerEx (Static block shape and paddings)
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */
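
The mapping documented by the deleted NESpaceToDepthLayerEx is the inverse of the depth-to-space sketch earlier: each B x B spatial block is folded into the channel dimension. Plain C++ for illustration, NHWC layout assumed:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Space-to-depth (NHWC): output[n, h, w, (bh*B + bw)*C + c] =
// input[n, h*B + bh, w*B + bw, c].
std::vector<float> space_to_depth_nhwc(const std::vector<float> &in, int N, int H, int W, int C,
                                       int block)
{
  assert(H % block == 0 && W % block == 0);
  const int Ho = H / block, Wo = W / block, Co = C * block * block;
  std::vector<float> out(in.size());
  for (int n = 0; n < N; ++n)
    for (int h = 0; h < H; ++h)
      for (int w = 0; w < W; ++w)
        for (int c = 0; c < C; ++c)
        {
          const int oh = h / block, bh = h % block;
          const int ow = w / block, bw = w % block;
          const int oc = (bh * block + bw) * C + c;
          const std::size_t src = ((static_cast<std::size_t>(n) * H + h) * W + w) * C + c;
          const std::size_t dst = ((static_cast<std::size_t>(n) * Ho + oh) * Wo + ow) * Co + oc;
          out[dst] = in[src];
        }
  return out;
}
```
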
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
index 408d150d0..24ff5dac9 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,16 +37,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -59,8 +57,8 @@ namespace arm_compute
{
/** Function to run the deconvolution layer.
*
- * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
- * input depending on the stride and pad info and then perfrom a 1x1
+ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
* convolution pass. Input stride defines how many zeroes we should put between each element of the
* input, pad is the amount of padding and finaly a is a user
* specified value where a < stride - 1 that increases the padding top and right of the input image.
@@ -81,21 +79,22 @@ namespace arm_compute
* kernel_x and kernel_y are the convolution sizes in x and y.
* stride_x and stride_y is the input stride of the first and second dimension.
*
- * The weights used by Transpose convolution are supposed to be the same as the ones used for
- * Convolution. Therefore, it will be necessary to use the weights in the
- * reverse order to perform an actual convolution. This is achieved by using the @ref
- * CPPFlipWeightsKernel.
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
*
* This function calls the following NEON kernels/functions:
*
- * -# @ref CPPUpsample
+ * -# @ref CPPUpsampleEx
* -# @ref NEConvolutionLayer
+ * -# @ref NEPermute
+ * -# @ref NEReverse
*
*/
class NETransposeConvLayer : public IFunction
{
public:
- /** Default constructor */
+ /** Constructor */
NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -112,37 +111,38 @@ public:
/** Set the input, weights, biases and output tensors.
*
* @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
* @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
+ * supported: Same as @p input.
* @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
- * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
+ * for F16 input.
* @param[out] output Output tensor. The output has the same number of dimensions as the @p
- * input.
+ * input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is
- * decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
*
*/
void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
const PadStrideInfo &info, unsigned int invalid_right,
unsigned int invalid_bottom);
/** Static function to check if given info will lead to a valid configuration of @ref
- * NETransposeConvLayer
+ * NETransposeConvLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
* @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
+ * supported: Same as @p input.
* @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types
- * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
* @param[in] output Output tensor info. The output has the same number of dimensions as the @p
- * input.
+ * input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is
- * decribed in @ref PadStrideInfo.
- * @param[in] innvalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
+ * described in @ref PadStrideInfo.
+ * @param[in] innvalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
*
* @return a status
*/
@@ -158,17 +158,11 @@ public:
private:
MemoryGroup _memory_group;
NEConvolutionLayer _conv_f;
- CPPUpsampleEx _upsample_f;
- CPPFlipWeightsKernel _flip_weights;
- NEPermute _permute_input;
- NEPermute _permute_weights;
- NEPermute _permute_output;
+ CPPUpsample _upsample_f;
+ NEReverse _flip_weights;
Tensor _scaled_output;
Tensor _weights_flipped;
- Tensor _permuted_input;
- Tensor _permuted_weights;
- Tensor _permuted_output;
- bool _is_nchw;
+ Tensor _flip_axis;
const ITensor *_original_weights;
ITensor *_input;
PadStrideInfo _info;
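
What "using the weights in the reverse order" means for the [width, height, IFM, OFM] weight tensor mentioned above: mirror the two spatial axes. In the patched class the same flip is delegated to NEReverse driven by an axis tensor holding {0, 1}. A plain C++ illustration, assuming a dense layout with width innermost:

```cpp
#include <cstddef>
#include <vector>

// Mirror the width and height axes of a [width, height, IFM, OFM] weight tensor.
std::vector<float> flip_weights_wh(const std::vector<float> &w, int W, int H, int IFM, int OFM)
{
  std::vector<float> out(w.size());
  for (int o = 0; o < OFM; ++o)
    for (int i = 0; i < IFM; ++i)
      for (int y = 0; y < H; ++y)
        for (int x = 0; x < W; ++x)
        {
          const std::size_t src = ((static_cast<std::size_t>(o) * IFM + i) * H + y) * W + x;
          const std::size_t dst =
              ((static_cast<std::size_t>(o) * IFM + i) * H + (H - 1 - y)) * W + (W - 1 - x);
          out[dst] = w[src];
        }
  return out;
}
```
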
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 7b6b9742b..ba42a2456 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -55,16 +55,7 @@ using namespace arm_compute;
const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
// ARMComputeEx kernels
- {"arg_op", "arg_operation.cl"},
- {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
{"binary_logical_op", "binary_logical_op.cl"},
- {"cast", "cast.cl"},
- {"cast_qasymm_in", "cast.cl"},
- {"cast_qasymm_out", "cast.cl"},
- {"comparison_op", "comparison_op.cl"},
- {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
- {"depth_to_space_nchw", "depth_to_space.cl"},
- {"depth_to_space_nhwc", "depth_to_space.cl"},
{"embedding_lookup", "embedding_lookup.cl"},
{"gather_ex", "gather_ex.cl"},
{"gather_ex_1d", "gather_ex.cl"},
@@ -74,10 +65,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"instance_normalization_ex", "instance_normalization_ex.cl"},
{"multiply_scale_factor", "multiply_scale_factor.cl"},
{"neg_tensor", "neg_tensor.cl"},
- {"permute_generic", "permute_ex.cl"},
- {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
- {"prelu", "prelu.cl"},
- {"prelu_qasymm8", "prelu_quantized.cl"},
{"quantization_symm8", "quantization_symm8.cl"},
{"reduce_min_max", "reduce_operation.cl"},
{"reduce_sum_mean", "reduce_operation.cl"},
@@ -91,29 +78,15 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"radixsort_reorder", "topkv2_radixsort.cl"},
{"topkv2_quicksort", "topkv2_quicksort.cl"},
{"scale_factor_symm8", "scale_factor.cl"},
- {"space_to_depth_nchw", "space_to_depth.cl"},
- {"space_to_depth_nhwc", "space_to_depth.cl"},
};
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
- "arg_operation.cl",
-#include "./cl_kernels/arg_operation.clembed"
- },
- {
- "cast.cl",
-#include "./cl_kernels/cast.clembed"
- },
- {
"embedding_lookup.cl",
#include "./cl_kernels/embedding_lookup.clembed"
},
{
- "depth_to_space.cl",
-#include "./cl_kernels/depth_to_space.clembed"
- },
- {
"gather_ex.cl",
#include "./cl_kernels/gather_ex.clembed"
},
@@ -150,14 +123,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/neg_tensor.clembed"
},
{
- "prelu.cl",
-#include "./cl_kernels/prelu.clembed"
- },
- {
- "prelu_quantized.cl",
-#include "./cl_kernels/prelu_quantized.clembed"
- },
- {
"quantization_symm8.cl",
#include "./cl_kernels/quantization_symm8.clembed"
},
@@ -170,10 +135,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/scale_factor.clembed"
},
{
- "space_to_depth.cl",
-#include "./cl_kernels/space_to_depth.clembed"
- },
- {
"topkv2.cl",
#include "./cl_kernels/topkv2.clembed"
},
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
deleted file mode 100644
index 03717cfe9..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform arg_max/arg_min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
- * e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as
- * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types:
- * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension
- * (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension
- * (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element
- * in the source image
- * @param[in] input_stride_w Stride of the source tensor in W dimension
- * (in bytes)
- * @param[in] input_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination image.
- * Supported data types: U32
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension
- * (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- * @param[in] axis Axis through which reduction occurs
- * @param[in] dim Dimension across the axis to be reduced.
- */
-
-__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis,
- const int dim)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int indices[4] = {
- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
- };
-
- DATA_TYPE value =
- *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
- DATA_TYPE tval = value;
- int idx = 0;
- for (int i = 1; i < dim; ++i)
- {
- indices[axis] = i;
-
-#if OP_CODE == 1 // ArgMax
- value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
- indices[2], indices[3])));
-#elif OP_CODE == 2 // ArgMin
- value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
- indices[2], indices[3])));
-#else
- return;
-
-#endif
-
- if (tval != value)
- {
- idx = indices[axis];
- tval = value;
- }
- }
-
- *((__global uint *)out.ptr) = idx;
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
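
A host-side reference for the arg_op kernel removed above: arg-max along one axis of a tensor viewed as [outer, axis, inner]; arg-min is the same loop with the comparison reversed. Plain C++, names illustrative:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Returns U32 indices, matching the kernel's output type. The index updates
// only on a strict improvement, so ties resolve to the first occurrence of
// the maximum, as in the removed kernel.
std::vector<uint32_t> arg_max_axis(const std::vector<float> &in, std::size_t outer,
                                   std::size_t axis, std::size_t inner)
{
  std::vector<uint32_t> out(outer * inner, 0);
  for (std::size_t o = 0; o < outer; ++o)
    for (std::size_t i = 0; i < inner; ++i)
    {
      float best = in[(o * axis + 0) * inner + i];
      for (std::size_t a = 1; a < axis; ++a)
      {
        const float v = in[(o * axis + a) * inner + i];
        if (v > best)
        {
          best = v;
          out[o * inner + i] = static_cast<uint32_t>(a);
        }
      }
    }
  return out;
}
```
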
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
deleted file mode 100644
index f74c1c103..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to
- * QASYMM8
- *
- * The following computations will be performed:
- *
- * -# Add offset terms to inputs
- -# Get scaled value of two inputs
- * -# Add inputs
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using
- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The number of bits to shift left of input tensors must be passed at compile time using
- * -DLEFT_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
- * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT,
- -DIN2_OFFSET,
- * -RIN2_MULT_INT and -DIN2_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
- * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and
- -DRESULT_SHIFT
- *
- * @attention The input and output data_types need to be passed at compile time using
- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time
- * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
- * @attention The inputs and output scale offset need to be passed at compile time using
- * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise
- * wrapping policy will be used.
- *
- * @param[in] in1_ptr Pointer to the source tensor.
- * Supported data types: QASYMM8
- * @param[in] in1_stride_x Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types:
- * QASYMM8
- * @param[in] in2_stride_x Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- * @param[out] out_ptr Pointer to the destination tensor.
- * Supported data types: QASYMM8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension
- * (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension
- * (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
- * tensor
- */
-__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(int, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
- VEC_DATA_TYPE(int, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
- // Get scaled value of two inputs
- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-
- VEC_DATA_TYPE(int, 16)
- left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
- VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
- VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
-
- VEC_DATA_TYPE(int, 16)
- scaled_in1_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
- VEC_DATA_TYPE(int, 16)
- scaled_in2_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
-
- // Add inputs and multiply with a multiplier smaller than 1
- VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
- VEC_DATA_TYPE(int, 16)
- out_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
- // TODO: Apply the min/max bounds to support fusing with ReLU.
- /*
- #if defined(MIN_BOUND)
- res = max(res, (uchar16)MIN_BOUND);
- #endif // defined(MIN_BOUND)
- #if defined(MAX_BOUND)
- res = min(res, (uchar16)MAX_BOUND);
- #endif // defined(MAX_BOUND)
- */
-
- // Store result
- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
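
For context, the deleted arithmetic_add_qasymm8 kernel follows the usual dequantize-add-requantize recipe, but entirely in 32-bit fixed point: each input is offset-corrected, scaled by its precomputed multiplier/shift pair, summed, then rescaled and offset into the output quantization. A rough host-side equivalent in plain C, using float arithmetic instead of the ASYMM_* fixed-point helpers (the scale/zero-point parameters are illustrative and use the usual zero-point convention, not the folded IN1_OFFSET/IN2_OFFSET macros of the kernel):

#include <stdint.h>
#include <math.h>

/* Float reference for QASYMM8 addition: dequantize both inputs, add, requantize.
 * The kernel performs the same computation with fixed-point multipliers
 * (IN1_MULT_INT/IN1_SHIFT, RESULT_MULT_INT/RESULT_SHIFT); this sketch is only
 * for checking results on the host. */
static uint8_t qasymm8_add_ref(uint8_t a, float a_scale, int a_zero,
                               uint8_t b, float b_scale, int b_zero,
                               float out_scale, int out_zero)
{
  const float a_f = ((int)a - a_zero) * a_scale;        /* dequantize input 1 */
  const float b_f = ((int)b - b_zero) * b_scale;        /* dequantize input 2 */
  long q = lrintf((a_f + b_f) / out_scale) + out_zero;  /* requantize the sum */
  if (q < 0)   q = 0;                                   /* clamp to the uint8 range */
  if (q > 255) q = 255;
  return (uint8_t)q;
}
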
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 4147a0017..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef SCALE
-#define SCALE 1.0f
-#endif
-#ifndef OFFSET
-#define OFFSET 0
-#endif
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
-/** Perform a cast operation on an input tensor.
- *
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @attention -DBOOL_INPUT : Whether type of input is bool.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
- 0, (__global DATA_TYPE_OUT *)output.ptr);
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#if defined(BOOL_INPUT)
- VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE));
- VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1);
- res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#endif // defined(BOOL_INPUT)
-
- VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on a QASYMM8 input tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of input should be given as a preprocessor argument using
- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: QASYMM8
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: F16/F32
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
-
- VSTORE(VEC_SIZE)
- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
- (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on a QASYMM8 output tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of output should be given as a preprocessor argument using
- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
-
- VSTORE(VEC_SIZE)
- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
- (__global DATA_TYPE_OUT *)output.ptr);
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
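
The two quantized cast kernels above reduce to the standard affine mapping: cast_qasymm_in dequantizes with (q - OFFSET) * SCALE, and cast_qasymm_out requantizes with x / SCALE + OFFSET. A scalar sketch in plain C for reference (the rounding and clamping below are an assumption added for safety; the kernel itself uses a plain CONVERT without explicit saturation):

#include <stdint.h>
#include <math.h>

/* Dequantize one QASYMM8 value, as cast_qasymm_in does per lane. */
static float cast_qasymm_in_ref(uint8_t q, int offset, float scale)
{
  return ((int)q - offset) * scale;
}

/* Requantize one float value, as cast_qasymm_out does per lane. */
static uint8_t cast_qasymm_out_ref(float x, int offset, float scale)
{
  long r = lrintf(x / scale + (float)offset);
  if (r < 0)   r = 0;
  if (r > 255) r = 255;
  return (uint8_t)r;
}
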
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
deleted file mode 100644
index 0285c955b..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform depth to space rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- * using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- out_index[0] = get_global_id(0); // W
- out_index[1] = get_global_id(1); // H
- out_index[2] = get_global_id(2) % Z_OUT; // C
- out_index[3] = get_global_id(2) / Z_OUT; // B
-
- in_index[0] = out_index[0] / BLOCK_SIZE;
- in_index[1] = out_index[1] / BLOCK_SIZE;
- in_index[2] = out_index[2] +
- ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
- in_index[3] = out_index[3];
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform depth to space rearrangement of tensor (NHWC)
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- * using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- out_index[0] = get_global_id(0); // C
- out_index[1] = get_global_id(1); // W
- out_index[2] = get_global_id(2) % Z_OUT; // H
- out_index[3] = get_global_id(2) / Z_OUT; // B
-
- in_index[0] = out_index[0] +
- ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
- in_index[1] = out_index[1] / BLOCK_SIZE;
- in_index[2] = out_index[2] / BLOCK_SIZE;
- in_index[3] = out_index[3];
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
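
The index arithmetic in the deleted depth_to_space kernels can be read as: each output element at (batch, channel, y, x) comes from input channel c + ((y % block) * block + x % block) * out_depth at spatial position (y / block, x / block). A host-side NCHW reference in plain C over dense row-major buffers (the kernel itself works on strided Tensor4D views, so this is only an illustration):

#include <stddef.h>

/* Reference NCHW depth_to_space: mirrors the in_index[] computation of the
 * deleted kernel, assuming dense row-major input/output buffers. */
static void depth_to_space_nchw_ref(const float *in, float *out,
                                    int batch, int out_depth, int out_h, int out_w,
                                    int block)
{
  const int in_depth = out_depth * block * block;
  const int in_h = out_h / block, in_w = out_w / block;
  for (int b = 0; b < batch; ++b)
    for (int c = 0; c < out_depth; ++c)
      for (int y = 0; y < out_h; ++y)
        for (int x = 0; x < out_w; ++x)
        {
          const int ic = c + ((y % block) * block + x % block) * out_depth;
          const int iy = y / block, ix = x / block;
          const size_t in_idx  = (((size_t)b * in_depth + ic) * in_h + iy) * in_w + ix;
          const size_t out_idx = (((size_t)b * out_depth + c) * out_h + y) * out_w + x;
          out[out_idx] = in[in_idx];
        }
}
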
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 2d0b6a299..e07a25ec9 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H
@@ -59,16 +58,219 @@
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
#define EXPAND(x) x
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each
+ * element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
+
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// The _sat variants of the convert built-in functions are not supported for floating-point
+// types, so we define aliases without _sat to work around this.
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
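
Among the helpers added above, the ROTn_m swizzles implement a lane-wise rotate-right, and the new size-1 aliases (float1, vload1, convert_int1, ...) let the same macros compile when VEC_SIZE is 1. A small standalone C model of the ROTATE semantics, useful for sanity-checking the swizzle patterns (an illustration only, not code from the patch):

#include <stdio.h>

/* Scalar model of ROTATE(x, s, n): rotate-right a vector of s lanes by n
 * positions, i.e. out[i] = in[(i - n + s) % s]. Matches e.g. ROT4_1(x) == (x).s3012. */
static void rotate_right(const int *in, int *out, int s, int n)
{
  for (int i = 0; i < s; ++i)
    out[i] = in[(i - n % s + s) % s];
}

int main(void)
{
  const int v[4] = {0, 1, 2, 3};
  int r[4];
  rotate_right(v, r, 4, 1);
  printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); /* prints 3 0 1 2, like .s3012 */
  return 0;
}
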
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
index a83b1a8a5..5f1b3f902 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,29 +37,112 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
#define ARM_COMPUTE_HELPERS_ASYMM_H
#include "helpers.h"
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+ float out_f32 = input / scale + offset;
+ uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+ return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
+ VEC_DATA_TYPE(type, size)); \
+ return res; \
+ }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+ }
+
/** Correctly-rounded-to-nearest division by a power-of-two.
*
* @param[in] size Size of vector.
*
* @return Correctly-rounded-to-nearest division by a power-of-two.
*/
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- VEC_DATA_TYPE(int, size) \
- mask = (1 << exponent) - 1; \
- const VEC_DATA_TYPE(int, size) zero = 0; \
- const VEC_DATA_TYPE(int, size) one = 1; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, x < 0); \
- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
+ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
}
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -81,9 +164,19 @@
b_64 = convert_long##size(b); \
VEC_DATA_TYPE(long, size) \
ab_64 = a_64 * b_64; \
- /* COMPMID-907 */ \
+ /* Revert COMPMID-907 */ \
+ VEC_DATA_TYPE(long, size) \
+ mask1 = 1 << 30; \
+ VEC_DATA_TYPE(long, size) \
+ mask2 = 1 - (1 << 30); \
+ VEC_DATA_TYPE(long, size) \
+ is_positive_or_zero = ab_64 >= 0; \
+ VEC_DATA_TYPE(long, size) \
+ nudge = select(mask2, mask1, is_positive_or_zero); \
+ VEC_DATA_TYPE(long, size) \
+ mask = 1ll << 31; \
VEC_DATA_TYPE(int, size) \
- ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
return select(ab_x2_high32, INT_MAX, overflow); \
}
@@ -335,9 +428,18 @@
return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
}
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) \
+ dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) \
+ DEQUANTIZE_STR(input, offset, scale, type, size)
+
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
@@ -360,11 +462,53 @@
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
+ right_shift, size); \
+ }
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
@@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)
+ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)
+ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
@@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
@@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
+
#endif // ARM_COMPUTE_HELPERS_ASYMM_H
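
The new quantize/dequantize helpers and the reworked rounding divide are easy to model on the host. A plain C sketch of the scalar behaviour (round-to-nearest-even plus saturation for quantize, round-half-away-from-zero for the power-of-two divide); this mirrors the helpers above but is not part of the patch:

#include <stdint.h>
#include <math.h>

/* quantize_qasymm8: x / scale + offset, rounded to nearest even and saturated
 * to [0, 255], like CONVERT_SAT(CONVERT_DOWN_RTE(...)). */
static uint8_t quantize_qasymm8_ref(float x, float offset, float scale)
{
  long r = lrintf(x / scale + offset);
  if (r < 0)   r = 0;
  if (r > 255) r = 255;
  return (uint8_t)r;
}

/* asymm_rounding_divide_by_POW2: divide by 2^exponent, rounding half away
 * from zero, as the rewritten macro does with select(). */
static int32_t rounding_divide_by_pow2_ref(int32_t x, int exponent)
{
  const int32_t mask = (1 << exponent) - 1;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (((x & mask) > threshold) ? 1 : 0);
}
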
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
deleted file mode 100644
index 12c8eeb79..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data
- * types : F16/F32
- * @param[in] input1_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[in] alpha_ptr Pointer to the source image. Supported Data
- * types : F16/F32
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] alpha_step_x alpha_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
- * image
- *
- * @param[out] output_ptr Pointer to the destination image. Supported
- * data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
- ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) *
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr)
- : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
- 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
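
The deleted prelu kernel is the element-wise function f(x) = x for x >= 0 and alpha * x otherwise, applied lane by lane through the select-style VSTORE above. The scalar form, for reference only:

/* PReLU per element: identity for non-negative inputs, scaled by alpha otherwise. */
static float prelu_ref(float x, float alpha)
{
  return x < 0.0f ? alpha * x : x;
}

For example, prelu_ref(-2.0f, 0.1f) yields -0.2f, while non-negative inputs pass through unchanged.
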
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
deleted file mode 100644
index a66e107d1..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-#define SUB(x, y) (x) - (y)
-
-#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \
- defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-#define SELECT_TYPE VEC_INT
-
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g.
- * -DDATA_TYPE_IN=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note Can only take uchar data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data
- * types : QASYMM8
- * @param[in] input1_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[in] alpha_ptr Pointer to the source image. Supported Data
- * types : QASYMM8
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] alpha_step_x alpha_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported
- * data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
- VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
-
- in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN));
- alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA));
-
- const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN);
- const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA);
- const VEC_FLOAT outf32 =
- select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE));
- const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global uchar *)output.ptr);
-}
-
-#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) &&
- // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
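
prelu_qasymm8 applies the same function in the quantized domain: input and alpha are each dequantized with their own offset/scale, PReLU is evaluated in float, and the result is requantized with the output offset/scale and saturated like CONVERT_SAT. A scalar host-side model in plain C (parameter names are illustrative):

#include <stdint.h>
#include <math.h>

/* Scalar model of prelu_qasymm8: dequantize, apply PReLU, requantize, saturate. */
static uint8_t prelu_qasymm8_ref(uint8_t in, int off_in, float scale_in,
                                 uint8_t alpha, int off_alpha, float scale_alpha,
                                 int off_out, float scale_out)
{
  const float x = ((int)in - off_in) * scale_in;
  const float a = ((int)alpha - off_alpha) * scale_alpha;
  const float y = x < 0.0f ? x * a : x;
  long q = lrintf(y / scale_out) + off_out;
  if (q < 0)   q = 0;
  if (q > 255) q = 255;
  return (uint8_t)q;
}
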
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
deleted file mode 100644
index eb612f834..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- * e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of the input tensor should be given as a preprocessor
- * argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- in_index[0] = get_global_id(0); // W
- in_index[1] = get_global_id(1); // H
- in_index[2] = get_global_id(2) % Z_IN; // C
- in_index[3] = get_global_id(2) / Z_IN; // B
-
- out_index[0] = in_index[0] / BLOCK_SIZE;
- out_index[1] = in_index[1] / BLOCK_SIZE;
- out_index[2] =
- in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
- out_index[3] = in_index[3];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- * e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of the input tensor should be given as a preprocessor
- * argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- in_index[0] = get_global_id(0); // C
- in_index[1] = get_global_id(1); // W
- in_index[2] = get_global_id(2) % Z_IN; // H
- in_index[3] = get_global_id(2) / Z_IN; // B
-
- out_index[0] =
- in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
- out_index[1] = in_index[1] / BLOCK_SIZE;
- out_index[2] = in_index[2] / BLOCK_SIZE;
- out_index[3] = in_index[3];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
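
The two deleted kernels above differ only in which global-ID axis maps to W/H/C; the space-to-depth index arithmetic itself is identical. For reference, a minimal plain-C++ sketch of the NCHW mapping (hypothetical helper names, not part of ARMComputeEx), assuming DEPTH_IN is the input channel count and BLOCK_SIZE the block:

```cpp
// Reference of the index mapping used by space_to_depth_nchw above, written as
// plain C++ (illustrative helper, not part of the library).
#include <cstddef>
#include <cstdio>
#include <vector>

// Input laid out as NCHW; output is NCHW with depth_out = c * block * block.
std::vector<float> space_to_depth_nchw_ref(const std::vector<float> &in, int n, int c, int h,
                                           int w, int block)
{
  const int h_out = h / block, w_out = w / block, c_out = c * block * block;
  std::vector<float> out(static_cast<size_t>(n) * c_out * h_out * w_out, 0.f);
  for (int b = 0; b < n; ++b)
    for (int ci = 0; ci < c; ++ci)
      for (int hi = 0; hi < h; ++hi)
        for (int wi = 0; wi < w; ++wi)
        {
          // Same arithmetic as the kernel: DEPTH_IN == c, BLOCK_SIZE == block.
          const int co = ci + ((hi % block) * block + wi % block) * c;
          const int ho = hi / block;
          const int wo = wi / block;
          const size_t in_off = ((static_cast<size_t>(b) * c + ci) * h + hi) * w + wi;
          const size_t out_off = ((static_cast<size_t>(b) * c_out + co) * h_out + ho) * w_out + wo;
          out[out_off] = in[in_off];
        }
  return out;
}

int main()
{
  // 1x1x4x4 input with block size 2 -> 1x4x2x2 output.
  std::vector<float> in(16);
  for (int i = 0; i < 16; ++i) in[i] = static_cast<float>(i);
  const auto out = space_to_depth_nchw_ref(in, 1, 1, 4, 4, 2);
  for (float v : out) std::printf("%.0f ", v);
  std::printf("\n");
}
```

On the 4x4 ramp each 2x2 spatial tile is folded into four output channels, which is exactly what the BLOCK_SIZE arithmetic in the kernel encodes.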
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
deleted file mode 100644
index 06eeb5b98..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
-{
- TensorShape out_shape{input_shape};
-
- out_shape.set(axis, 1);
-
- return out_shape;
-}
-} // namespace
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
- ArgOperation /*op*/)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
- DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
- output->tensor_shape().num_dimensions(),
- "Output rank should be one less than input rank");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
- "Inputs are not broadcast compatible");
-
- const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
- "output shape's size does not match axis");
-
- const auto num_dimensions = input->tensor_shape().num_dimensions();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
- return Status{};
-}
-
-} // namespace
-
-CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
-
-void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
- ArgOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
- _input = input;
- _output = output;
- _axis = axis;
-
- std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
- output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
-
- // Construct kernel and set op_code based on type of ArgOperation as specified by object op
- std::string kernel_name = "arg_op";
- int op_code = 0;
- if (op == ArgOperation::MAX)
- {
- op_code = 1;
- }
- else if (op == ArgOperation::MIN)
- {
- op_code = 2;
- }
- else
- throw std::runtime_error("Operation not supported yet");
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
- build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output_info, Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output_info->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const uint32_t axis, ArgOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-
- return Status{};
-}
-
-void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &shape_in = _input->info()->tensor_shape();
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
- _kernel.setArg<cl_int>(idx++, _axis);
- _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- // Copy the output's shape so it can be restored at the end of this method
- const TensorShape shape_out = _output->info()->tensor_shape();
- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-
- // Restore the output tensor's original shape
- _output->info()->set_tensor_shape(shape_out);
-}
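
For context on what the removed CLArgOperationKernel computed: an arg-max/arg-min reduction whose reduced axis is kept with size 1 (see inferOutputShape above) and whose result holds S32 indices. A host-side sketch of the 2-D, axis-1 case, with illustrative names only:

```cpp
// Small reference of the arg-max / arg-min semantics of the deleted kernel;
// hypothetical helper, shown only to document the behaviour.
#include <cstdint>
#include <cstdio>
#include <vector>

enum class ArgOp { MAX, MIN };

// rows x cols matrix, reduce along axis 1 (columns) -> rows x 1 result of indices.
std::vector<int32_t> arg_op_axis1(const std::vector<float> &m, int rows, int cols, ArgOp op)
{
  std::vector<int32_t> out(rows, 0);
  for (int r = 0; r < rows; ++r)
  {
    int best = 0;
    for (int c = 1; c < cols; ++c)
    {
      const float cur = m[r * cols + c];
      const float ref = m[r * cols + best];
      if ((op == ArgOp::MAX && cur > ref) || (op == ArgOp::MIN && cur < ref))
        best = c;
    }
    out[r] = best;
  }
  return out;
}

int main()
{
  const std::vector<float> m = {1.f, 5.f, 2.f,   // row 0 -> argmax 1
                                7.f, 0.f, 3.f};  // row 1 -> argmax 0
  for (int32_t i : arg_op_axis1(m, 2, 3, ArgOp::MAX)) std::printf("%d ", i);
  std::printf("\n");
}
```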
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index bb5556888..fbc76f5e1 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
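
The only change in this file (and in several kernels below) is the new support/StringSupport.h include, presumably because the support::cpp11::to_string helper used when assembling the "-DNAME=value" kernel build options now lives in that header. The pattern itself, sketched in plain C++ with std::to_string standing in for the library helper:

```cpp
// Plain C++ sketch of the build-option pattern used throughout these kernels;
// std::to_string stands in for support::cpp11::to_string here.
#include <cstdio>
#include <set>
#include <string>

int main()
{
  const int block_size = 2;
  const int depth_in = 16;
  std::set<std::string> build_opts;
  build_opts.emplace("-DDATA_TYPE=float");
  build_opts.emplace("-DBLOCK_SIZE=" + std::to_string(block_size));
  build_opts.emplace("-DDEPTH_IN=" + std::to_string(depth_in));
  for (const auto &opt : build_opts) std::printf("%s\n", opt.c_str());
}
```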
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
deleted file mode 100644
index 01ea655b4..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- _input = input;
- _output = output;
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Set kernel build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" +
- get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- // Create kernel
- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- const float scale_in = qinfo.scale;
- const int offset_in = qinfo.offset;
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options()));
- }
- else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
- {
- UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
- const float scale_in = qinfo.scale;
- const float offset_in = qinfo.offset;
-
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options()));
- }
- else
- {
- build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT");
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast", build_opts.options()));
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- } while (collapsed.slide_window_slice_3D(slice));
-}
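
The removed cast kernel dispatched to cast_qasymm_in / cast_qasymm_out variants that fold in the affine quantization parameters passed as SCALE and OFFSET. A small sketch of that arithmetic, using hypothetical helper names:

```cpp
// Sketch of the asymmetric (de)quantisation the deleted cast kernel delegated
// to its quantized variants; illustrative helpers, not library code.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

float dequantize_qasymm8(uint8_t q, float scale, int offset)
{
  return scale * (static_cast<int>(q) - offset);
}

uint8_t quantize_qasymm8(float x, float scale, int offset)
{
  const int q = static_cast<int>(std::lround(x / scale)) + offset;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main()
{
  const float scale = 0.5f;
  const int offset = 10;
  const uint8_t q = quantize_qasymm8(3.0f, scale, offset); // 3.0/0.5 + 10 = 16
  std::printf("q=%u back=%.1f\n", q, dequantize_qasymm8(q, scale, offset));
}
```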
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
deleted file mode 100644
index 389136817..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// TODO Use this validation function
-#if 0
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
- "Output width should be equal to (Input width * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
- "Output height should be equal to (Input height * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
- "Input depth should be divisible by (block size * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- output->dimension(2) != input->dimension(2) / (block_size * block_size),
- "Output depth should be equal to (Input depth / (block size * block size))");
-
- return Status{};
-}
-#endif
-} // namespace
-
-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
-{
- // DO NOTHING
-}
-
-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
- // TODO Add validation of data_layout
- _input = input;
- _output = output;
-
- // Set kernel build options
- auto layout_out = output->info()->data_layout();
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto depth = output->info()->dimension(index_depth);
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
- build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
- "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
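
The disabled validation above pins down the depth-to-space shape contract. The same relation as a tiny sketch, assuming NCHW dims (n, c, h, w) and block size b:

```cpp
// Shape relation encoded by the (disabled) validation: depth_to_space maps
// (N, C, H, W) to (N, C / (b*b), H * b, W * b), provided C % (b*b) == 0.
#include <cassert>
#include <cstdio>

int main()
{
  const int n = 1, c = 8, h = 3, w = 3, b = 2;
  assert(b >= 1 && c % (b * b) == 0);
  std::printf("out shape: %d x %d x %d x %d\n", n, c / (b * b), h * b, w * b); // 1 x 2 x 6 x 6
}
```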
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index 79f5ce065..67aaf2db6 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
deleted file mode 100644
index 235e8975d..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/ToolchainSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
- const ITensorInfo *output, const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4,
- "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3,
- "The number of dimensions for the matrix B must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 &&
- gemm_info.reinterpret_input_as_3d(),
- "The input1 tensor cannot have more than 2 dimensions if input0 "
- "has to be reinterpreted as 3D");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
- if (gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) !=
- static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
-
- if (output->total_size() != 0)
- {
- const TensorInfo tensor_info_output =
- output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1,
- ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info,
- ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output,
- input0->clone()
- ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info))
- .set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if (reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D
- // GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
- // Note: if the dot product instruction is available, the 8x2 tile has to be used
- num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor.
- // The only way to set the paddings properly is to set them explicitly through the
- // AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2]
- : input0->tensor_shape()[1];
- const int bottom_pad =
- (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) %
- num_elems_processed_per_iteration_y;
-
- // Configure window
- win = calculate_max_window(
- tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(
- *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0),
- input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(
- input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(
- output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- window_changed =
- update_window_and_padding(win, input0_access,
- input1_access) || // window used by the execute_window_loop
- update_window_and_padding(
- win_out,
- output_access); // window used to update the padding requirements of output tensor
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse =
- std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true),
- _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1,
- ICLTensor *output,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input0->info(), input1->info(), output->info(), gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d
- ? _input0->info()->num_dimensions() - 1
- : _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(),
- gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
- std::string kernel_name(" ");
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
- "-DHEIGHT_GEMM3D=" +
- support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
- "-DDEPTH_GEMM3D=" +
- support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b,
- "-DMATRIX_B_DEPTH=" +
- support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" +
- support::cpp11::to_string(num_elements_processed.x()));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" +
- support::cpp11::to_string(num_elements_processed.y()));
-
- kernel_name = "gemmlowp_mm_midgard_ex";
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
-}
-
-Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0,
- const ITensorInfo *input1,
- const ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(
- validate_and_configure_window(input0->clone().get(), input1->clone().get(),
- output->clone().get(), gemm_info, num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if (_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if (_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad =
- _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if (_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 =
- 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad =
- _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A
- // more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution
- // operation
- if (!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
- } while (window.slide_window_slice_3D(slice));
-}
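
The removed CLGEMMLowpMatrixMultiplyKernelEx multiplies signed 8-bit matrices and accumulates into 32-bit integers (S8 inputs, S32 output, per validate_arguments above); the bottom_pad expression simply rounds M up to a multiple of the per-thread Y step. A plain reference of the arithmetic itself, with illustrative names:

```cpp
// Reference of what the deleted kernel computes: C(MxN) = A(MxK) * B(KxN) with
// signed 8-bit inputs accumulated into 32-bit integers; hypothetical sketch.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int32_t> gemmlowp_ref(const std::vector<int8_t> &a, const std::vector<int8_t> &b,
                                  int m, int n, int k)
{
  std::vector<int32_t> c(static_cast<size_t>(m) * n, 0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
    {
      int32_t acc = 0;
      for (int p = 0; p < k; ++p)
        acc += static_cast<int32_t>(a[i * k + p]) * static_cast<int32_t>(b[p * n + j]);
      c[i * n + j] = acc;
    }
  return c;
}

int main()
{
  // 2x3 times 3x2 -> 2x2, accumulated in int32.
  const std::vector<int8_t> a = {1, -2, 3, 4, 5, -6};
  const std::vector<int8_t> b = {7, 8, 9, 10, -11, 12};
  for (int32_t v : gemmlowp_ref(a, b, 2, 2, 3)) std::printf("%d ", v);
  std::printf("\n");
}
```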
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 3a25987d0..3bfe3e407 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -45,6 +45,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/core/UtilsEx.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index 7fbdcdaa7..930e7c944 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
@@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
_hits = hits;
// Make _lookup_indices tensor
- _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+ _lookup_indices = support::cpp14::make_unique<CLTensor>();
_lookup_indices->allocator()->init(
TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
_lookup_indices->allocator()->allocate();
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index b45f6bb24..61c14d271 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -48,7 +48,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-
+#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
index d305896ea..6b27c9917 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -49,6 +49,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index 74f7b4158..643c8b110 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
deleted file mode 100644
index 8910a7b80..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
- const TensorShape &out_shape =
- TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
- return Status{};
-}
-} // namespace
-
-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
-
- _input = input;
- _alpha = alpha;
- _output = output;
-
- // Create kernel
- std::string kernel_name = "prelu";
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string(
- input->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string(
- alpha->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string(
- output->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string(
- input->info()->quantization_info().uniform().scale));
- build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string(
- alpha->info()->quantization_info().uniform().scale));
- build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string(
- output->info()->quantization_info().uniform().scale));
- kernel_name += "_qasymm8";
- }
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
-
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), out_shape);
-
- if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output->info(), Format::F16);
- }
- else if (input->info()->data_type() == DataType::F32 ||
- alpha->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
- Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
-
- AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input->info()->tensor_shape();
- const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_input1);
- add_3D_tensor_argument(idx, _alpha, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPReLUKernel::border_size() const
-{
- const unsigned int replicateSize =
- _output->info()->dimension(0) -
- std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
- const unsigned int border =
- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
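
For reference, the removed PReLU kernel computes out = x for x >= 0 and out = alpha * x otherwise, with alpha broadcast against the input (see the broadcast handling above). A float-only sketch; the per-channel alpha layout is an assumption chosen for the example:

```cpp
// Float reference of the PReLU the deleted kernel computed, with a per-channel
// alpha broadcast over an NHWC-style buffer; hypothetical helper names.
#include <cstdio>
#include <vector>

std::vector<float> prelu_ref(const std::vector<float> &x, const std::vector<float> &alpha,
                             int channels)
{
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i)
  {
    const float a = alpha[i % channels]; // broadcast alpha along the channel axis
    out[i] = x[i] >= 0.f ? x[i] : a * x[i];
  }
  return out;
}

int main()
{
  const std::vector<float> x = {-2.f, 1.f, -0.5f, 3.f};
  const std::vector<float> alpha = {0.1f, 0.2f}; // two channels
  for (float v : prelu_ref(x, alpha, 2)) std::printf("%.2f ", v); // -0.20 1.00 -0.05 3.00
  std::printf("\n");
}
```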
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
index 2d551f654..1a7a18cfa 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -49,6 +49,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
namespace arm_compute
{
@@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac
// Output must always be initialized
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
return Status{};
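
The hunk above swaps the accepted output type from S8 to QASYMM8_SIGNED, ACL's signed 8-bit quantized data type. A sketch of symmetric (zero-offset) signed 8-bit quantization, which is what a kernel of this name presumably performs per scale factor; the clamp range of [-127, 127] is an assumption made for illustration:

```cpp
// Illustrative symmetric int8 quantisation: q = clamp(round(x / scale)).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int8_t quantize_symm8(float x, float scale)
{
  const int q = static_cast<int>(std::lround(x / scale));
  return static_cast<int8_t>(std::min(127, std::max(-127, q)));
}

int main()
{
  const float scale = 0.05f;
  std::printf("%d %d %d\n", quantize_symm8(1.0f, scale), quantize_symm8(-1.0f, scale),
              quantize_symm8(10.0f, scale)); // 20 -20 127
}
```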
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index a98318323..06c2579f2 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
namespace
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
index ff1904abd..8d8853c81 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -48,6 +48,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/StringSupport.h"
#include <climits>
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
deleted file mode 100644
index 64fc0384e..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
- "Input batch should be equal to Output batch");
-
- auto layout_out = input->data_layout();
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
- auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
- "Output depth should be equal to (input depth * block size * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
- (input->dimension(index_height) % block_size),
- "Input height and width should be divisible by block size");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
- (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
- "Output height and width should be equal to "
- "input_height/blocksize and input_width/blocksize respectively");
-
- return Status{};
-}
-
-} // namespace
-
-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
- _input = input;
- _output = output;
-
- // Set kernel build options
- auto layout_out = input->info()->data_layout();
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto depth = input->info()->dimension(index_depth);
- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
- build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
- "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
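Note: for readers checking the shape constraints validated above, the transform the deleted CL kernel performs on-device reduces to a plain index remap. The scalar sketch below is illustrative only: it assumes NHWC layout over a flat float buffer, and the interleaving order inside the enlarged channel dimension follows the usual TensorFlow SpaceToDepth convention, which is an assumption rather than something read out of the OpenCL source.

```cpp
#include <cstddef>
#include <vector>

// Illustrative scalar space-to-depth (NHWC, float). Moves each bs x bs
// spatial block into the channel dimension, so output depth = C * bs * bs
// and output height/width = H / bs and W / bs, matching the checks above.
std::vector<float> space_to_depth_nhwc(const std::vector<float> &in, int B, int H, int W, int C,
                                       int bs)
{
  const int Ho = H / bs, Wo = W / bs, Co = C * bs * bs;
  std::vector<float> out(static_cast<std::size_t>(B) * Ho * Wo * Co);
  for (int b = 0; b < B; ++b)
    for (int y = 0; y < H; ++y)
      for (int x = 0; x < W; ++x)
        for (int c = 0; c < C; ++c)
        {
          // The sub-pixel position inside the block selects the output channel group.
          const int co = ((y % bs) * bs + (x % bs)) * C + c;
          out[((static_cast<std::size_t>(b) * Ho + y / bs) * Wo + x / bs) * Co + co] =
              in[((static_cast<std::size_t>(b) * H + y) * W + x) * C + c];
        }
  return out;
}
```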
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
deleted file mode 100644
index 61999cbd4..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel()
- : _input(nullptr), _output(nullptr), _inner_border(), _info()
-{
-}
-
-Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input,
- const ITensorInfo *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-
- const DataLayout data_layout = input->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1,
- "inner_border_right must be smaller that stride_x");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1,
- "inner_border_top must be smaller that stride_y");
-
- return Status{};
-}
-
-void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _input = input;
- _output = output;
- _inner_border = inner_border;
- _info = info;
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate(
- input->info(), output->info(), inner_border, info));
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- const int out_start_x = _info.pad_left();
- const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right -
- _info.pad_right() + _info.stride().first - 1;
- const int out_step_x = _info.stride().first;
-
- const int out_start_y = _inner_border.top + _info.pad_top();
- const int out_end_y =
- _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
- const int out_step_y = _info.stride().second;
-
- switch (data_layout)
- {
- case DataLayout::NCHW:
- {
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- Window slice_out = collapsed.first_slice_window_3D();
- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (collapsed.slide_window_slice_3D(slice_in) &&
- collapsed.slide_window_slice_3D(slice_out));
- break;
- }
- case DataLayout::NHWC:
- {
- // NOTE: not collapsing in NHWC
- Window slice_out = window.first_slice_window_3D();
- slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data layout");
- }
-}
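Note: the window arithmetic in run() above (start = padding, step = stride) amounts to scattering each input pixel into every stride-th output location, with the gaps left at their pre-filled value. A minimal scalar sketch of that scatter, assuming NCHW layout, a flat float buffer, and no inner border (all assumptions made for the example):

```cpp
#include <cstddef>

// Illustrative scatter for transpose-convolution upsampling (NCHW, float).
// Each input pixel lands at (pad + index * stride) in the output; every other
// output position keeps whatever value the buffer was pre-filled with.
void upsample_scatter_nchw(const float *in, float *out, int C, int Hi, int Wi, int Ho, int Wo,
                           int stride_x, int stride_y, int pad_left, int pad_top)
{
  for (int c = 0; c < C; ++c)
    for (int y = 0; y < Hi; ++y)
      for (int x = 0; x < Wi; ++x)
      {
        const int oy = pad_top + y * stride_y;
        const int ox = pad_left + x * stride_x;
        if (oy < Ho && ox < Wo)
          out[(static_cast<std::size_t>(c) * Ho + oy) * Wo + ox] =
              in[(static_cast<std::size_t>(c) * Hi + y) * Wi + x];
      }
}
```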
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
deleted file mode 100644
index 648afb304..000000000
--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {}
-
-bool CPPUpsampleKernelEx::is_parallelisable() const { return false; }
-
-void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _input = input;
- _output = output;
- _info = info;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICPPKernel::configure(win);
-}
-
-void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- // Initialize the output buffer with the appropriate fill value
- const int width_scaled = _output->info()->dimension(0);
- const int height_scaled = _output->info()->dimension(1);
- const int stride_x = _info.stride().first;
- const int stride_y = _info.stride().second;
- const int start_x = _info.pad_left();
- const int start_y = _info.pad_top();
- const int end_y = height_scaled - _info.pad_bottom();
- const int end_x = width_scaled - _info.pad_right();
- const size_t element_size = _input->info()->element_size();
-
- // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset
- const uint8_t fill_value =
- _output->info()->data_type() == DataType::QASYMM8
- ? utility::clamp<uint8_t>(_output->info()->quantization_info().uniform().offset)
- : 0;
- // Filling a value different from 0 works only for the QASYMM8 data type, since we are filling
- // 1-byte values in a buffer of uint8_t
- std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value);
-
- // Create window
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x));
- window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y));
-
- // Create iterators
- Iterator in(_input, window);
- Iterator out(_output, window_out);
-
- execute_window_loop(
- window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
-}
-} // namespace arm_compute
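Note: the one data-type-specific detail in the deleted CPU upsample kernel is the pre-fill value computed above: for QASYMM8 the logical zero is the quantization offset, not the byte 0. A small sketch of that choice; the helper name and its bool/zero-point parameters are inventions for the example, not the ACL API:

```cpp
#include <algorithm>
#include <cstdint>

// Byte used to pre-fill the upsampled buffer before the scatter: plain zero
// for float/integer types, the clamped zero-point for QASYMM8 tensors.
uint8_t upsample_fill_value(bool is_qasymm8, int32_t zero_point)
{
  if (!is_qasymm8)
    return 0;
  return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(zero_point, 0), 255));
}
```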
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
deleted file mode 100644
index fbb9dbca9..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NECastKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8,
- DataType::QASYMM8, DataType::U32,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL &&
- input->data_type() != DataType::U8);
-
- if (output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
- DataType::QASYMM8, DataType::U32,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
-
- // NECastKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
- return std::make_tuple(Status{}, win);
-}
-
-typedef struct bool8x16
-{
- uint8x16_t val;
-} bool8x16_t;
-
-static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; }
-
-template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; }
-template <> inline uint8x16_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- return vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-}
-
-template <> inline uint32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const uint32x4x4_t ret = {{
- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))),
- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const int32x4x4_t ret = {{
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
- }};
-
- return ret;
-}
-
-template <> inline uint32x4x4_t vcast(const uint8x16_t &v)
-{
- const uint32x4x4_t ret = {{
- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))),
- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const uint8x16_t &v)
-{
- const int32x4x4_t ret = {{
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const uint8x16_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const int32x4x4_t &v)
-{
- // Saturate cast
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))),
- vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3]))));
-}
-
-template <> inline uint32x4x4_t vcast(const int32x4x4_t &v)
-{
- // Saturate cast
- const uint32x4x4_t ret = {{
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const int32x4x4_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]),
- vcvtq_f32_s32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const uint32x4x4_t &v)
-{
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))),
- vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3]))));
-}
-
-template <> inline int32x4x4_t vcast(const uint32x4x4_t &v)
-{
- const int32x4x4_t ret = {{
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const uint32x4x4_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]),
- vcvtq_f32_u32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const float32x4x4_t &v)
-{
- // Saturate cast
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])),
- vqmovun_s32(vcvtq_s32_f32(v.val[1])))),
- vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])),
- vqmovun_s32(vcvtq_s32_f32(v.val[3])))));
-}
-
-template <> inline uint32x4x4_t vcast(const float32x4x4_t &v)
-{
- const uint32x4x4_t ret = {{
- vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]),
- vcvtq_u32_f32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const float32x4x4_t &v)
-{
- const int32x4x4_t ret = {{
- vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]),
- vcvtq_s32_f32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <typename T> struct cast_vector;
-template <> struct cast_vector<bool>
-{
- using type = bool8x16_t;
-};
-template <> struct cast_vector<uint8_t>
-{
- using type = uint8x16_t;
-};
-template <> struct cast_vector<uint32_t>
-{
- using type = uint32x4x4_t;
-};
-template <> struct cast_vector<int32_t>
-{
- using type = int32x4x4_t;
-};
-template <> struct cast_vector<float>
-{
- using type = float32x4x4_t;
-};
-
-template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v)
-{
- wrapper::vstore(ptr, v.val[0]);
- wrapper::vstore(ptr + 4, v.val[1]);
- wrapper::vstore(ptr + 8, v.val[2]);
- wrapper::vstore(ptr + 12, v.val[3]);
-}
-
-template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v)
-{
- wrapper::vstore(ptr, v);
-}
-
-inline bool8x16_t vloadq(const bool *ptr)
-{
- bool8x16_t ret;
- ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr));
- return ret;
-}
-
-template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr)
-{
- return wrapper::vloadq(ptr);
-}
-
-template <> inline typename cast_vector<bool>::type load_input(const bool *ptr)
-{
- return vloadq(ptr);
-}
-
-template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr)
-{
- return vld4q_u32(ptr);
-}
-
-template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr)
-{
- return vld4q_s32(ptr);
-}
-
-template <> inline typename cast_vector<float>::type load_input(const float *ptr)
-{
- return vld4q_f32(ptr);
-}
-
-template <typename T> inline T get_value(const T *ptr) { return *ptr; }
-
-template <> inline bool get_value(const bool *ptr)
-{
- bool ret = (*ptr != 0);
- return ret;
-}
-
-template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window)
-{
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr());
-
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- using from_vector = typename cast_vector<FromT>::type;
- const from_vector vin = load_input(in_ptr + x);
-
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- using to_vector = typename cast_vector<uint8_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::QASYMM8:
- {
- using to_vector = typename cast_vector<float>::type;
- const UniformQuantizationInfo &qinfo_out =
- output->info()->quantization_info().uniform();
- const auto vf = vcast<to_vector, from_vector>(vin);
- const auto vout = vquantize(vf, qinfo_out);
- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::U32:
- {
- using to_vector = typename cast_vector<uint32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::S32:
- {
- using to_vector = typename cast_vector<int32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::F32:
- {
- using to_vector = typename cast_vector<float>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- FromT val = get_value(in_ptr + x);
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
- break;
- }
- case DataType::QASYMM8:
- {
- const QuantizationInfo &qinfo_out = output->info()->quantization_info();
- const auto qval =
- quantize_qasymm8(static_cast<float>(val), qinfo_out, rounding_policy);
- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval;
- break;
- }
- case DataType::U32:
- {
- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
- break;
- }
- case DataType::S32:
- {
- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
- break;
- }
- case DataType::F32:
- {
- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
- },
- in, out);
-}
-
-void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window)
-{
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
- const auto &qinfo_in = input->info()->quantization_info().uniform();
- const auto &qinfo_out = output->info()->quantization_info().uniform();
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr());
-
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- using from_vector = typename cast_vector<float>::type;
- const auto vf = wrapper::vloadq(in_ptr + x);
- const auto vin = vdequantize(vf, qinfo_in);
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- using to_vector = typename cast_vector<uint8_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::QASYMM8:
- {
- using to_vector = typename cast_vector<float>::type;
- const auto vf = vcast<to_vector, from_vector>(vin);
- const auto vout = vquantize(vf, qinfo_out);
- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::U32:
- {
- using to_vector = typename cast_vector<uint32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::S32:
- {
- using to_vector = typename cast_vector<int32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::F32:
- {
- using to_vector = typename cast_vector<float>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- qasymm8_t qval_in = *(in_ptr + x);
- const auto val = dequantize_qasymm8(qval_in, qinfo_in);
-
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
- break;
- }
- case DataType::QASYMM8:
- {
- const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy);
- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out;
- break;
- }
- case DataType::U32:
- {
- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
- break;
- }
- case DataType::S32:
- {
- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
- break;
- }
- case DataType::F32:
- {
- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
- },
- in, out);
-}
-} // namespace
-
-NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE)
-{
-}
-
-void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype));
-
- _input = input;
- _output = output;
- _input_subtype = input_subtype;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- INEKernel::configure(std::get<1>(win_config));
-}
-
-Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype));
- ARM_COMPUTE_RETURN_ON_ERROR(
- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
- return Status{};
-}
-
-void NECastKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- switch (_input->info()->data_type())
- {
- case DataType::U8:
- if (_input_subtype == SubDataType::BOOL)
- {
- run_cast<bool>(_input, _output, window);
- }
- else
- {
- run_cast<uint8_t>(_input, _output, window);
- }
- break;
- case DataType::QASYMM8:
- run_cast_qasymm8(_input, _output, window);
- break;
- case DataType::U32:
- run_cast<uint32_t>(_input, _output, window);
- break;
- case DataType::S32:
- run_cast<int32_t>(_input, _output, window);
- break;
- case DataType::F32:
- run_cast<float>(_input, _output, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-} // namespace arm_compute
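Note: all of the vector paths above reduce to one scalar rule: dequantize QASYMM8 inputs, quantize QASYMM8 outputs, and otherwise static_cast the value. A hedged scalar sketch of the affine mapping real = scale * (q - offset) and its inverse; the function names and parameters are placeholders, while the real kernel pulls scale and offset from each tensor's QuantizationInfo:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar reference for the QASYMM8 ends of the cast: the affine mapping
// real = scale * (q - offset), and its rounded, clamped inverse.
float dequantize_u8(uint8_t q, float scale, int32_t offset)
{
  return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

uint8_t quantize_u8(float value, float scale, int32_t offset)
{
  const int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
  return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(q, 0), 255));
}
```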
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
deleted file mode 100644
index 95e269dee..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2);
-
- const DataLayout data_layout = input->data_layout();
- const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
- 0);
- // Validate output if initialized
- if (output->total_size() != 0)
- {
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
- (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
- (block_shape * input->tensor_shape()[idx_height]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx()
- : _input(nullptr), _output(nullptr), _block_shape()
-{
-}
-
-void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
-
- _input = input;
- _output = output;
- _block_shape = block_shape;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
- ICPPKernel::configure(win);
-}
-
-Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
- return Status{};
-}
-
-void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- const int idx_channel =
- get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const int depth_size = _input->info()->dimension(idx_channel);
- const int r = (depth_size / (_block_shape * _block_shape));
- const int element_size = _input->info()->element_size();
-
- Window slice_out = window.first_slice_window_3D();
-
- // The slice_out slice does not move
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Main loop for NCHW and NHWC
- if (_input->info()->data_layout() == DataLayout::NCHW)
- {
- Window slice_in = window.first_slice_window_2D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in,
- [&](const Coordinates &id) {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{out_x, out_y, z, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_2D(slice_in));
- }
- else
- {
- Window slice_in = window.first_slice_window_3D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in,
- [&](const Coordinates &id) {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{z, out_x, out_y, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_3D(slice_in));
- }
-}
-} // namespace arm_compute
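Note: the NCHW branch of run() above encodes the coordinate mapping out_x = x * bs + (c / r) % bs, out_y = y * bs + (c / r) / bs with r = C / (bs * bs). The sketch below restates that mapping as a standalone scalar loop; the flat float buffer and the function name are assumptions for illustration only:

```cpp
#include <cstddef>
#include <vector>

// Illustrative scalar depth-to-space (NCHW, float): the channel index c is
// split into an output channel (c % r) and a sub-pixel position (c / r)
// inside each bs x bs tile of the enlarged output plane.
std::vector<float> depth_to_space_nchw(const std::vector<float> &in, int B, int C, int H, int W,
                                       int bs)
{
  const int r = C / (bs * bs); // output channel count
  const int Ho = H * bs, Wo = W * bs;
  std::vector<float> out(static_cast<std::size_t>(B) * r * Ho * Wo);
  for (int b = 0; b < B; ++b)
    for (int c = 0; c < C; ++c)
      for (int y = 0; y < H; ++y)
        for (int x = 0; x < W; ++x)
        {
          const int co = c % r;
          const int ox = x * bs + (c / r) % bs;
          const int oy = y * bs + (c / r) / bs;
          out[((static_cast<std::size_t>(b) * r + co) * Ho + oy) * Wo + ox] =
              in[((static_cast<std::size_t>(b) * C + c) * H + y) * W + x];
        }
  return out;
}
```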
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
deleted file mode 100644
index 200fc4f87..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <cstdint>
-#include <map>
-#include <string>
-
-namespace arm_compute
-{
-class Coordinates;
-
-namespace
-{
-template <ElementWiseUnaryEx op, typename ScalarType>
-inline ScalarType elementwise_op_scalar(const ScalarType &a)
-{
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- return -a;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <ElementWiseUnaryEx op, typename VectorType>
-inline VectorType elementwise_op(const VectorType &a)
-{
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- return wrapper::vneg(a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <ElementWiseUnaryEx op, typename ScalarType>
-void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win,
- [&](const Coordinates &) {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
- int x = window_start_x;
- for (; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(output_ptr + x,
- elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
- }
- for (; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
- }
- },
- input, output);
-}
-
-template <ElementWiseUnaryEx op>
-std::function<void(const ITensor *input, ITensor *output, const Window &window)>
-configure_func(const ITensor *input, ITensor *output)
-{
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(input->info()->data_type()) + "_";
- function_to_call += string_from_data_type(output->info()->data_type());
-
- static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *>
- map_function = {
- {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>},
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- auto it = map_function.find(function_to_call);
-
- if (it != map_function.end())
- {
- auto func = it->second;
- return [func](const ITensor *input, ITensor *output, const Window &window) {
- func(input, output, window);
- };
- }
- return nullptr;
-}
-} // namespace
-
-NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx()
- : _function(nullptr), _input(nullptr), _output(nullptr)
-{
-}
-
-void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input,
- ITensor *output)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Configure kernel window
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info());
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
-
- Window win = calculate_max_window(valid_region);
-
- _input = input;
- _output = output;
-
- INEKernel::configure(win);
-
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- _function = configure_func<ElementWiseUnaryEx::NEG>(input, output);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input,
- const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32,
- DataType::S32);
-
- // Validate in case of configured output
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- }
-
- return Status{};
-}
-
-Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
- return Status{};
-}
-
-void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_function == nullptr);
- _function(_input, _output, window);
-}
-} // namespace arm_compute
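Note: the only operation the deleted unary kernel supports is NEG, and its structure is the usual NEON pattern of a vector main loop plus a scalar tail for the leftover elements. A self-contained float-only sketch (builds only on an Arm target with NEON; the function name is made up for the example):

```cpp
#include <arm_neon.h>

// Vectorized negation with a scalar tail, mirroring the main-loop /
// left-over split the deleted kernel uses for its float path.
void neg_f32(const float *in, float *out, int n)
{
  int i = 0;
  for (; i <= n - 4; i += 4)
    vst1q_f32(out + i, vnegq_f32(vld1q_f32(in + i)));
  for (; i < n; ++i)
    out[i] = -in[i];
}
```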
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
deleted file mode 100644
index 641641b5a..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-namespace
-{
-
-/** Conditional element-wise operations */
-enum class ConditionalOperation
-{
- PRELU, /**< (x * y) for x < 0, x for x >= 0 */
-};
-
-template <ConditionalOperation op, typename ScalarType>
-inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch (op)
- {
- case ConditionalOperation::PRELU:
- res = a < 0 ? a * b : a;
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ConditionalOperation op>
-inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
- QuantizationInfo qinfo)
-{
- return quantize_qasymm8(elementwise_conditional_op_scalar<op>(a, b), qinfo,
- RoundingPolicy::TO_NEAREST_UP);
-}
-
-template <ConditionalOperation op, typename VectorType>
-inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
-{
- VectorType res = {0, 0, 0, 0};
- VectorType const_0 = {0, 0, 0, 0};
-
- switch (op)
- {
- case ConditionalOperation::PRELU:
- res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ConditionalOperation op>
-inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- float32x4x4_t out = {{
- elementwise_conditional_op<op>(a.val[0], b.val[0]),
- elementwise_conditional_op<op>(a.val[1], b.val[1]),
- elementwise_conditional_op<op>(a.val[2], b.val[2]),
- elementwise_conditional_op<op>(a.val[3], b.val[3]),
- }};
- return out;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
- const ScalarType &broadcast_value,
- const bool reorder)
-{
- VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
- reorder ? a : broadcast_vector);
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr,
- const ScalarType *input2_ptr, ScalarType *output_ptr)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b));
- }
- return x;
-}
-
-template <ConditionalOperation op>
-inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x,
- int window_step_x, const uint8_t *input1_ptr,
- const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2,
- float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get inputs and compute output
- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
- const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x,
- int window_step_x,
- const ScalarType *non_broadcast_input_ptr,
- const ScalarType &broadcast_value,
- ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x,
- elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <ConditionalOperation op>
-inline int elementwise_conditional_op_quantized_broadcast_loop(
- int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr,
- float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast,
- float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af =
- load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af,
- reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out,
- const Window &window)
-{
- elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>,
- &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>,
- &elementwise_conditional_op_loop<op, ScalarType, VectorType>);
-}
-
-template <ConditionalOperation op>
-void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out,
- const Window &window)
-{
- elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>,
- &elementwise_conditional_op_quantized_broadcast_loop<op>,
- &elementwise_conditional_op_quantized_loop<op>);
-}
-} // namespace
-
-NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info()));
-
- // Configure kernel window
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
-
- Window win = calculate_max_window(valid_region);
-
- _input = input;
- _alpha = alpha;
- _output = output;
- INEKernel::configure(win);
-}
-
-void NEPReLUKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- if (_input->info()->data_type() == DataType::F32)
- {
- elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha,
- _output, window);
- }
- else if (_input->info()->data_type() == DataType::QASYMM8)
- {
- elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output,
- window);
- }
- else
- {
- ARM_COMPUTE_ERROR("Wrong Type");
- }
-}
-
-Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
- const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output);
-
- const TensorShape out_shape =
- TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Checks performed when output is configured
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output));
-
- return Status{};
-}
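For reference, the kernel removed above implemented PReLU as an element-wise conditional operation over F32 and QASYMM8 tensors. A minimal scalar sketch of the same per-element rule follows; the NEON vector paths, broadcasting machinery and quantized load/store are omitted, and prelu is an illustrative helper rather than library code.

// PReLU rule applied by the removed kernel: out = in when in >= 0, alpha * in otherwise.
#include <cstddef>
#include <vector>

std::vector<float> prelu(const std::vector<float> &input, const std::vector<float> &alpha)
{
  std::vector<float> output(input.size());
  for (std::size_t i = 0; i < input.size(); ++i)
  {
    // alpha may hold a single slope or one slope per element.
    const float a = alpha[alpha.size() == 1 ? 0 : i];
    output[i] = input[i] >= 0.f ? input[i] : a * input[i];
  }
  return output;
}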
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
index 6ba0f1fd4..5841f1d69 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
DataType::F32);
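The check above now expects QASYMM8_SIGNED output instead of the plain S8 type; the values produced are still symmetrically quantized signed 8-bit integers scaled by the separate scale_factor tensor. A hedged sketch of that scheme under the common convention follows (quantize_symmetric is illustrative; the kernel's exact rounding and per-row scale handling may differ).

// Symmetric signed 8-bit quantization: q = clamp(round(x / scale), -127, 127).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int8_t> quantize_symmetric(const std::vector<float> &x, float scale)
{
  std::vector<int8_t> q(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
  {
    const int v = static_cast<int>(std::lround(x[i] / scale));
    q[i] = static_cast<int8_t>(std::min(127, std::max(-127, v)));
  }
  return q;
}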
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
deleted file mode 100644
index 44feb200f..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
-
- // Validate output if initialized
- if (output->total_size() != 0)
- {
- const DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_batch =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
- output->tensor_shape()[idx_batch]);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
- 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
- output->tensor_shape().total_size());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
- : _input(nullptr), _output(nullptr), _block_shape()
-{
-}
-
-void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
-
- _input = input;
- _block_shape = block_shape;
- _output = output;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
- INEKernel::configure(win);
-}
-
-Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
- return Status{};
-}
-
-void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
- const int channel_idx =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int element_size = _input->info()->element_size();
-
- const size_t channel_size = _input->info()->dimension(channel_idx);
-
- Window slice_out = window.first_slice_window_3D();
-
- int batch_id = 0;
-
- // Main loop for NCHW and NHWC
- if (_output->info()->data_layout() == DataLayout::NCHW)
- {
- do
- {
- Iterator out(_output, slice_out);
- execute_window_loop(slice_out,
- [&](const Coordinates &id) {
- const size_t channel_id = id.z();
- const size_t in_x =
- id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y =
- id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{in_x, in_y, z, batch_id};
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
- ++batch_id;
- } while (window.slide_window_slice_3D(slice_out));
- }
- else
- {
- do
- {
- Iterator out(_output, slice_out);
- execute_window_loop(slice_out,
- [&](const Coordinates &id) {
- const size_t channel_id = id.x();
- const size_t in_x =
- id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y =
- id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{z, in_x, in_y, batch_id};
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
- ++batch_id;
- } while (window.slide_window_slice_3D(slice_out));
- }
-}
-} // namespace arm_compute
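The removed NESpaceToDepthLayerKernelEx rearranges spatial blocks into channels. A standalone sketch of the NCHW index mapping taken from the loop above follows; space_to_depth_nchw is an illustrative helper without the window/iterator machinery, and H and W are assumed to be multiples of the block size.

// For every output element (n, c, y, x):
//   in_x = x * block + (c / C_in) % block
//   in_y = y * block + (c / C_in) / block
//   in_c = c % C_in
#include <cstddef>
#include <vector>

void space_to_depth_nchw(const std::vector<float> &in, std::vector<float> &out, std::size_t N,
                         std::size_t C_in, std::size_t H, std::size_t W, std::size_t block)
{
  const std::size_t C_out = C_in * block * block;
  const std::size_t H_out = H / block;
  const std::size_t W_out = W / block;
  out.resize(N * C_out * H_out * W_out);
  for (std::size_t n = 0; n < N; ++n)
    for (std::size_t c = 0; c < C_out; ++c)
      for (std::size_t y = 0; y < H_out; ++y)
        for (std::size_t x = 0; x < W_out; ++x)
        {
          const std::size_t in_x = x * block + (c / C_in) % block;
          const std::size_t in_y = y * block + (c / C_in) / block;
          const std::size_t in_c = c % C_in;
          out[((n * C_out + c) * H_out + y) * W_out + x] =
              in[((n * C_in + in_c) * H + in_y) * W + in_x];
        }
}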
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
deleted file mode 100644
index 2d379cf36..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-namespace arm_compute
-{
-
-CLArgOperation::CLArgOperation()
-{
- // DO NOTHING
-}
-
-void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
- ArgOperation op)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
- _input = input;
- _output = output;
- _axis = axis;
- _arg_op = op;
- // NOTE The argminmax_axis must have no duplication.
- _num_of_kernels = axis.size();
- const size_t num_of_interm_tensors = _num_of_kernels - 1;
-
- _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
- _argop_kernels =
- arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
-
- TensorShape shape{input->info()->tensor_shape()};
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- shape.set(_axis[i], 1);
- _interm_tensors[i].allocator()->init(
- TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
- .set_data_layout(input->info()->data_layout()));
- _interm_tensors[i].allocator()->allocate();
- }
-
- // Set a vector that is ordered ICLTensors sequentially.
- std::vector<ICLTensor *> tensors;
- tensors.emplace_back(input);
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- tensors.emplace_back(_interm_tensors.get() + i);
- }
- tensors.emplace_back(output);
-
- // Apply ArgMinMax on all kernels
- for (size_t i = 0; i < _num_of_kernels; i++)
- {
- _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
- }
-}
-
-Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
- const ITensorInfo *output, ArgOperation op)
-{
- const size_t num_of_kernels = axis.size();
- const size_t num_of_interm_tensors = num_of_kernels - 1;
-
- // Create temporary tensor infos
- auto interm_tensors =
- arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
-
- // Create intermediate tensor info
- TensorShape shape{input->tensor_shape()};
-
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- shape.set(axis[i], 1);
- interm_tensors[i].set_data_type(input->data_type());
- interm_tensors[i].set_tensor_shape(shape);
- interm_tensors[i].set_num_channels(input->num_channels());
- }
-
- // Set a vector that is ordered ITensorInfo sequentially.
- std::vector<const ITensorInfo *> tensors;
- tensors.emplace_back(input);
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- tensors.emplace_back(interm_tensors.get() + i);
- }
- tensors.emplace_back(output);
-
- // Validate argminmax only on all kernels
- for (size_t i = 0; i < num_of_kernels; i++)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(
- CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
- }
-
- return Status{};
-}
-
-void CLArgOperation::run()
-{
- for (size_t i = 0; i < _num_of_kernels; ++i)
- {
- CLScheduler::get().enqueue(_argop_kernels[i]);
- }
-}
-
-} // namespace arm_compute
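The removed CLArgOperation chained one arg-reduction kernel per requested axis, shrinking each reduced axis to size 1 through intermediate tensors. A minimal sketch of a single such reduction over the last axis of a row-major matrix follows (argmax_last_axis is an illustrative helper; the CL kernels also support argmin and arbitrary axes).

#include <cstddef>
#include <vector>

// Returns, for each row, the column index of the maximum element.
std::vector<std::size_t> argmax_last_axis(const std::vector<float> &data, std::size_t rows,
                                          std::size_t cols)
{
  std::vector<std::size_t> result(rows, 0);
  for (std::size_t r = 0; r < rows; ++r)
    for (std::size_t c = 1; c < cols; ++c)
      if (data[r * cols + c] > data[r * cols + result[r]])
        result[r] = c;
  return result;
}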
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
index 92ee69a36..e5122ab8f 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -48,7 +48,7 @@ using namespace arm_compute;
void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
BinaryLogicalOperation op)
{
- auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
k->configure(input1, input2, output, op);
_kernel = std::move(k);
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
deleted file mode 100644
index b3118f39e..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLCast.h"
-
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-using namespace arm_compute;
-
-void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
- k->configure(input, output, input_subtype);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
deleted file mode 100644
index db662505a..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
-
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-using namespace arm_compute;
-
-void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
- k->configure(input, output, block_size);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
new file mode 100644
index 000000000..3dede0562
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+CLDirectTransposeConvLayer::CLDirectTransposeConvLayer(
+ std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _flip_axis(),
+ _is_prepared(false)
+{
+}
+
+Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+ weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ if (bias != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+ "Output's depth is invalid.");
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, weights_info));
+
+ return Status{};
+}
+
+void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info,
+ invalid_right, invalid_bottom, weights_info);
+}
+
+void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+ invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(
+ *output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
+ _is_prepared = weights_info.retain_internal_weights();
+
+ _memory_group.manage(&_scaled_output);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+ // to match output shape
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ // configure scale function
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+ _scale_f.configure(input, &_scaled_output, upsample_info);
+
+ // Setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info,
+ weights_info);
+ _scaled_output.allocator()->allocate();
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ if (weights->info()->data_layout() == DataLayout::NHWC)
+ {
+ axis_data[0] = 1;
+ axis_data[1] = 2;
+ }
+ else
+ {
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ }
+ _flip_axis.unmap();
+}
+
+void CLDirectTransposeConvLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _scale_f.run();
+ _conv_f.run();
+}
+
+void CLDirectTransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _flip_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ // Free flipped weights
+ if (!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
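The new function realises a transposed convolution in two stages: the input is zero-upsampled by the stride (the scale function) and the result is convolved with the flipped weights at stride 1. A 1-D sketch of that decomposition follows; transpose_conv_1d is an illustrative helper that ignores the padding and invalid_right/invalid_bottom trimming handled by the real layer, and it assumes non-empty input and kernel.

#include <cstddef>
#include <vector>

std::vector<float> transpose_conv_1d(const std::vector<float> &input,
                                     const std::vector<float> &kernel, std::size_t stride)
{
  // Step 1: insert (stride - 1) zeros between input samples.
  std::vector<float> upsampled((input.size() - 1) * stride + 1, 0.f);
  for (std::size_t i = 0; i < input.size(); ++i)
    upsampled[i * stride] = input[i];

  // Step 2: flip the kernel, as the weight-flip kernel does above.
  std::vector<float> flipped(kernel.rbegin(), kernel.rend());

  // Step 3: stride-1 "full" correlation of the upsampled signal with the
  // flipped kernel; out-of-range taps read as zero.
  const std::size_t K = flipped.size();
  const std::size_t out_len = upsampled.size() + K - 1;
  std::vector<float> output(out_len, 0.f);
  for (std::size_t o = 0; o < out_len; ++o)
    for (std::size_t k = 0; k < K; ++k)
    {
      const std::ptrdiff_t i =
          static_cast<std::ptrdiff_t>(o + k) - static_cast<std::ptrdiff_t>(K - 1);
      if (i >= 0 && static_cast<std::size_t>(i) < upsampled.size())
        output[o] += flipped[k] * upsampled[static_cast<std::size_t>(i)];
    }
  return output;
}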
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
index 3d9a28a48..ae9d8afc6 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
const ICLTensor *lookups)
{
- auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
index f098832b0..01989461e 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -45,7 +45,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <algorithm>
@@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
ARM_COMPUTE_UNUSED(weights);
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+ CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
return Status{};
}
@@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = support::cpp14::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen
// Quantize input
_quantized_input.allocator()->init(
- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
_memory_group.manage(&_quantized_input);
_quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
@@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
@@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
// Validate quantization symm8 kernel
- const ITensorInfo &quantized_input = TensorInfo(
- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ const ITensorInfo &quantized_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
ARM_COMPUTE_RETURN_ON_ERROR(
CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
index 63e291b36..2ff4b9659 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -46,7 +46,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <algorithm>
@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = support::cpp14::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
index 9aebc473e..157b4d977 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
fc->configure(input_to_use, _weights, _biases, _output);
return std::unique_ptr<arm_compute::IFunction>(fc);
}
- else
+ else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS)
{
- assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
-
bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
input->info()->data_type() == DataType::F16) &&
- weights->info()->data_type() == DataType::S8;
+ (weights->info()->data_type() == DataType::S8 ||
+ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
if (is_hybrid)
{
auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
+ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+      const auto origin_weights_data_type = weights_info->data_type();
+ weights_info->set_data_type(DataType::QASYMM8_SIGNED);
fc->configure(input_to_use, _weights, _biases, _output);
+      weights_info->set_data_type(origin_weights_data_type);
return std::unique_ptr<arm_compute::IFunction>(fc);
}
else
@@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
return std::unique_ptr<arm_compute::IFunction>(fc);
}
}
+ else
+ {
+ throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
+ }
+
}();
if (_needs_reshape)
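The hybrid branch above relabels the weights as QASYMM8_SIGNED only for the duration of configure() and then restores the original type by hand. If an early return or exception were ever possible between the two calls, a small scope guard would keep the restore reliable; the sketch below is a hedged alternative, not what the code above uses.

#include <functional>
#include <utility>

// Runs the stored restore action when the scope ends, even on early exit.
class ScopeRestore
{
public:
  explicit ScopeRestore(std::function<void()> restore) : _restore(std::move(restore)) {}
  ~ScopeRestore() { _restore(); }
  ScopeRestore(const ScopeRestore &) = delete;
  ScopeRestore &operator=(const ScopeRestore &) = delete;

private:
  std::function<void()> _restore;
};

// Usage sketch with the names from the hunk above:
//   const auto origin = weights_info->data_type();
//   ScopeRestore guard([=] { weights_info->set_data_type(origin); });
//   weights_info->set_data_type(DataType::QASYMM8_SIGNED);
//   fc->configure(input_to_use, _weights, _biases, _output);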
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
deleted file mode 100644
index ca5499dfc..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::cl_gemm;
-
-namespace
-{
-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
-{
- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx(
- std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(),
- _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0),
- _reshape_b_only_on_first_run(false), _is_prepared(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b,
- const ICLTensor *c, ICLTensor *output,
- const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(c);
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate(
- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- _is_prepared = false;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _a_offset = a->info()->quantization_info().uniform().offset;
- _b_offset = b->info()->quantization_info().uniform().offset;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Set the target for the kernels
- _mm_midgard_kernel.set_target(gpu_target);
-
- // GEMMRHSMatrixInfo rhs_info;
- // GEMMLHSMatrixInfo lhs_info;
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m,
- // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d
- ? (a->info()->dimension(1) * a->info()->dimension(2))
- : a->info()->dimension(1);
- const unsigned int n = b->info()->dimension(0);
- const unsigned int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- const ICLTensor *matrix_b = b;
- // Configure matrix multiply kernel
- _mm_midgard_kernel.configure(
- a, matrix_b, output,
- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
-}
-
-Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output,
- const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_UNUSED(c);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
- "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
- "Matrix B already reshaped is not supported");
-
- const ITensorInfo *matrix_a_info = a;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m =
- reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target);
-
- const GEMMReshapeInfo reshape_info =
- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
- TensorInfo weights_info(*b);
- const ITensorInfo *matrix_b_info = &weights_info;
- if (reshape_matrix_b)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(false,
- "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b");
- }
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate(
- matrix_a_info, matrix_b_info, output, reshape_info));
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Run matrix multiply
- CLScheduler::get().enqueue(_mm_midgard_kernel, false);
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::prepare()
-{
- if (!_is_prepared)
- {
- _is_prepared = true;
- }
-}
-} // namespace arm_compute
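The removed CLGEMMLowpMatrixMultiplyCoreEx only configured and enqueued the raw Midgard matrix-multiply kernel: signed 8-bit inputs multiplied into 32-bit accumulators, with offset and output-stage handling left to the caller. A reference sketch of that core product follows (gemmlowp_s8 is an illustrative helper for row-major matrices).

#include <cstddef>
#include <cstdint>

// C (m x n) = A (m x k) * B (k x n); int8 inputs, int32 accumulation.
void gemmlowp_s8(const int8_t *A, const int8_t *B, int32_t *C, std::size_t m, std::size_t n,
                 std::size_t k)
{
  for (std::size_t i = 0; i < m; ++i)
    for (std::size_t j = 0; j < n; ++j)
    {
      int32_t acc = 0;
      for (std::size_t p = 0; p < k; ++p)
        acc += static_cast<int32_t>(A[i * k + p]) * static_cast<int32_t>(B[p * n + j]);
      C[i * n + j] = acc;
    }
}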
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
index f594d7a2e..e0b833b04 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -48,7 +48,7 @@ using namespace arm_compute;
void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>();
+ auto k = support::cpp14::make_unique<CLGatherExKernel>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
index 27ed8e828..65b89a389 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
{
- auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+ auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
index 80393e8d1..5a7e40839 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
ICLTensor *gamma, ICLTensor *beta, float epsilon)
{
- auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
k->configure(input, output, gamma, beta, epsilon);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
deleted file mode 100644
index fbb15ab1d..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLPReLU.h"
-
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
- k->configure(input, alpha, output);
- _kernel = std::move(k);
-
- if (output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
-
- if (broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
deleted file mode 100644
index 6049b7e70..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
- _gemm_output(), _add_output(), _is_prepared(false)
-{
-}
-
-Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info)
-{
- const int idx_width = 0;
- const int idx_height = 1;
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
- output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
- recurrent_weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
- recurrent_weights->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- hidden_state->tensor_shape());
-
- auto shape_info =
- TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
- input->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
- ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
-
- return Status{};
-}
-
-void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
- const ICLTensor *recurrent_weights, const ICLTensor *bias,
- ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
- recurrent_weights->info(), bias->info(),
- hidden_state->info(), output->info(), info));
-
- const int idx_height = 1;
- TensorShape shape =
- compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
-
- _is_prepared = false;
-
- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
-
- // Manage intermediate buffers and configure
- _memory_group.manage(&_fully_connected_out);
- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
-
- _memory_group.manage(&_gemm_output);
- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
-
- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _memory_group.manage(&_add_output);
-
- _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
- &_add_output, ConvertPolicy::SATURATE);
-
- _fully_connected_out.allocator()->allocate();
- _gemm_output.allocator()->allocate();
-
- _activation_kernel.configure(&_add_output, hidden_state, info);
- _add_output.allocator()->allocate();
-
- _copy_kernel.configure(hidden_state, output);
-}
-
-void CLRNNLayerEx::run()
-{
- prepare();
-
- _memory_group.acquire();
-
- _fully_connected_kernel.run();
- _gemm_state_f.run();
- CLScheduler::get().enqueue(_add_kernel);
- CLScheduler::get().enqueue(_activation_kernel);
-
- // copy hidden out to output
- CLScheduler::get().enqueue(_copy_kernel);
-
- _memory_group.release();
-}
-
-void CLRNNLayerEx::prepare()
-{
- if (!_is_prepared)
- {
- _fully_connected_kernel.prepare();
- _gemm_state_f.prepare();
-
- _is_prepared = true;
- }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index 8ce2d746c..a41e6db60 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
// Create temporary tensor infos
- auto interm_tensors =
- arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+ auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
// Create intermediate tensor info
TensorShape shape{input->tensor_shape()};
@@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
const size_t num_of_kernels = axis.size();
const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
- _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
- _reduce_kernels =
- arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+ _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
// Set a vector that is ordered ICLTensors sequentially.
std::vector<ICLTensor *> tensors;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
deleted file mode 100644
index 7d7b2264b..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
-
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-using namespace arm_compute;
-
-void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
- k->configure(input, output, block_size);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
index e61746ef2..3215d01a7 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,218 +37,124 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/UtilsEx.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include <cmath>
#include <memory>
#include <tuple>
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
-CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _scale_f(),
- _conv_f(),
- _flip_weights(),
- _scaled_output(),
- _original_weights(nullptr),
- _weights_flipped(),
- _is_prepared(false)
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_manager(std::move(memory_manager)), _function()
+{
+}
+
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info,
+ invalid_right, invalid_bottom, weights_info);
+}
+
+void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
+ ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr,
+ output->info(), deconv_info, invalid_right,
+ invalid_bottom, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
+ f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
+ invalid_bottom, weights_info);
+ _function = std::move(f);
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ f->configure(compile_context, input, weights, bias, output, deconv_info);
+ _function = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
}
Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
const ITensorInfo *bias, ITensorInfo *output,
- const PadStrideInfo &info, unsigned int invalid_right,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
unsigned int invalid_bottom, const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-
- const DataLayout data_layout = input->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-
- const unsigned int kernel_x = weights->dimension(idx_w);
- const unsigned int kernel_y = weights->dimension(idx_h);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
- "invalid_right must be smaller than kernel_x");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
- "inner_border_top must be smaller than kernel_y");
-
- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added.
- auto out_dims = transposeconv_output_dimensions(
- input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
- weights->dimension(idx_h), info, invalid_right, invalid_bottom);
-
- const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
- if (bias != nullptr)
+ switch (CLTransposeConvLayer::get_deconvolution_method(
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
{
- if (is_data_type_quantized_asymmetric(input->data_type()))
+ case DeconvolutionMethod::DIRECT:
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+      // Validate direct transpose convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate(
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
+ break;
}
- else
+ case DeconvolutionMethod::GEMM:
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+      // Validate GEMM-based deconvolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+ break;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
- "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
- "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
- "Output's depth is invalid.");
-
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
- pad_bottom);
- TensorInfo scale_out_info(input->clone()
- ->set_is_resizable(true)
- .reset_padding()
- .set_tensor_shape(scale_out_shape)
- .set_data_layout(data_layout));
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- ARM_COMPUTE_RETURN_ON_ERROR(
- CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
- conv_info, weights_info));
-
return Status{};
}
-void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
- ICLTensor *output, const PadStrideInfo &info,
- unsigned int invalid_right, unsigned int invalid_bottom,
- const WeightsInfo &weights_info)
+DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method(
+ const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
+ ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
+ ARM_COMPUTE_UNUSED(output, bias, weights_info);
- const DataLayout data_layout = input->info()->data_layout();
+ const DataLayout data_layout = input->data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- _original_weights = weights;
- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped);
-
- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were
- // added.
- auto out_dims = transposeconv_output_dimensions(
- input->info()->dimension(idx_w), input->info()->dimension(idx_h),
- weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
- invalid_bottom);
-
- const TensorShape output_shape =
- compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(
- *output->info(),
- input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
- input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
- info, invalid_right, invalid_bottom));
-
- _is_prepared = weights_info.retain_internal_weights();
-
- _memory_group.manage(&_scaled_output);
-
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
- // to match output shape
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
- pad_right, pad_top, pad_bottom);
-
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
- input->info()->quantization_info());
- scale_out_info.set_data_layout(data_layout);
- _scaled_output.allocator()->init(scale_out_info);
-
- // configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
- DimensionRoundingType::FLOOR);
- _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
- _scaled_output.allocator()->allocate();
+ if (weights->dimension(idx_w) != deconv_info.stride().first ||
+ weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 ||
+ invalid_bottom != 0)
+ {
+ return DeconvolutionMethod::DIRECT;
+ }
+
+ return DeconvolutionMethod::GEMM;
}
void CLTransposeConvLayer::run()
{
prepare();
-
- _memory_group.acquire();
-
- _scale_f.run();
- _conv_f.run();
-
- _memory_group.release();
+ _function->run();
}
-void CLTransposeConvLayer::prepare()
-{
- if (!_is_prepared)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights flipping and mark original weights tensor as unused
- _weights_flipped.allocator()->allocate();
- _weights_flipped.map(true);
- _original_weights->map(CLScheduler::get().queue(), true);
- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
- _weights_flipped.unmap();
- _original_weights->unmap(CLScheduler::get().queue());
- _original_weights->mark_as_unused();
-
- // Prepare convolution
- _conv_f.prepare();
-
- if (!_weights_flipped.is_used())
- {
- _weights_flipped.allocator()->free();
- }
-
- _is_prepared = true;
- }
-}
+void CLTransposeConvLayer::prepare() { _function->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
deleted file mode 100644
index 07feb5a64..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
-CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
- : _upsample(),
- _output(nullptr)
-{
-}
-
-Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
-}
-
-void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _output = output;
- _upsample.configure(input, _output, inner_border, info);
-}
-
-void CLTransposeConvLayerUpsample::run()
-{
- _output->map(CLScheduler::get().queue(), true);
- if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
-
- CLScheduler::get().enqueue(_upsample, false);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
index 114e1a72d..768c15b41 100644
--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
@@ -41,14 +41,14 @@
#include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h"
#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
const ITensor *off_value, ITensor *output, const int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<CPPOneHotKernelEx>();
+ auto k = support::cpp14::make_unique<CPPOneHotKernelEx>();
k->configure(indices, depth, on_value, off_value, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
deleted file mode 100644
index 6c90ef3b4..000000000
--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
-
-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info)
-{
- auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>();
- k->configure(input, output, info);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
index ff81ff854..2752eb6aa 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
@@ -42,7 +42,7 @@
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
#include "arm_compute/runtime/IRuntimeContext.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -53,7 +53,7 @@ NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT
void NEActivationLayerEx::configure(ITensor *input, ITensor *output,
ActivationLayerInfo activation_info)
{
- auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernelEx>();
+ auto k = support::cpp14::make_unique<NEActivationLayerKernelEx>();
k->configure(input, output, activation_info);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
index e42c453cf..2fc94b267 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -42,7 +42,7 @@
#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
#include "arm_compute/core/ITensor.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -53,7 +53,7 @@ template <BinaryLogicalOperation COP>
void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
ITensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(COP, input1, input2, output);
_kernel = std::move(k);
}
@@ -69,7 +69,7 @@ Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
BinaryLogicalOperation op)
{
- auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(op, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
deleted file mode 100644
index dc5c62061..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NECast.h"
-
-#include "arm_compute/core/NEON/kernels/NECastKernel.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
-{
- auto k = arm_compute::support::cpp14::make_unique<NECastKernel>();
- k->configure(input, output, input_subtype);
- _kernel = std::move(k);
-}
-
-Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- return NECastKernel::validate(input, output, input_subtype);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
deleted file mode 100644
index 5ec0b8677..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
-{
- auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>();
- k->configure(input, output, block_shape);
- _kernel = std::move(k);
-}
-
-Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
index 53fb15081..e0ab3e025 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -41,13 +41,13 @@
#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
{
- auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
index f45773251..a123439d9 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -58,7 +58,7 @@ namespace
Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
ARM_COMPUTE_RETURN_ON_ERROR(
- NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
return Status{};
}
@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
+ auto k = support::cpp14::make_unique<NETransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -158,7 +158,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
// Quantize input
_quantized_input.allocator()->init(
- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
_scale_factor.allocator()->init(
TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
_quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
@@ -186,7 +187,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
@@ -224,8 +225,9 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
// Validate quantization kernel
- const ITensorInfo &quantized_input = TensorInfo(
- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ const ITensorInfo &quantized_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
ARM_COMPUTE_RETURN_ON_ERROR(
NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
index fcac3c7ae..dc6c78478 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -56,12 +56,17 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input
assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
bool is_hybrid = input->info()->data_type() == DataType::F32 &&
- weights->info()->data_type() == DataType::S8;
+ (weights->info()->data_type() == DataType::S8 ||
+ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
if (is_hybrid)
{
auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+    ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+    const auto origin_weights_data_type = weights_info->data_type();
+    weights_info->set_data_type(DataType::QASYMM8_SIGNED);
fc->configure(input_to_use, _weights, _biases, _output);
+    weights_info->set_data_type(origin_weights_data_type);
return std::unique_ptr<arm_compute::IFunction>(fc);
}
else
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
deleted file mode 100644
index 1290cfd39..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
- std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
- _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
- _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
- _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
- _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
- _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
- _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
- _fuse_output_stage(false), _flip_signedness(false)
-{
-}
-
-void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
- ITensor *output, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(c);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- const ITensor *matrix_a = a;
- const ITensor *matrix_b = b;
- GEMMInfo info = gemm_info;
-
- // Clear state
- _mtx_a_reshape_kernel = nullptr;
- _mtx_b_reshape_kernel = nullptr;
-
- // Set internal variables
- _a_offset = a->info()->quantization_info().uniform().offset;
- _b_offset = b->info()->quantization_info().uniform().offset;
- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
- _is_prepared = false;
- _fused_assembly_path = false;
- _original_b = b;
-
- const ITensor *a_to_use = a;
-
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- _fuse_output_stage = true;
- _memory_group.manage(&_mm_result_s32);
- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
- _mm_result_s32.allocator()->init(info_mm_result_s32);
- }
-
-#ifdef __aarch64__
- switch (a->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::U8:
- case DataType::S8:
- {
- if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- _asm_glue.configure(a_to_use, b, c, output, gemm_info);
- _fused_assembly_path = _asm_glue.is_configured();
- }
- else
- {
- _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
- gemm_info);
- }
- _assembly_path = _asm_glue.is_configured();
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Datatype not supported");
- break;
- }
- }
-#endif /* __aarch64__ */
- if (!(_assembly_path || _run_vector_matrix_multiplication))
- {
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
- // 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
- a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
- // 16.0f) ]
- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
- b->info()->quantization_info());
- _tmp_a.allocator()->init(a_info);
- _tmp_b.allocator()->init(b_info);
- _memory_group.manage(&_tmp_a);
- if (!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
-
- // Configure interleave kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a_to_use, &_tmp_a);
- _mtx_a_reshape_kernel = std::move(k);
- }
-
- // Configure transpose kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
- k->configure(b, &_tmp_b);
- _mtx_b_reshape_kernel = std::move(k);
- }
- }
-
- if (!_fused_assembly_path)
- {
- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if (_a_offset != 0)
- {
- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
-
- _vector_sum_col.allocator()->init(info_vector_sum_col);
- if (!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_vector_sum_col);
- }
-
- // Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
- }
-
- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if (_b_offset != 0)
- {
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
-
- _vector_sum_row.allocator()->init(info_vector_sum_row);
- _memory_group.manage(&_vector_sum_row);
-
- // Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
- false);
- }
-
- if (_fuse_output_stage)
- {
- // Configure matrix multiply kernel
- if (!_assembly_path)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(matrix_a, matrix_b, &_mm_result_s32);
- _mm_kernel = std::move(k);
- }
-
- _offset_contribution_output_stage_kernel.configure(
- &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
- _b_offset, info.gemmlowp_output_stage());
- }
- else
- {
- // Configure matrix multiply kernel
- if (!_assembly_path)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(matrix_a, matrix_b, output);
- _mm_kernel = std::move(k);
- }
- // Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row,
- a_to_use->info()->dimension(0), _a_offset, _b_offset);
- }
- }
-
- // Allocate tensors
- if (!_assembly_path && !_run_vector_matrix_multiplication)
- {
- _tmp_a.allocator()->allocate();
- if (!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- if (!_fused_assembly_path)
- {
- if (_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- }
-
- if (_b_offset != 0)
- {
- _vector_sum_row.allocator()->allocate();
- }
- }
-
- if (_fuse_output_stage)
- {
- _mm_result_s32.allocator()->allocate();
- }
-}
-
-Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output,
- const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
- "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is "
- "equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
- "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
- "Matrix B already reshaped is not supported");
-
- GEMMInfo info = gemm_info;
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- const ITensorInfo *a_to_use = a;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo mm_result_s32_info{};
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if (fuse_output_stage)
- {
- auto_init_if_empty(
- mm_result_s32_info,
- a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
- }
-
- // Check if we need to run the optimized assembly kernel
- bool run_optimised = false;
- bool run_optimised_requantized = false;
- if (a_to_use->data_type() == DataType::QASYMM8 &&
- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
- run_optimised_requantized = run_optimised;
- }
- else
- {
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(
- a_to_use, b, c, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
- }
-
- if (run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if (info.depth_output_gemm3d() != 0)
- {
- if (info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
- "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
- "NEGEMM cannot reinterpret the output tensor as 3D");
-
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if (!run_vector_matrix_multiplication)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
- // 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
- // / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
- }
- }
-
- if (!run_optimised_requantized)
- {
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if (a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
- b, &info_vector_sum_col, a->dimension(0), false));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if (b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
- a_to_use, &info_vector_sum_row, a->dimension(0), false));
- }
-
- if (fuse_output_stage)
- {
- if (!run_optimised)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
- matrix_a_info, matrix_b_info, &mm_result_s32_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
- &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
- info.gemmlowp_output_stage()));
- }
- else
- {
- if (!run_optimised)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(
- NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
- }
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
- output, a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
- }
- }
- return Status{};
-}
-
-void NEGEMMLowpMatrixMultiplyCoreEx::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Reshape inputs
- if (_mtx_a_reshape_kernel)
- {
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
- }
- if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
- {
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- }
-
- // Run GEMM
- if (_asm_glue.is_configured())
- {
- _asm_glue.run();
- }
- else
- {
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
- }
-
- if (!_fused_assembly_path)
- {
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if (_b_offset != 0)
- {
- NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if (_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
- }
-
- if (_fuse_output_stage)
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
- }
- else
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
- }
- }
-}
-
-void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
-{
- if (!_is_prepared)
- {
- // Run assembly reshape
- if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
- _asm_glue.prepare();
- _original_b->mark_as_unused();
- }
- // Run non-assembly reshape
- else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
- // Run reshape kernel and mark original weights tensor as unused
- _tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- _original_b->mark_as_unused();
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if (_a_offset != 0 && _reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
- }
-
- _is_prepared = true;
- }
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
index c8bb88aea..433c35d58 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -41,7 +41,7 @@
#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -49,7 +49,7 @@ namespace arm_compute
{
void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+ auto k = support::cpp14::make_unique<NEGatherKernelEx>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
index 078019f4e..52d58accf 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -41,14 +41,14 @@
#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
ITensor *output, ITensor *hits)
{
- auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
+ auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
deleted file mode 100644
index dac3b849d..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
-
-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>();
- k->configure(input, alpha, output);
- _kernel = std::move(k);
-}
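The deleted file was only a thin wrapper around NEPReLUKernel; presumably the runtime now relies on the PReLU support shipped with the newer Compute Library instead of this Ex variant. For reference, the operation itself is the elementwise function below (per-channel broadcasting of alpha omitted for brevity):

```cpp
#include <cstddef>

// Reference semantics of PReLU: out = x if x > 0, otherwise alpha * x.
void prelu_reference(const float *input, const float *alpha, float *output, std::size_t n)
{
  for (std::size_t i = 0; i < n; ++i)
    output[i] = input[i] > 0.f ? input[i] : alpha[i] * input[i];
}

int main()
{
  const float x[4] = {-2.f, -0.5f, 0.5f, 2.f};
  const float a[4] = {0.1f, 0.1f, 0.1f, 0.1f};
  float y[4];
  prelu_reference(x, a, y, 4);
  // y is now {-0.2, -0.05, 0.5, 2}
}
```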
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
deleted file mode 100644
index 0e9a5e969..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
- _gemm_output(), _add_output(), _is_prepared(false)
-{
-}
-
-Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
- output);
-
- const int idx_width = 0;
- const int idx_height = 1;
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
- recurrent_weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
- recurrent_weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- hidden_state->tensor_shape());
-
- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(
- recurrent_weights, hidden_state->dimension(idx_height)),
- 1, input->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
- &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
-
- return Status{};
-}
-
-void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
- const ITensor *recurrent_weights, const ITensor *bias,
- ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
- recurrent_weights->info(), bias->info(),
- hidden_state->info(), output->info(), info));
-
- const int idx_height = 1;
- TensorShape shape = misc::shape_calculator::compute_rnn_shape(
- recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
-
- _is_prepared = false;
-
- // Manage intermediate buffers and configure
- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
-
- // Manage intermediate buffers and configure
- _memory_group.manage(&_fully_connected_out);
- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
-
- _memory_group.manage(&_gemm_output);
- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
-
- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _memory_group.manage(&_add_output);
-
- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
- ConvertPolicy::SATURATE);
-
- _fully_connected_out.allocator()->allocate();
- _gemm_output.allocator()->allocate();
-
- _activation_kernel.configure(&_add_output, hidden_state, info);
- _add_output.allocator()->allocate();
-
- _copy_kernel.configure(hidden_state, output);
-}
-
-void NERNNLayerEx::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- _fully_connected_kernel.run();
-
- _gemm_state_f.run();
-
- NEScheduler::get().schedule(&_add_kernel, Window::DimY);
- NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
-
- // copy hidden out to output
- NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
-}
-
-void NERNNLayerEx::prepare()
-{
- if (!_is_prepared)
- {
- _fully_connected_kernel.prepare();
- _gemm_state_f.prepare();
-
- _is_prepared = true;
- }
-}
-} // namespace arm_compute
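The deleted configure()/run() above wires a fully connected layer on the input, a GEMM on the previous hidden state, an elementwise add, an activation, and a copy into the output, i.e. a vanilla RNN cell. In equation form (f is the configured activation, W the input weights, R the recurrent weights):

```latex
h_t = f\left(W x_t + R\,h_{t-1} + b\right)
```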
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
deleted file mode 100644
index 116bba3c0..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
- _reduction_ops(), _keep_dims()
-{
-}
-
-Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
- bool keep_dims, const ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(keep_dims);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
- TensorShape out_shape = input->tensor_shape();
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
- const int input_dims = input->num_dimensions();
- Coordinates axis_local = reduction_axis;
-
- // Convert negative axis
- for (unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
-
- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for (unsigned int i = 0; i < reduction_ops; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
- input->num_dimensions() - 1);
- if (output->total_size() > 0 && keep_dims)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
- }
- if (keep_dims)
- {
- out_shape.set(axis_local[i], 1);
- }
- else
- {
- out_shape.remove_dimension(axis_local[i] - i);
- }
- }
- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
- return Status{};
-}
-
-void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
- ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
- _reduction_ops = reduction_axis.num_dimensions();
- _reduction_kernels =
- arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
- _reduced_outs =
- arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
- _keep_dims = keep_dims;
-
- Coordinates axis_local = reduction_axis;
- const int input_dims = input->info()->num_dimensions();
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
-
- // Convert negative axis
- for (unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
-
- // Perform reduction for every axis
- for (unsigned int i = 0; i < _reduction_ops; ++i)
- {
- TensorShape out_shape = i == 0 ? input->info()->tensor_shape()
- : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
- out_shape.set(axis_local[i], 1);
- auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
-
- if (i == _reduction_ops - 1 && keep_dims)
- {
- _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
- }
- else
- {
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
- input->info()->data_type(),
- input->info()->quantization_info())
- .set_data_layout(output->info()->data_layout()));
- _memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i],
- ReductionOperation::MEAN_SUM);
- }
- }
-
- // Allocate intermediate tensors
- for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
- {
- _reduced_outs[i].allocator()->allocate();
- }
-
- // Configure reshape layer if we want to drop the dimensions
- if (!keep_dims)
- {
- TensorShape out_shape = input->info()->tensor_shape();
-
- // We have to sort the reduction axis vectors in order for remove_dimension
- // to work properly
- std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for (unsigned int i = 0; i < _reduction_ops; ++i)
- {
- out_shape.remove_dimension(axis_local[i] - i);
- }
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
- }
-}
-
-void NEReduceMeanEx::run()
-{
- _memory_group.acquire();
-
- for (unsigned int i = 0; i < _reduction_ops; ++i)
- {
- _reduction_kernels[i].run();
- }
-
- if (!_keep_dims)
- {
- _reshape.run();
- }
- _memory_group.release();
-}
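The deleted ReduceMean implementation chains one ReductionOperation::MEAN_SUM stage per requested axis, each collapsing a single axis to size 1, and reshapes away the kept dimensions at the end when keep_dims is false. A small standalone reference for what one such stage computes (mean over the inner axis of a 3x4 input):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  const std::vector<std::vector<float>> in = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
  std::vector<float> out(in.size(), 0.f); // one MEAN reduction over the inner axis
  for (std::size_t r = 0; r < in.size(); ++r)
  {
    float sum = 0.f;
    for (float v : in[r])
      sum += v;
    out[r] = sum / in[r].size();
  }
  std::printf("%g %g %g\n", out[0], out[1], out[2]); // 2.5 6.5 10.5
}
```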
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
deleted file mode 100644
index 198bb7672..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-NESpaceToBatchLayerEx::NESpaceToBatchLayerEx()
- : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
-{
-}
-
-void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape,
- const ITensor *paddings, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
-
- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
- {
- _has_padding = true;
- _memset_kernel.configure(
- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
- }
- _space_to_batch_kernel.configure(input, block_shape, paddings, output);
-}
-
-void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x,
- const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
- {
- _has_padding = true;
- _memset_kernel.configure(
- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
- }
- _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right,
- output);
-}
-
-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape,
- const ITensorInfo *paddings, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(
- NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
-
- return Status{};
-}
-
-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x,
- const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(
- input, block_shape_x, block_shape_y, padding_left, padding_right, output));
-
- return Status{};
-}
-
-void NESpaceToBatchLayerEx::run()
-{
- // Zero out output only if we have paddings
- if (_has_padding)
- {
- NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
- }
- NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
-}
-} // namespace arm_compute
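At the shape level, the deleted SpaceToBatch wrapper maps an NHWC tensor [n, h, w, c] to [n*bx*by, (h + pad_top + pad_bottom)/by, (w + pad_left + pad_right)/bx, c]; the memset in run() is only needed when padding makes the output larger than the input, which is exactly the total_size() check in configure(). A quick shape sketch under that layout assumption:

```cpp
#include <cstdio>

int main()
{
  const int n = 1, h = 4, w = 4, c = 1;     // NHWC input
  const int bx = 2, by = 2;                 // block sizes
  const int pl = 0, pr = 0, pt = 0, pb = 0; // paddings
  std::printf("[%d, %d, %d, %d]\n",
              n * bx * by, (h + pt + pb) / by, (w + pl + pr) / bx, c); // [4, 2, 2, 1]
}
```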
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
deleted file mode 100644
index 97697e3ea..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
-{
- auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>();
- k->configure(input, output, block_shape);
- _kernel = std::move(k);
-}
-
-Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape));
- return Status{};
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
index df0689273..09f178005 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -1,21 +1,5 @@
/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,14 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/UtilsEx.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -52,20 +33,15 @@ using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
+
NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
: _memory_group(std::move(memory_manager)),
_conv_f(),
_upsample_f(),
_flip_weights(),
- _permute_input(),
- _permute_weights(),
- _permute_output(),
_scaled_output(),
_weights_flipped(),
- _permuted_input(),
- _permuted_weights(),
- _permuted_output(),
- _is_nchw(false),
+ _flip_axis(),
_original_weights(nullptr),
_input(nullptr),
_info(),
@@ -80,7 +56,7 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
- DataType::QASYMM8);
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
const unsigned int width_idx =
@@ -95,13 +71,16 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
weights->dimension(height_idx), info, invalid_right, invalid_bottom);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- if (is_data_type_quantized_asymmetric(input->data_type()) && bias)
+ if (bias != nullptr)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- }
- else if (bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
}
if (output->tensor_shape().total_size() > 0)
@@ -110,12 +89,12 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(),
- "Output's dim 0 is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(),
- "Output's dim 1 is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(),
- "Output's dim 2 is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
+ "Output's depth is invalid.");
}
unsigned int pad_left = 0;
@@ -127,7 +106,6 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
pad_bottom);
TensorInfo scale_out_info(
input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
- scale_out_info.set_data_layout(input->data_layout());
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
const unsigned int batches_idx =
@@ -149,19 +127,13 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
ITensor *output, const PadStrideInfo &info,
unsigned int invalid_right, unsigned int invalid_bottom)
{
+ // Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
+ input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
const DataLayout data_layout = input->info()->data_layout();
-
- _input = input;
- _original_weights = weights;
- _info = info;
- _is_prepared = false;
- _is_nchw = data_layout == DataLayout::NCHW;
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
const unsigned int width_idx =
get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx =
@@ -173,101 +145,54 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
const TensorShape output_shape =
compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ _input = input;
+ _original_weights = weights;
+ _info = info;
+ _is_prepared = false;
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
input->info()->quantization_info());
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
- input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
- info, invalid_right, invalid_bottom));
-
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
_memory_group.manage(&_scaled_output);
- if (!_is_nchw)
- {
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_weights);
- _memory_group.manage(&_permuted_output);
-
- // Configure the function to transform the input tensor from NHWC -> NCHW
- _permuted_input.info()->set_quantization_info(input->info()->quantization_info());
- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
- // Configure the function to transform the weights tensor from NHWC -> NCHW
- _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info());
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
- // order to match output shape
-
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right,
- invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
-
- TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(),
- _permuted_input.info()->quantization_info());
- scale_out_info.set_data_layout(DataLayout::NCHW);
- _scaled_output.allocator()->init(scale_out_info);
-
- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
- DimensionRoundingType::CEIL);
- _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
-
- _weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
- _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info());
- _flip_weights.configure(&_permuted_weights, &_weights_flipped);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- const auto out_shape = output->info()->tensor_shape();
- TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]};
- TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(),
- output->info()->quantization_info());
- _permuted_output.allocator()->init(permuted_out_info);
- _permuted_output.info()->set_data_layout(DataLayout::NCHW);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info);
-
- // Configure the function to transform the convoluted output to NHWC
- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
- else
- {
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
- // order to match output shape
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
- pad_right, pad_top, pad_bottom);
-
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
- input->info()->quantization_info());
- _scaled_output.allocator()->init(scale_out_info);
- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
- DimensionRoundingType::FLOOR);
- _upsample_f.configure(input, &_scaled_output, upsample_info);
-
- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
- }
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+
+ // setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = static_cast<uint32_t>(width_idx);
+ axis_data[1] = static_cast<uint32_t>(height_idx);
+
_scaled_output.allocator()->allocate();
}
@@ -275,22 +200,10 @@ void NETransposeConvLayer::run()
{
prepare();
- // MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Permute input
- if (!_is_nchw)
- {
- _permute_input.run();
- }
+ MemoryGroupResourceScope scope_mg(_memory_group);
_upsample_f.run();
_conv_f.run();
-
- // Permute output
- if (!_is_nchw)
- {
- _permute_output.run();
- }
}
void NETransposeConvLayer::prepare()
@@ -301,22 +214,12 @@ void NETransposeConvLayer::prepare()
// Run weights flipping and mark original weights tensor as unused
_weights_flipped.allocator()->allocate();
- // Permute weights
- if (!_is_nchw)
- {
- _permute_weights.run();
- }
- NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _flip_weights.run();
_original_weights->mark_as_unused();
// Prepare convolution
_conv_f.prepare();
- if (!_weights_flipped.is_used())
- {
- _weights_flipped.allocator()->free();
- }
-
_is_prepared = true;
}
}
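The rewritten configure() drops the NCHW permute path entirely: it flips the weights along the width/height axes (selected via the _flip_axis tensor), zero-upsamples the input by the stride, and runs an ordinary stride-1 convolution, regardless of data layout. A 1-D, single-channel sketch of that decomposition (illustration only, no padding handling):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Transposed convolution as "zero-insertion upsampling + stride-1 convolution
// with flipped weights", mirroring the structure of the rewritten configure().
int main()
{
  const std::vector<float> x{1, 2, 3};
  const std::vector<float> w{1, 1}; // already flipped (symmetric here anyway)
  const int stride = 2;

  std::vector<float> up((x.size() - 1) * stride + 1, 0.f); // insert stride-1 zeros
  for (std::size_t i = 0; i < x.size(); ++i)
    up[i * stride] = x[i];

  std::vector<float> y(up.size() - w.size() + 1, 0.f); // valid, stride-1 convolution
  for (std::size_t i = 0; i < y.size(); ++i)
    for (std::size_t j = 0; j < w.size(); ++j)
      y[i] += up[i + j] * w[j];

  for (float v : y)
    std::printf("%g ", v); // 1 2 2 3
  std::printf("\n");
}
```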
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
index 09f67259c..609dd45a3 100644
--- a/compute/cker/CMakeLists.txt
+++ b/compute/cker/CMakeLists.txt
@@ -8,6 +8,9 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp)
target_link_libraries(nnfw_lib_cker INTERFACE ruy)
target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation)
target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV)
+if(EXPERIMENTAL_RUY_FEATURE)
+ target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE)
+endif(EXPERIMENTAL_RUY_FEATURE)
if(PROFILE_RUY)
target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler)
endif(PROFILE_RUY)
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
index 41b1916cf..1bde64073 100644
--- a/compute/cker/include/cker/Types.h
+++ b/compute/cker/include/cker/Types.h
@@ -259,6 +259,12 @@ struct FullyConnectedParams
// FullyConnectedWeightsFormat weights_format;
};
+struct L2NormParams
+{
+ // uint8 inference params.
+ int32_t input_zero_point;
+};
+
struct GatherParams
{
int32_t axis;
@@ -338,6 +344,11 @@ struct SpaceToBatchParams
int32_t output_offset;
};
+struct SpaceToDepthParams
+{
+ int32_t block_size;
+};
+
enum class Order
{
kColMajor,
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
index b69d55c26..2abb998d0 100644
--- a/compute/cker/include/cker/Utils.h
+++ b/compute/cker/include/cker/Utils.h
@@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input)
return leading_zeros;
}
+inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
+ int32_t *output_inv_sqrt, int *output_shift)
+{
+ assert(input >= 0);
+ if (input <= 1)
+ {
+ // Handle the input value 1 separately to avoid overflow in that case
+ // in the general computation below (b/143972021). Also handle 0 as if it
+ // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
+ // but rare/unrealistic input value. We can expect both to occur in some
+ // incompletely trained models, but probably not in fully trained models.
+ *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
+ *output_shift = 0;
+ return;
+ }
+ assert(input > 1);
+ *output_shift = 11;
+ while (input >= (1 << 29))
+ {
+ input /= 4;
+ ++*output_shift;
+ }
+ const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
+ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+ *output_shift -= left_shift_bit_pairs;
+ input <<= 2 * left_shift_bit_pairs;
+ assert(input >= (1 << 27));
+ assert(input < (1 << 29));
+ using gemmlowp::FixedPoint;
+ using gemmlowp::Rescale;
+ using gemmlowp::SaturatingRoundingMultiplyByPOT;
+ // Using 3 integer bits gives us enough room for the internal arithmetic in
+ // this Newton-Raphson iteration.
+ using F3 = FixedPoint<int32_t, 3>;
+ using F0 = FixedPoint<int32_t, 0>;
+ const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+ const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+ const F3 fixedpoint_half_three =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+ // Newton-Raphson iteration
+ // Naive unoptimized starting guess: x = 1
+ F3 x = F3::One();
+ // Naive unoptimized number of iterations: 5
+ for (int i = 0; i < 5; i++)
+ {
+ const F3 x3 = Rescale<3>(x * x * x);
+ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+ }
+ const F0 fixedpoint_half_sqrt_2 =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+ x = x * fixedpoint_half_sqrt_2;
+ *output_inv_sqrt = x.raw();
+ if (*output_shift < 0)
+ {
+ *output_inv_sqrt <<= -*output_shift;
+ *output_shift = 0;
+ }
+ // Convert right shift (right is positive) to left shift.
+ *output_shift *= reverse_shift;
+}
+
// Comment from tensorflow lite:
//
// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
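The new GetInvSqrtQuantizedMultiplierExp normalises its input into [2^27, 2^29) and then runs five Newton-Raphson steps for the inverse square root; the iteration realised by the fixed-point expression `1.5*x - (a/2)*x^3` is:

```latex
x_{n+1} = x_n\left(\tfrac{3}{2} - \tfrac{a}{2}\,x_n^{2}\right),
\qquad x_0 = 1,
\qquad x_n \to \frac{1}{\sqrt{a}}
```

Here a stands for the rescaled input fed into the loop; roughly speaking, the trailing multiply by sqrt(2)/2 compensates for halving the input (`input >> 1`) before the iteration, and the accumulated shifts are returned through *output_shift.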
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
index 9bcf3fd82..9b72811c0 100644
--- a/compute/cker/include/cker/operation/FullyConnected.h
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -78,8 +78,11 @@ inline void FullyConnected(const FullyConnectedParams &params, const Shape &inpu
MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
output_data, /*result_stride=*/1);
- // Apply activation function
- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
}
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
@@ -195,7 +198,11 @@ inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape
#endif
// Apply activation function to floats.
- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
return;
}
diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h
new file mode 100644
index 000000000..a0075c3d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/L2Normalize.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_L2NORMALIZE_H__
+#define __NNFW_CKER_L2NORMALIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+void L2NormalizeFloat32(const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ float epsilon = 1e-6;
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ for (int i = 0; i < outer_size; ++i)
+ {
+ float squared_l2_norm = 0;
+ for (int c = 0; c < depth; ++c)
+ {
+ const float val = input_data[c];
+ squared_l2_norm += val * val;
+ }
+ float l2_norm = std::sqrt(squared_l2_norm);
+ l2_norm = std::max(l2_norm, epsilon);
+ for (int c = 0; c < depth; ++c)
+ {
+ *output_data = *input_data / l2_norm;
+ ++output_data;
+ ++input_data;
+ }
+ }
+}
+
+void L2NormalizeQuant8(L2NormParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int32_t input_zero_point = params.input_zero_point;
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ int32_t square_l2_norm = 0;
+ for (int c = 0; c < depth; c++)
+ {
+ // Note that input_data advances by depth in the second pass below.
+ int32_t diff = input_data[c] - input_zero_point;
+ square_l2_norm += diff * diff;
+ }
+ int32_t inv_l2norm_multiplier;
+ int inv_l2norm_shift;
+ GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift);
+ for (int c = 0; c < depth; c++)
+ {
+ int32_t diff = *input_data - input_zero_point;
+ int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+ int32_t unclamped_output_val = 128 + rescaled_diff;
+ int32_t output_val = std::min(static_cast<int32_t>(255),
+ std::max(static_cast<int32_t>(0), unclamped_output_val));
+ *output_data = static_cast<uint8_t>(output_val);
+ ++input_data;
+ ++output_data;
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_L2NORMALIZE_H__
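Per inner slice, the two new kernels compute the following (epsilon = 1e-6 in the float path, z = params.input_zero_point in the uint8 path; the quantized formula is the intended result, realised in the code through GetInvSqrtQuantizedMultiplierExp and a fixed-point multiply rather than a literal division):

```latex
\text{float: } y_c = \frac{x_c}{\max\!\left(\sqrt{\sum_k x_k^2},\ \varepsilon\right)},
\qquad
\text{uint8: } y_c = \operatorname{clamp}\!\left(128 + \frac{128\,(q_c - z)}{\sqrt{\sum_k (q_k - z)^2}},\ 0,\ 255\right)
```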
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
index 7477858fc..3d3e59e55 100644
--- a/compute/cker/include/cker/operation/Logistic.h
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -32,18 +32,9 @@ namespace cker
inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
float *output_data)
{
-#ifdef __aarch64__
auto input_map = MapAsVector(input_data, input_shape);
auto output_map = MapAsVector(output_data, output_shape);
output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
-#else
- // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2)
- const int size = MatchingFlatSize(input_shape, output_shape);
- for (int i = 0; i < size; i++)
- {
- output_data[i] = 1.f / (1.f + std::exp(-input_data[i]));
- }
-#endif
}
} // namespace cker
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
index af432f3a8..4a2732d82 100644
--- a/compute/cker/include/cker/operation/Pad.h
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -26,9 +26,10 @@ namespace nnfw
{
namespace cker
{
+template <typename T>
inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
- const float *input_data, const Shape &output_shape, float *output_data,
- const float *constant_value_data)
+ const T *input_data, const Shape &output_shape, T *output_data,
+ const T *constant_value_data)
{
// Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
// TODO: come up with more subtle solution that uses subtensors like arm compute
@@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
/** List of padding information */
using PaddingList = std::vector<PaddingInfo>;
- auto constant_value = constant_value_data ? *constant_value_data : 0;
+ const T constant_value = constant_value_data ? *constant_value_data : 0;
assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
PaddingList padding_list(pad_rank);
@@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
{
const int32_t in_row_len = input_shape.Dims(0);
std::fill_n(output_data, padding_list[0].first, constant_value);
- std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float));
+ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T));
std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
constant_value);
break;
@@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
out_offset += padding_list[1].first;
// copy a row of input data
- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
out_offset += in_row_len;
@@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
out_offset += padding_list[2].first;
// copy a row of input data
- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
out_offset += in_row_len;
@@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
out_c_offset += padding_list[3].first;
// copy a row of input data
- memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float));
+ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T));
out_c_offset += in_row_len;
diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h
new file mode 100644
index 000000000..5c82d111f
--- /dev/null
+++ b/compute/cker/include/cker/operation/Quantize.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_QUANTIZE_H__
+#define __NNFW_CKER_QUANTIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <stdexcept>
+#include <iostream>
+namespace nnfw
+{
+namespace cker
+{
+template <typename InputT, typename OutputT>
+inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape,
+ OutputT *output_data, const float output_scale, const int32_t output_offset)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ int min_val = std::numeric_limits<OutputT>::min();
+ int max_val = std::numeric_limits<OutputT>::max();
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ int32_t unclamped = static_cast<int32_t>(round(input_data[i] / output_scale)) + output_offset;
+ int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+ output_data[i] = clamped;
+ }
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_QUANTIZE_H__
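The new Quantize kernel is the usual affine quantization: scale, round to the nearest representable value, shift by the zero point, and saturate to the output type's range.

```latex
q_i = \operatorname{clamp}\!\left(\operatorname{round}\!\left(\frac{x_i}{s}\right) + z,\ q_{\min},\ q_{\max}\right)
```

For example, with s = 0.5 and z = 10, an input of 3.2 maps to round(6.4) + 10 = 16, which lies inside [0, 255] for a uint8 output.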
diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h
new file mode 100644
index 000000000..ef679315e
--- /dev/null
+++ b/compute/cker/include/cker/operation/SpaceToDepth.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__
+#define __NNFW_CKER_SPACE_TO_DEPTH_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void SpaceToDepth(const SpaceToDepthParams &params, const Shape &unextended_input_shape,
+ const T *input_data, const Shape &unextended_output_shape, T *output_data)
+{
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ const int output_depth = output_shape.Dims(3);
+ const int output_width = output_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+
+ const int input_depth = input_shape.Dims(3);
+ const int batch_size = input_shape.Dims(0);
+
+ // Number of contiguous values that we can copy in one iteration.
+ const int stride = params.block_size * input_depth;
+
+ for (int batch = 0; batch < batch_size; ++batch)
+ {
+ for (int out_h = 0; out_h < output_height; ++out_h)
+ {
+ T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
+ for (int offset_h = 0; offset_h < params.block_size; ++offset_h)
+ {
+ T *dst = output_ptr;
+ for (int out_w = 0; out_w < output_width; ++out_w)
+ {
+ memcpy(dst, input_data, stride * sizeof(T));
+ input_data += stride;
+ dst += output_depth;
+ }
+ output_ptr += stride;
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__
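At the shape level the new kernel maps an NHWC tensor [n, h, w, c] to [n, h/b, w/b, c*b*b] for block size b, folding each b-by-b spatial block into the channel dimension; the implementation above does the data movement with row-wise memcpy. A quick shape sketch under that layout assumption:

```cpp
#include <cstdio>

int main()
{
  const int n = 1, h = 4, w = 4, c = 1; // NHWC input
  const int b = 2;                      // params.block_size
  std::printf("[%d, %d, %d, %d]\n", n, h / b, w / b, c * b * b); // [1, 2, 2, 4]
}
```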
diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h
index 432b181bd..080f66f26 100644
--- a/compute/cker/include/cker/ruy/RuySupport.h
+++ b/compute/cker/include/cker/ruy/RuySupport.h
@@ -24,7 +24,7 @@
namespace
{
-const int kDefaultNumThreadpoolThreads = 4;
+const int kDefaultNumThreadpoolThreads = 1;
}
namespace nnfw
diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md
index 2bfd14c63..657f0f704 100644
--- a/docs/howto/how-to-build-runtime.md
+++ b/docs/howto/how-to-build-runtime.md
@@ -13,7 +13,7 @@ In the Ubuntu, you can easily install it with the following command.
```
$ sudo apt-get install cmake libboost-all-dev
-```
+```
If your linux system does not have the basic development configuration, you will need to install more packages. A list of all packages needed to configure the development environment can be found in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file.
@@ -44,7 +44,7 @@ python3-venv \
scons \
software-properties-common \
unzip \
-wget
+wget
$ mkdir /tmp/gtest
$ cd /tmp/gtest
@@ -63,7 +63,7 @@ In a typical linux development environment, including Ubuntu, you can build the
```
$ git clone https://github.com/Samsung/ONE.git one
$ cd one
-$ cp -n Makefile.template Makefile; make install
+$ make -f Makefile.template install
```
Unfortunately, the debug build on the x86_64 architecture currently has an error. To solve the problem, you must use gcc version 9 or higher. Another workaround is to do a release build rather than a debug build. This is not a suitable method for debugging during development, but it is enough to check the function of the runtime. To release build the runtime, add the environment variable `BUILD_TYPE=release` to the build command as follows.
diff --git a/docs/nnfw/howto/CrossBuildForAndroid.md b/docs/nnfw/howto/CrossBuildForAndroid.md
index d7e48c89a..08d5fd680 100644
--- a/docs/nnfw/howto/CrossBuildForAndroid.md
+++ b/docs/nnfw/howto/CrossBuildForAndroid.md
@@ -44,11 +44,9 @@ Different from cross build for linux,
Here is an example of using Makefile.
```bash
-cp -n Makefile.template Makefile
-
TARGET_OS=android \
CROSS_BUILD=1 \
NDK_DIR=/path/android-tools/r20/ndk \
EXT_ACL_FOLDER=/path/arm_compute-v19.11.1-bin-android/lib/android-arm64-v8a-neon-cl \
-make install
+make -f Makefile.template install
```
diff --git a/docs/runtime/core.md b/docs/runtime/core.md
index 42ba75f02..64a6c620c 100644
--- a/docs/runtime/core.md
+++ b/docs/runtime/core.md
@@ -68,7 +68,7 @@ Let's say we have some functions written in a certain programming language. Then
With generated tensors and kernels, the compiler creates executor objects. Three types of executors are supported - Linear, Dataflow, and Parallel. The Linear executor is the default, and the Dataflow and Parallel executors are experimental.
-For more about executors, please refer to [Executors](./executors.md) document.
+For more about executors, please refer to [Executors](executors.md) document.
### Module `exec`
@@ -83,4 +83,4 @@ For more about executors, please refer to [Executors](./executors.md) document.
Backends are plugins and they are loaded dynamically(via `dlopen`). So this module is a set of interface classes for backend implementation. `compiler` can compile with a variety of backends without knowing specific backend implementation.
-Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](./backend-api.md) document.
+Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](backend-api.md) document.
diff --git a/docs/runtime/heterogeneous-execution.md b/docs/runtime/heterogeneous-execution.md
index dc39dae59..e7a5e2734 100644
--- a/docs/runtime/heterogeneous-execution.md
+++ b/docs/runtime/heterogeneous-execution.md
@@ -12,11 +12,11 @@ Here is another case. Let's say we have a model that is not sequential so there
![Add-3Conv model](heterogeneous-execution-add-3-conv-model.png)
-Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](./executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently.
+Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently.
## Graph Transformation
-Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](./core.md#1-lowering) phase of compilation.
+Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](core.md#1-lowering) phase of compilation.
Here is an example of that. Let's say we have assigned different backends for Add and Conv2D. So a Permute operation is inserted between them.
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake
index 51a235a35..adec1f91b 100644
--- a/infra/cmake/packages/ARMComputeSourceConfig.cmake
+++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake
@@ -8,7 +8,7 @@ function(_ARMComputeSource_import)
nnas_include(OptionTools)
envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
- set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v19.11.1.tar.gz)
+ set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz)
ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL})
set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE)
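Assuming `envoption` falls back to the default server only when the variable is absent from the environment (an assumption; the helper itself is not shown here), the ComputeLibrary download can be redirected to a mirror without touching this file. A sketch, with a placeholder mirror host:

```bash
# Use an internal mirror instead of github.com when the ComputeLibrary v20.05 source is fetched
# (mirror host is a placeholder; envoption() is assumed to read the environment)
export EXTERNAL_DOWNLOAD_SERVER=https://git.mirror.example.com
make -f Makefile.template install
```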
diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake
index ab0b7708f..da084e7d3 100644
--- a/infra/cmake/packages/FlatBuffersConfig.cmake
+++ b/infra/cmake/packages/FlatBuffersConfig.cmake
@@ -25,7 +25,8 @@ function(_FlatBuffers_build)
BUILD_DIR ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build
INSTALL_DIR ${EXT_OVERLAY_DIR}
BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS}
- IDENTIFIER "1.10-fix1"
+ IDENTIFIER "1.10-fix2"
+ EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF"
PKG_NAME "FLATBUFFERS")
endfunction(_FlatBuffers_build)
diff --git a/infra/cmake/packages/HDF5Config.cmake b/infra/cmake/packages/HDF5Config.cmake
index e282e0bc9..19803f1ea 100644
--- a/infra/cmake/packages/HDF5Config.cmake
+++ b/infra/cmake/packages/HDF5Config.cmake
@@ -27,6 +27,7 @@ _HDF5_build()
find_path(HDF5_CONFIG_DIR "hdf5-config.cmake"
PATHS ${EXT_OVERLAY_DIR}
PATH_SUFFIXES
+ cmake
share/cmake
share/cmake/hdf5
cmake/hdf5
diff --git a/infra/cmake/packages/Pybind11Config.cmake b/infra/cmake/packages/Pybind11Config.cmake
new file mode 100644
index 000000000..306177903
--- /dev/null
+++ b/infra/cmake/packages/Pybind11Config.cmake
@@ -0,0 +1,21 @@
+function(_Pybind11_import)
+ nnas_find_package(Pybind11Source QUIET)
+
+ if(NOT Pybind11Source_FOUND)
+ set(Pybind11_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT Pybind11Source_FOUND)
+
+ nnas_include(ExternalBuildTools)
+ ExternalBuild_CMake(CMAKE_DIR ${Pybind11Source_DIR}
+ BUILD_DIR ${CMAKE_BINARY_DIR}/externals/PYBIND11/build
+ INSTALL_DIR ${EXT_OVERLAY_DIR}
+ IDENTIFIER "2.3.0"
+ PKG_NAME "PYBIND11")
+
+ find_path(Pybind11_INCLUDE_DIRS NAMES pybind11.h PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES include/pybind11)
+
+ set(Pybind11_FOUND TRUE PARENT_SCOPE)
+endfunction(_Pybind11_import)
+
+_Pybind11_import()
diff --git a/infra/cmake/packages/Pybind11SourceConfig.cmake b/infra/cmake/packages/Pybind11SourceConfig.cmake
new file mode 100644
index 000000000..4a9c676af
--- /dev/null
+++ b/infra/cmake/packages/Pybind11SourceConfig.cmake
@@ -0,0 +1,18 @@
+function(_Pybind11Source_import)
+ if(NOT DOWNLOAD_PYBIND11)
+ set(Pybind11Source_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT DOWNLOAD_PYBIND11)
+
+ nnas_include(ExternalSourceTools)
+ nnas_include(OptionTools)
+
+ envoption(PYBIND11_URL https://github.com/pybind/pybind11/archive/v2.3.0.tar.gz)
+
+ ExternalSource_Download(PYBIND11 ${PYBIND11_URL})
+
+ set(Pybind11Source_DIR ${PYBIND11_SOURCE_DIR} PARENT_SCOPE)
+ set(Pybind11Source_FOUND TRUE PARENT_SCOPE)
+endfunction(_Pybind11Source_import)
+
+_Pybind11Source_import()
diff --git a/infra/docker/Dockerfile b/infra/docker/Dockerfile
index e675b53ad..052cc4fb6 100644
--- a/infra/docker/Dockerfile
+++ b/infra/docker/Dockerfile
@@ -1,8 +1,6 @@
FROM ubuntu:16.04
ARG UBUNTU_MIRROR
-ENV http_proxy $http_proxy
-ENV https_proxy $https_proxy
RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi
RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi
@@ -22,6 +20,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler
# Additional tools
RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint
+RUN pip3 install --upgrade pip
RUN pip3 install yapf==0.22.0 numpy
# Install google test (source)
diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804
index fc6fc9a1a..cc31bba1f 100644
--- a/infra/docker/Dockerfile.1804
+++ b/infra/docker/Dockerfile.1804
@@ -1,12 +1,6 @@
FROM ubuntu:18.04
ARG UBUNTU_MIRROR
-ENV http_proxy $http_proxy
-ENV https_proxy $https_proxy
-
-RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi
-RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi
-RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi
# Install 'add-apt-repository'
RUN apt-get update && apt-get -qqy install software-properties-common
@@ -22,6 +16,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler
# Additional tools
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint
+RUN pip3 install --upgrade pip
RUN pip3 install yapf==0.22.0 numpy
# Install google test (source)
diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt
index 3ac6680de..0be6885e2 100644
--- a/infra/nncc/CMakeLists.txt
+++ b/infra/nncc/CMakeLists.txt
@@ -98,6 +98,7 @@ option(DOWNLOAD_CAFFE "Download Caffe source" ON)
option(DOWNLOAD_PYTORCH "Download Pytorch source" ON)
option(DOWNLOAD_ONNX "Download ONNX source" ON)
option(DOWNLOAD_ABSEIL "Download Abseil-cpp source" ON)
+option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON)
option(DOWNLOAD_GTEST "Download Google Test source" ON)
option(BUILD_GTEST "Build Google Test from the downloaded source" ON)
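Like the other `DOWNLOAD_*` switches above, the new option can be toggled when the nncc tree is configured; a sketch (the checkout path and the rest of the command line are illustrative):

```bash
# Configure nncc without downloading the pybind11 source
cmake -DDOWNLOAD_PYBIND11=OFF <path-to-ONE>/infra/nncc
```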
diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount
index d4610e3f0..d06c5c9de 100644
--- a/infra/nncc/command/utcount
+++ b/infra/nncc/command/utcount
@@ -13,7 +13,7 @@ BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \
oops pepper-assert \
hermes hermes-std \
loco locop locomotiv logo-core logo \
-foder souschef arser \
+foder souschef arser vconone \
safemain mio-circle mio-tflite \
tflite2circle \
luci \
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt
index 8e7f78eb1..2442a2d7c 100644
--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt
@@ -100,7 +100,7 @@ target_include_directories(tensorflow-lite-2.2.0 SYSTEM PUBLIC ${TFLITE_INCLUDES
target_compile_definitions(tensorflow-lite-2.2.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV")
set_property(TARGET tensorflow-lite-2.2.0 PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(tensorflow-lite-2.2.0 eigen ${LIB_PTHREAD} dl)
-if(${BUILD_WITH_NNAPI})
+if(NOT ANDROID AND ${BUILD_WITH_NNAPI})
target_link_libraries(tensorflow-lite-2.2.0 rt)
endif()
diff --git a/infra/nnfw/config/gbs.conf b/infra/nnfw/config/gbs.conf
index 515cadaba..bad9eb204 100644
--- a/infra/nnfw/config/gbs.conf
+++ b/infra/nnfw/config/gbs.conf
@@ -5,7 +5,7 @@ profile = profile.tizen
[profile.tizen]
user=obs_viewer
obs = obs.tizen
-repos = repo.tizen_base,repo.tizen_mobile
+repos = repo.tizen_one,repo.tizen_base,repo.tizen_mobile
buildroot = /home/GBS-ROOT/
[obs.tizen]
@@ -15,6 +15,8 @@ url = http://api.tizen.org
url = http://download.tizen.org/snapshots/tizen/unified/latest/repos/standard/packages/
[repo.tizen_base]
-url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/
+url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/
+[repo.tizen_one]
+url = http://nnfw.mooo.com/archive/tizen/
diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630
index e1599357a..c3ca4b6d0 100644
--- a/infra/packaging/preset/20200630
+++ b/infra/packaging/preset/20200630
@@ -14,6 +14,7 @@ function preset_configure()
REQUIRED_UNITS+=("souschef")
REQUIRED_UNITS+=("safemain")
REQUIRED_UNITS+=("arser")
+ REQUIRED_UNITS+=("vconone")
# Hermes Logging Framework
REQUIRED_UNITS+=("hermes" "hermes-std")
# loco IR and related utilities
@@ -28,11 +29,14 @@ function preset_configure()
REQUIRED_UNITS+=("record-minmax" "circle-quantizer")
REQUIRED_UNITS+=("one-cmds")
+ NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)}
+
# TODO Use "nncc configure" and "nncc build"
cmake \
-DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
-DCMAKE_BUILD_TYPE=release \
-DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
+ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
${EXTRA_OPTIONS[@]} \
"${NNAS_PROJECT_PATH}/infra/nncc"
}
@@ -44,14 +48,4 @@ function preset_install()
# Install tf2nnpkg
install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
-
- # Create python virtual enviornment
- python3 -m venv "${NNAS_INSTALL_PREFIX}/bin/venv"
-
- # Install tensorflow
- source "${NNAS_INSTALL_PREFIX}/bin/venv/bin/activate"
- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
- install -U pip setuptools
- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
- install tensorflow-cpu==2.3.0rc0
}
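The hunk above derives `NPROC` from `/proc/cpuinfo` when it is not already set and hands half of it to `EXTERNALS_BUILD_THREADS`. On a shared build machine the parallelism can therefore be capped by exporting `NPROC` before the preset runs; a sketch, assuming the preset is reached through the usual packaging entry point:

```bash
# Cap external builds at 2 threads (NPROC/2) regardless of the host's core count
# (the install prefix path is illustrative)
NPROC=4 ./nnas create-package --prefix "${PWD}/install"
```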
diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630
index 9101f8273..7846fd388 100644
--- a/infra/packaging/res/tf2nnpkg.20200630
+++ b/infra/packaging/res/tf2nnpkg.20200630
@@ -14,10 +14,16 @@ command_exists() {
usage()
{
echo "Convert TensorFlow model to nnpackage."
- echo "Usage: tf2nnpkg --info <path/to/info> --graphdef <path/to/pb> [OPTION] -o <path/to/nnpkg/directory>"
- exit 0
+ echo "Usage: tf2nnpkg"
+ echo " --info <path/to/info>"
+ echo " --graphdef <path/to/pb>"
+ echo " -o <path/to/nnpkg/directory>"
+ echo " --v2 (optional) Use TF 2.x interface"
+ exit 255
}
+TF_INTERFACE="--v1"
+
# Parse command-line arguments
#
while [ "$#" -ne 0 ]; do
@@ -39,6 +45,10 @@ while [ "$#" -ne 0 ]; do
export OUTPUT_DIR="$2"
shift 2
;;
+ '--v2')
+ TF_INTERFACE="--v2"
+ shift
+ ;;
*)
echo "${CUR}"
shift
@@ -83,10 +93,7 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' '
INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':')
# generate tflite file
-python "${ROOT}/bin/tf2tfliteV2.py" --v2 --input_path ${GRAPHDEF_FILE} \
---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
---input_arrays ${INPUT} --output_arrays ${OUTPUT} || \
-python "${ROOT}/bin/tf2tfliteV2.py" --v1 --input_path ${GRAPHDEF_FILE} \
+python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \
--output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
--input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \
--output_arrays ${OUTPUT}
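With the reworked usage message and the new flag, an invocation that opts into the TF 2.x converter path looks roughly like this (file and directory names are placeholders):

```bash
tf2nnpkg --info model.info --graphdef frozen_graph.pb --v2 -o ./nnpkg-out
```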
diff --git a/infra/scripts/build-tcm.sh b/infra/scripts/build-tcm.sh
new file mode 100644
index 000000000..22fb33558
--- /dev/null
+++ b/infra/scripts/build-tcm.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# STEP 1
+# Download latest TCM tool from
+# https://github.sec.samsung.net/RS-TCM/tca-standalone/releases/download/v0.0.8/tca-standalone-0.0.8.jar
+#
+# STEP 2
+# Create symbolic link `./src` for source directory to be analyzed which has `.ahub` configuration.
+#
+# STEP 3
+# run this `build-tcm.sh` script.
+#
+# See the following link for additional details.
+# https://github.sec.samsung.net/RS-TCM/tca-standalone/wiki/Tutorials-CPP-Gtest
+#
+
+echo ${PROJECT_DIR:=${PWD}}
+
+java -jar $PROJECT_DIR/tca-standalone-0.0.8.jar \
+ --outdir=$PROJECT_DIR/tcm-output \
+ --config=$PROJECT_DIR/.ahub/tcchecker-tca/config.yaml \
+ --local=$PROJECT_DIR/src \
+ --logfile=$PROJECT_DIR/tcm-output/tcm.log \
+ --debug
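Following the three steps in the header comment, a typical run of the new script might look like this (all paths are illustrative):

```bash
# tca-standalone-0.0.8.jar has already been downloaded into the project directory (STEP 1)
ln -s /path/to/source/tree ./src                    # source to analyze, carrying its .ahub config (STEP 2)
PROJECT_DIR=$PWD bash infra/scripts/build-tcm.sh    # STEP 3; PROJECT_DIR defaults to $PWD anyway
```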
diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh
index d436e8a1f..a0323e0a0 100644
--- a/infra/scripts/compiler_modules.sh
+++ b/infra/scripts/compiler_modules.sh
@@ -7,7 +7,7 @@ DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex"
DEBUG_BUILD_ITEMS+=";oops;pepper-assert"
DEBUG_BUILD_ITEMS+=";hermes;hermes-std"
DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo"
-DEBUG_BUILD_ITEMS+=";foder;souschef;arser"
+DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone"
DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite"
DEBUG_BUILD_ITEMS+=";tflite2circle"
DEBUG_BUILD_ITEMS+=";luci"
diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh
index 7da673601..011d14c18 100755
--- a/infra/scripts/docker_build_cross_aarch64_runtime.sh
+++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
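The updated guard treats an unset `ROOTFS_DIR` the same as a missing directory, so these scripts fall back to the default rootfs unless a valid path is exported; for example (the path is illustrative):

```bash
# Mount a prebuilt aarch64 rootfs into the build container; omit ROOTFS_DIR to use the default
ROOTFS_DIR=/opt/rootfs/aarch64 ./infra/scripts/docker_build_cross_aarch64_runtime.sh
```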
diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh
index f1f666aa3..551fb5700 100755
--- a/infra/scripts/docker_build_cross_arm_runtime.sh
+++ b/infra/scripts/docker_build_cross_arm_runtime.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh
index ea66f1774..876f318f4 100755
--- a/infra/scripts/docker_build_cross_arm_runtime_release.sh
+++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh
index 08244e5d8..f42251baa 100755
--- a/infra/scripts/docker_build_cross_coverage.sh
+++ b/infra/scripts/docker_build_cross_coverage.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh
index 418b50dfe..5b125318e 100755
--- a/infra/scripts/docker_build_nncc.sh
+++ b/infra/scripts/docker_build_nncc.sh
@@ -54,6 +54,16 @@ pushd $ROOT_PATH > /dev/null
mkdir -p ${NNCC_INSTALL_PREFIX}
./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}"
+# create python virtual environment
+./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv"
+
+./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
+ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
+ install -U pip setuptools
+./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
+ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
+ install tensorflow-cpu==2.3.0rc0
+
mkdir -p ${ARCHIVE_PATH}
tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./
diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh
index 18809ad07..ee0f183f1 100755
--- a/infra/scripts/docker_build_tizen_cross.sh
+++ b/infra/scripts/docker_build_tizen_cross.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh
index 556c5bd74..55adaa15d 100755
--- a/infra/scripts/docker_collect_nnpkg_resources.sh
+++ b/infra/scripts/docker_collect_nnpkg_resources.sh
@@ -60,7 +60,7 @@ pushd $ROOT_PATH > /dev/null
REQUIRED_UNITS=()
# Common Libraries
REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex")
-REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "oops")
+REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone")
# Hermes Logging Framework
REQUIRED_UNITS+=("hermes" "hermes-std")
# loco IR and related utilities
diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh
index 5521b5fdc..640a0e0a3 100755
--- a/infra/scripts/tizen_xu4_test.sh
+++ b/infra/scripts/tizen_xu4_test.sh
@@ -23,7 +23,7 @@ function install_model()
{
# download tflite model files
pushd $HOST_HOME
- tests/scripts/framework/run_test.sh --download=on
+ tests/scripts/framework/run_test.sh --download=on --run=off
# TODO Since this command removes model file(.zip),
# We must always download the file unlike model file(.tflite).
# Because caching applies only to tflite file.
diff --git a/master_diff_1.7.0.patch b/master_diff_1.7.0.patch
new file mode 100644
index 000000000..feae39863
--- /dev/null
+++ b/master_diff_1.7.0.patch
@@ -0,0 +1,30424 @@
+diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml
+new file mode 100644
+index 0000000..cd34d79
+--- /dev/null
++++ b/.ahub/tcchecker-tca/config.yaml
+@@ -0,0 +1,43 @@
++version: 2
++test:
++ - name: NN Runtime
++ testCaseLanguage: CPP
++ testFW: GTEST
++ testCaseFolder:
++ - ./compute/test/cker
++ - ./runtime/onert/core/src/backend/cpu_common
++ - ./runtime/onert/frontend/nnapi
++ - ./runtime/onert/test/core/compiler
++ - ./runtime/onert/test/core/exec
++ - ./runtime/onert/test/core/interp
++ - ./runtime/onert/test/graph
++ - ./runtime/onert/test/graph/operand
++ - ./runtime/onert/test/graph/operation
++ - ./runtime/onert/test/graph/verifier
++ - ./runtime/onert/test/ir
++ - ./runtime/onert/test/util
++ - ./tests/nnapi/src
++ - ./tests/nnfw_api/src
++ - ./tests/tools/tflite_run/src
++
++ testFile:
++ - extension: cpp
++ any: true
++ - extension: cc
++ any: true
++
++ testCase:
++ - condition:
++ - functionName:
++ starts:
++ - TEST
++
++ negativeTestCase:
++ - condition:
++ - testName:
++ starts:
++ - neg_
++
++ positiveTestCase:
++ - condition:
++ - inverse: negativeTestCase
+diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml
+new file mode 100644
+index 0000000..ef681de
+--- /dev/null
++++ b/compiler/.ahub/tcchecker-tca/config.yaml
+@@ -0,0 +1,54 @@
++version: 2
++test:
++ - name: NN Compiler
++ testCaseLanguage: CPP
++ testFW: GTEST
++ testCaseFolder:
++ - ./angkor
++ - ./arser
++ - ./circle2circle
++ - ./circle-quantizer
++ - ./cwrap
++ - ./foder
++ - ./hermes
++ - ./hermes-std
++ - ./loco
++ - ./locomotiv
++ - ./locop
++ - ./logo
++ - ./logo-core
++ - ./luci
++ - ./luci-interpreter
++ - ./luci-value-test
++ - ./mio-circle
++ - ./mio-tflite
++ - ./oops
++ - ./pepper-assert
++ - ./pepper-str
++ - ./pepper-strcast
++ - ./pp
++ - ./record-minmax
++ - ./safemain
++ - ./souschef
++ - ./stdex
++ - ./tflite2circle
++
++ testFile:
++ - extension: .test.cpp
++ any: true
++
++ testCase:
++ - condition:
++ - functionName:
++ starts:
++ - TEST
++
++ negativeTestCase:
++ - condition:
++ - testName:
++ ends:
++ - _NEG
++
++ positiveTestCase:
++ - condition:
++ - inverse: negativeTestCase
+diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt
+new file mode 100644
+index 0000000..ae231bd
+--- /dev/null
++++ b/compiler/bcq-tools/CMakeLists.txt
+@@ -0,0 +1,27 @@
++set(BCQ_TOOLS_FILES
++ generate_bcq_output_arrays
++ preserve_bcq_info
++)
++
++foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES})
++
++ set(BCQ_TOOLS_FILE ${BCQ_TOOLS})
++ set(BCQ_TOOLS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${BCQ_TOOLS_FILE}")
++ set(BCQ_TOOLS_BIN "${CMAKE_CURRENT_BINARY_DIR}/${BCQ_TOOLS_FILE}")
++ set(BCQ_TOOLS_TARGET "${BCQ_TOOLS}_target")
++
++ add_custom_command(OUTPUT ${BCQ_TOOLS_BIN}
++ COMMAND ${CMAKE_COMMAND} -E copy "${BCQ_TOOLS_SRC}" "${BCQ_TOOLS_BIN}"
++ DEPENDS ${BCQ_TOOLS_SRC}
++ COMMENT "Generate ${BCQ_TOOLS_BIN}"
++ )
++
++ add_custom_target(${BCQ_TOOLS_TARGET} ALL DEPENDS ${BCQ_TOOLS_BIN})
++
++ install(FILES ${BCQ_TOOLS_BIN}
++ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
++ GROUP_READ GROUP_WRITE GROUP_EXECUTE
++ WORLD_READ WORLD_EXECUTE
++ DESTINATION bin)
++
++endforeach(BCQ_TOOLS)
+diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md
+new file mode 100644
+index 0000000..18b0f48
+--- /dev/null
++++ b/compiler/bcq-tools/README.md
+@@ -0,0 +1,78 @@
++# BCQ Tools
++
++This directory includes some tools related with BCQ.
++
++## preserve_bcq_info
++
++### Purpose
++
++`preserve_bcq_info` is for preserving constant nodes which include BCQ information.
++When a `.pb` file is converted to a `.tflite` file by the TFLite converter, constant nodes whose values are exactly the same are removed and their uses are linked to a single representative node.
++This makes it impossible to know which constant node should be linked to a node to which we want to apply BCQ.
++One solution is to make all of the identical constant nodes different by inserting unique values, and then to ignore the newly generated values when BCQ fusing is applied.
++`preserve_bcq_info` generates and inserts unique dummy values into constant nodes that have the same values, so that they are not removed by the TensorFlow Lite converter.
++As a result, BCQ information will be preserved.
++
++### How to use
++
++```bash
++preserve_bcq_info \
++--input_path /path/to/original_model.pb \
++--output_path /path/to/preserved_model.pb
++```
++
++### How it works
++
++If we add a unique dummy value at the end of each constant node, all the constant nodes become different. The following is an example.
++
++```
++[Original Constant Nodes]
++const(value=[1, 2, 3], name='const1')
++const(value=[1, 2, 3], name='const2')
++const(value=[1, 2, 3], name='const3')
++
++[After BCQ information preserved]
++const(value=[1, 2, 3, -1], name='const1')
++const(value=[1, 2, 3, -2], name='const2')
++const(value=[1, 2, 3, -3], name='const3')
++```
++
++For dummy values, negative values are used instead of positive values.
++This is because positive values could be confused with the original constant node values.
++For your information, the unique dummy values start at -1 and continue with -2, -3, ..., -N, where N is the number of preserved constant nodes.
++
++### Caution
++
++- Newly generated dummy values should be ignored when the constant nodes are used.
++
++## generate_bcq_output_arrays
++
++### Purpose
++
++To apply BCQ, BCQ information nodes should be designated as model outputs so that they remain alive even after TFLite conversion is finished.
++However, there can be many nodes to designate, and sometimes we cannot copy and paste all of them because the resulting string is too long.
++`generate_bcq_output_arrays` is for generating output_arrays, which include BCQ information nodes.
++
++### How to use
++
++```bash
++generate_bcq_output_arrays \
++--input_path /path/to/original_model.pb \
++--output_path /path/to/output_arrays.txt
++```
++
++### How it works
++
++```
++[Original BCQ information nodes]
++const(value=[1, 2, 3, -1], name='const1')
++const(value=[1, 2, 3, -2], name='const2')
++const(value=[1, 2, 3, -3], name='const3')
++
++[Generated output_arrays]
++,const1,const2,const3
++```
++
++### Caution
++
++- The generated output_arrays string starts with a comma.
+diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays
+new file mode 100644
+index 0000000..48e8a93
+--- /dev/null
++++ b/compiler/bcq-tools/generate_bcq_output_arrays
+@@ -0,0 +1,90 @@
++#!/usr/bin/env python3
++
++import tensorflow as tf
++
++import argparse
++import sys
++
++
++def _get_parser():
++ """
++ Returns an ArgumentParser for generating output_arrays.
++ """
++ parser = argparse.ArgumentParser(
++ description=("Command line tool to generate output_arrays of BCQ nodes"))
++
++ # Input and output path.
++ parser.add_argument(
++ "-i",
++ "--input_path",
++ type=str,
++ help="Full filepath of the input file.",
++ required=True)
++ parser.add_argument(
++ "-o",
++ "--output_path",
++ type=str,
++ help="Full filepath of the output file.",
++ required=True)
++
++ return parser
++
++
++def load_graph(frozen_graph_filename):
++ """
++ Load graph from frozen pb file
++ """
++ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
++ graph_def = tf.compat.v1.GraphDef()
++ graph_def.ParseFromString(f.read())
++ with tf.Graph().as_default() as graph:
++ tf.import_graph_def(graph_def, name='')
++ return graph
++
++
++def dtype2str(dtype):
++ if dtype == "int32":
++ return "TF_INT32"
++ elif dtype == "int64":
++ return "TF_INT64"
++ elif dtype == "float32":
++ return "TF_FLOAT"
++ elif dtype == "bool":
++ return "TF_BOOL"
++ else:
++ raise Exception("Not supported dtype")
++
++
++def print_output_arrays(flags):
++ graph_model = load_graph(flags.input_path)
++ graph_model_def = graph_model.as_graph_def()
++ ops = graph_model.get_operations()
++
++ output_names = [op.outputs[0].name for op in ops
++ if op.type == "Const" and "bcqinfo_" in op.outputs[0].name]
++
++ output_arrays = ""
++ for output_name in output_names:
++ output_arrays += ","
++
++ colon_index = output_name.find(":")
++ if colon_index == -1:
++ output_arrays += output_name
++ else:
++ output_arrays += output_name[:colon_index]
++
++ f = open(flags.output_path, 'w')
++ f.write(output_arrays)
++ f.close()
++
++
++def main():
++ # Parse argument.
++ parser = _get_parser()
++ flags = parser.parse_known_args(args=sys.argv[1:])
++
++ print_output_arrays(flags[0])
++
++
++if __name__ == "__main__":
++ main()
+diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info
+new file mode 100644
+index 0000000..2ede8d4
+--- /dev/null
++++ b/compiler/bcq-tools/preserve_bcq_info
+@@ -0,0 +1,116 @@
++#!/usr/bin/env python3
++
++import tensorflow as tf
++import numpy as np
++
++import argparse
++import sys
++
++
++def _get_parser():
++ """
++ Returns an ArgumentParser for preserving BCQ information.
++ """
++ parser = argparse.ArgumentParser(
++ description=("Command line tool to preserve BCQ information"))
++
++ # Input and output path.
++ parser.add_argument(
++ "-i",
++ "--input_path",
++ type=str,
++ help="Full filepath of the input file.",
++ required=True)
++ parser.add_argument(
++ "-o",
++ "--output_path",
++ type=str,
++ help="Full filepath of the output file.",
++ required=True)
++
++ return parser
++
++
++def load_graph(frozen_graph_filename):
++ """
++ Load graph from frozen pb file
++ """
++ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
++ graph_def = tf.compat.v1.GraphDef()
++ graph_def.ParseFromString(f.read())
++ with tf.Graph().as_default() as graph:
++ tf.import_graph_def(graph_def, name='')
++ return graph
++
++
++def preserve_bcq_info(flags):
++ """
++ Generate unique dummy value from -1 to -N.
++
++ We use negative values to preserve BCQ information because
++ positive values may cause some confusion with real BCQ information values.
++ """
++
++ class UniqueValueGen:
++ def __init__(self):
++ self.unique_value = -1
++
++ def gen(self):
++ val = self.unique_value
++ self.unique_value = val - 1
++ return val
++
++ unique_value = UniqueValueGen()
++
++ original_graph_model = load_graph(flags.input_path)
++ original_graph_model_def = original_graph_model.as_graph_def()
++
++ new_graph = tf.compat.v1.GraphDef()
++ substitution_dict = {}
++
++ DT_INT32 = None # Just for copying DT_INT32 attribute value
++
++ for node in original_graph_model_def.node:
++ if node.op == "Const":
++ # Because bcqinfo_do_w_x is BOOL type, we cannot add dummy value at the end.
++ # Therefore we should convert the type to INT32 type.
++ if "/bcqinfo_do_w_x" in node.name:
++ original_tensor = tf.make_ndarray(node.attr["value"].tensor)
++ substitution_dict[node.name] = tf.make_tensor_proto(
++ [int(original_tensor[0]), unique_value.gen()], tf.int32)
++
++ preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters",
++ "/bcqinfo_qbits_of_clusters"]
++
++ if any(name in node.name for name in preserved_bcqinfo_list):
++ original_tensor = tf.make_ndarray(
++ node.attr["value"].tensor) # variable name change
++ substitution_dict[node.name] = tf.make_tensor_proto(
++ np.append(original_tensor, unique_value.gen()), tf.int32)
++ DT_INT32 = node.attr["dtype"]
++
++ for node in original_graph_model_def.node:
++ if node.name in substitution_dict:
++ new_node = new_graph.node.add()
++ new_node.op = "Const"
++ new_node.name = node.name
++ new_node.attr["dtype"].CopyFrom(DT_INT32)
++ new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name])
++ else:
++ new_node = new_graph.node.add()
++ new_node.CopyFrom(node)
++
++ tf.io.write_graph(new_graph, '.', flags.output_path, False)
++
++
++def main():
++ # Parse argument.
++ parser = _get_parser()
++ flags = parser.parse_known_args(args=sys.argv[1:])
++
++ # Generate a new pb file in which BCQ information is preserved.
++ preserve_bcq_info(flags[0])
++
++
++if __name__ == "__main__":
++ main()
+diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt
+index 1335057..009bfab 100644
+--- a/compiler/circle-quantizer/CMakeLists.txt
++++ b/compiler/circle-quantizer/CMakeLists.txt
+@@ -13,5 +13,6 @@ target_link_libraries(circle-quantizer luci_service)
+ target_link_libraries(circle-quantizer luci_pass)
+ target_link_libraries(circle-quantizer luci_export)
+ target_link_libraries(circle-quantizer arser)
++target_link_libraries(circle-quantizer vconone)
+
+ install(TARGETS circle-quantizer DESTINATION bin)
+diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake
+index 2293e53..c21e28e 100644
+--- a/compiler/circle-quantizer/requires.cmake
++++ b/compiler/circle-quantizer/requires.cmake
+@@ -5,3 +5,4 @@ require("safemain")
+ require("luci")
+ require("oops")
+ require("arser")
++require("vconone")
+diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
+index b56b547..8d3a80c 100644
+--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
++++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
+@@ -25,6 +25,7 @@
+
+ #include <oops/InternalExn.h>
+ #include <arser/arser.h>
++#include <vconone/vconone.h>
+
+ #include <functional>
+ #include <iostream>
+@@ -36,6 +37,12 @@ using OptionHook = std::function<int(const char **)>;
+ using Algorithms = luci::CircleOptimizer::Options::Algorithm;
+ using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
+
++void print_version(void)
++{
++ std::cout << "circle-quantizer version " << vconone::get_string() << std::endl;
++ std::cout << vconone::get_copyright() << std::endl;
++}
++
+ int entry(int argc, char **argv)
+ {
+ // Simple argument parser (based on map)
+@@ -49,13 +56,20 @@ int entry(int argc, char **argv)
+
+ arser::Arser arser("circle-quantizer provides circle model quantization");
+
++ arser.add_argument("--version")
++ .nargs(0)
++ .required(false)
++ .default_value(false)
++ .help("Show version information and exit")
++ .exit_with(print_version);
++
+ arser.add_argument(qdqw)
+ .nargs(3)
+ .type(arser::DataType::STR_VEC)
+ .required(false)
+ .help("Quantize-dequantize weight values required action before quantization. "
+ "Three arguments required: input_dtype(float32) "
+- "output_dtype(uint8) granularity(layer)");
++ "output_dtype(uint8) granularity(layer, channel)");
+
+ arser.add_argument(qwmm)
+ .nargs(3)
+@@ -63,7 +77,7 @@ int entry(int argc, char **argv)
+ .required(false)
+ .help("Quantize with min/max values. "
+ "Three arguments required: input_dtype(float32) "
+- "output_dtype(uint8) granularity(layer)");
++ "output_dtype(uint8) granularity(layer, channel)");
+
+ arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
+ arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
+diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp
+index a55cd45..38e3073 100644
+--- a/compiler/circle-tensordump/driver/Driver.cpp
++++ b/compiler/circle-tensordump/driver/Driver.cpp
+@@ -46,7 +46,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ std::unique_ptr<circletensordump::DumpInterface> dump;
+diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp
+index dfa78f0..a8d3256 100644
+--- a/compiler/circle-tensordump/src/Dump.cpp
++++ b/compiler/circle-tensordump/src/Dump.cpp
+@@ -136,6 +136,7 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s
+ auto max = quant_param->max();
+ auto scale = quant_param->scale();
+ auto zero_point = quant_param->zero_point();
++ auto quantized_dimension = quant_param->quantized_dimension();
+
+ os << " " + print_format2 + "   ├── min : ";
+ ::print_comma_sepearted(os, min);
+@@ -146,9 +147,11 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s
+ os << " " + print_format2 + "   ├── scale : ";
+ ::print_comma_sepearted(os, scale);
+ os << std::endl;
+- os << " " + print_format2 + "   └── zero_point : ";
++ os << " " + print_format2 + "   ├── zero_point : ";
+ ::print_comma_sepearted(os, zero_point);
+ os << std::endl;
++ os << " " + print_format2 + "   └── quantized_dimension : " << quantized_dimension;
++ os << std::endl;
+ }
+
+ // buffer
+@@ -229,7 +232,7 @@ std::vector<hsize_t> hdf5_dims_cast(const flatbuffers::Vector<T> *data,
+ }
+
+ /**
+- * This function writes data to given hdf5 file like below.
++ * This function writes vector data to given hdf5 file like below.
+ *
+ * GROUP "group_name"
+ * ㄴDATATYPE "type"
+@@ -238,9 +241,9 @@ std::vector<hsize_t> hdf5_dims_cast(const flatbuffers::Vector<T> *data,
+ * ㄴDATA "data"
+ */
+ template <typename T>
+-void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
+- const H5::PredType &type, const flatbuffers::Vector<T> *data,
+- std::vector<hsize_t> dims)
++void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
++ const H5::PredType &type, const flatbuffers::Vector<T> *data,
++ std::vector<hsize_t> dims)
+ {
+ if (data == nullptr)
+ return;
+@@ -250,6 +253,17 @@ void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string d
+ dataset->write(data->data(), type);
+ }
+
++/// @brief This function writes scalar data to given hdf5 file
++template <typename T>
++void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
++ const H5::PredType &type, T data)
++{
++ auto dataspace = std::make_unique<H5::DataSpace>(H5S_SCALAR);
++ auto dataset = std::make_unique<H5::DataSet>(
++ file.createDataSet(group_name + "/" + dataset_name, type, *dataspace));
++ dataset->write(&data, type);
++}
++
+ } // namespace
+
+ namespace circletensordump
+@@ -297,8 +311,9 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
+ auto buff_data_ptr = reader.buffers()->Get(buff_idx)->data();
+ if (buff_data_ptr)
+ {
+- ::write_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()),
+- buff_data_ptr, ::hdf5_dims_cast(buff_data_ptr, tensor->shape()));
++ ::write_vector_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()),
++ buff_data_ptr,
++ ::hdf5_dims_cast(buff_data_ptr, tensor->shape()));
+ }
+
+ // write quantization parameters
+@@ -306,17 +321,20 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
+ if (quant_param)
+ {
+ auto min = quant_param->min();
+- ::write_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min,
+- ::hdf5_dims_cast(min));
++ ::write_vector_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min,
++ ::hdf5_dims_cast(min));
+ auto max = quant_param->max();
+- ::write_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max,
+- ::hdf5_dims_cast(max));
++ ::write_vector_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max,
++ ::hdf5_dims_cast(max));
+ auto scale = quant_param->scale();
+- ::write_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale,
+- ::hdf5_dims_cast(scale));
++ ::write_vector_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale,
++ ::hdf5_dims_cast(scale));
+ auto zero_point = quant_param->zero_point();
+- ::write_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, zero_point,
+- ::hdf5_dims_cast(zero_point));
++ ::write_vector_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64,
++ zero_point, ::hdf5_dims_cast(zero_point));
++ auto quantized_dimension = quant_param->quantized_dimension();
++ ::write_scalar_data_to_hdf5(file, group_name, "quantized_dimension",
++ H5::PredType::NATIVE_INT32, quantized_dimension);
+ }
+ }
+ }
+diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp
+index 1af31d9..7a44c65 100644
+--- a/compiler/circle-verify/src/Driver.cpp
++++ b/compiler/circle-verify/src/Driver.cpp
+@@ -35,7 +35,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ auto verifier = std::make_unique<VerifyFlatbuffers>();
+diff --git a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
+index 6663cb9..4bcaae3 100644
+--- a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
++++ b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
+@@ -1,25 +1,12 @@
+ nnas_include(TargetRequire)
+
+ unset(REQUIRED_TARGETS)
+-list(APPEND REQUIRED_TARGETS circlechef)
+ list(APPEND REQUIRED_TARGETS circle-inspect)
+ list(APPEND REQUIRED_TARGETS circle-verify)
+ list(APPEND REQUIRED_TARGETS circle2circle)
+ list(APPEND REQUIRED_TARGETS dredd_rule_lib)
+-list(APPEND REQUIRED_TARGETS tflchef)
+-list(APPEND REQUIRED_TARGETS tflite2circle)
+ TargetRequire_Return(${REQUIRED_TARGETS})
+
+-nncc_find_resource(TensorFlowLiteRecipes)
+-nncc_find_resource(CircleRecipes)
+-
+-set(TFLITE_RECIPE_REPO "${TensorFlowLiteRecipes_DIR}")
+-set(CIRCLE_RECIPE_REPO "${CircleRecipes_DIR}")
+-unset(RECIPE_REPO)
+-
+-set(TEST_RECIPE_FILENAME "test.recipe")
+-set(TEST_RULE_FILENAME "test.rule")
+-
+ unset(TEST_DEPS)
+ unset(TEST_NAMES)
+
+@@ -27,21 +14,9 @@ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs PASS)
+
+-macro(Add RECIPE)
+- if(NOT EXISTS "${TFLITE_RECIPE_REPO}/${RECIPE}/test.recipe")
+- if(NOT EXISTS "${CIRCLE_RECIPE_REPO}/${RECIPE}/test.recipe")
+- message(FATAL_ERROR "Missing recipe of '${RECIPE}' test")
+- else()
+- set(RECIPE_REPO ${CIRCLE_RECIPE_REPO})
+- endif()
+- else()
+- set(RECIPE_REPO ${TFLITE_RECIPE_REPO})
+- endif()
+-
+- if(NOT EXISTS "${RECIPE_REPO}/${RECIPE}/test.rule")
+- message(FATAL_ERROR "Missing rule of '${RECIPE}' test")
+- endif()
++get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
+
++macro(Add RECIPE)
+ cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ unset(OPT_OPTIONS)
+ foreach(src ${ARG_PASS})
+@@ -49,71 +24,20 @@ macro(Add RECIPE)
+ list(APPEND OPT_OPTIONS "--${src}")
+ endforeach(src ${ARG_PASS})
+
+- set(RECIPE_FILE "${RECIPE}.recipe")
+- set(RECIPE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RECIPE_FILENAME}")
+- set(RECIPE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE_FILE}")
+-
+- set(RULE_FILE "${RECIPE}.rule")
+- set(RULE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RULE_FILENAME}")
+- set(RULE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RULE_FILE}")
+-
+- set(TFLITE_FILE "${RECIPE}.tflite")
+- set(TFLITE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TFLITE_FILE}")
+-
+ set(CIRCLE_FILE "${RECIPE}.circle")
+- set(CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${CIRCLE_FILE}")
++ set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}")
+
+ set(OPT_CIRCLE_FILE "${RECIPE}.opt.circle")
+ set(OPT_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${OPT_CIRCLE_FILE}")
+
+- # Copy .recipe
+- add_custom_command(OUTPUT ${RECIPE_BINARY_PATH}
+- COMMAND ${CMAKE_COMMAND} -E copy "${RECIPE_SOURCE_PATH}" "${RECIPE_BINARY_PATH}"
+- DEPENDS ${RECIPE_SOURCE_PATH}
+- COMMENT "Generate ${RECIPE_FILE}"
+- )
+-
+- # Copy .rule
+- add_custom_command(OUTPUT ${RULE_BINARY_PATH}
+- COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}"
+- DEPENDS ${RULE_SOURCE_PATH}
+- COMMENT "Generate ${RULE_FILE}"
+- )
+-
+- if(${RECIPE_REPO} STREQUAL ${TFLITE_RECIPE_REPO})
+- # Generate .tflite
+- add_custom_command(OUTPUT ${TFLITE_OUTPUT_PATH}
+- COMMAND $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH} ${TFLITE_OUTPUT_PATH}
+- DEPENDS $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH}
+- COMMENT "Generate ${TFLITE_FILE}"
+- )
+-
+- # Generate .circle
+- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
+- COMMAND $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH} ${CIRCLE_OUTPUT_PATH}
+- DEPENDS $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH}
+- COMMENT "Generate ${CIRCLE_FILE}"
+- )
+-
+- list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH})
+- else()
+- # Generate .circle
+- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
+- COMMAND $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH}
+- DEPENDS $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH}
+- COMMENT "Generate ${CIRCLE_FILE}"
+- )
+- endif()
+-
+ # Generate optimized .circle
+ add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH}
+- COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
+- DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_OUTPUT_PATH}
++ COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
++ DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_PATH}
+ COMMENT "Generate ${OPT_CIRCLE_FILE}"
+ )
+
+- list(APPEND TEST_DEPS ${RECIPE_BINARY_PATH} ${RULE_BINARY_PATH}
+- ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH})
++ list(APPEND TEST_DEPS ${OPT_CIRCLE_OUTPUT_PATH})
+ list(APPEND TEST_NAMES ${RECIPE})
+ endmacro(Add)
+
+@@ -174,12 +98,15 @@ list(APPEND TEST_DEPS "${RULE_LIB_BINARY_PATH}")
+
+ # Generate dependencies
+ add_custom_target(circle2circle_dredd_recipe_test ALL DEPENDS ${TEST_DEPS})
++add_dependencies(circle2circle_dredd_recipe_test common_artifacts_deps)
++
++get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
+
+ # Run tests
+ add_test(
+ NAME circle2circle_dredd_recipe_test
+ COMMAND "${TEST_RUNNER}"
+ "${TEST_CONFIG}"
+- "${CMAKE_CURRENT_BINARY_DIR}"
++ "${ARTIFACTS_BIN_PATH}"
+ ${TEST_NAMES}
+ )
+diff --git a/compiler/circle2circle-dredd-recipe-test/requires.cmake b/compiler/circle2circle-dredd-recipe-test/requires.cmake
+index e4a5b71..70e7c52 100644
+--- a/compiler/circle2circle-dredd-recipe-test/requires.cmake
++++ b/compiler/circle2circle-dredd-recipe-test/requires.cmake
+@@ -1,7 +1,5 @@
+-require("circlechef")
+ require("circle2circle")
+ require("circle-inspect")
+ require("circle-verify")
++require("common-artifacts")
+ require("dredd-rule-lib")
+-require("tflchef")
+-require("tflite2circle")
+diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
+index 202f669..6328a64 100644
+--- a/compiler/circle2circle-dredd-recipe-test/test.lst
++++ b/compiler/circle2circle-dredd-recipe-test/test.lst
+@@ -11,9 +11,10 @@
+ ## TFLITE RECIPE
+
+ Add(Net_InstanceNorm_001 PASS fuse_instnorm)
+-# Add(Net_InstanceNorm_002 PASS fuse_instnorm)
++Add(Net_InstanceNorm_002 PASS fuse_instnorm)
+ Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
+ Add(MatMul_000 PASS resolve_customop_matmul)
++Add(DepthwiseConv2D_003 PASS)
+
+ ## CIRCLE RECIPE
+
+diff --git a/compiler/circle2circle-dredd-recipe-test/testall.sh b/compiler/circle2circle-dredd-recipe-test/testall.sh
+index 33a2036..2899587 100755
+--- a/compiler/circle2circle-dredd-recipe-test/testall.sh
++++ b/compiler/circle2circle-dredd-recipe-test/testall.sh
+@@ -13,21 +13,22 @@ if [[ $# -lt 2 ]]; then
+ exit 255
+ fi
+
++WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+ CONFIG_PATH="$1"; shift
+-WORKDIR="$1"; shift
++RESOURCE_DIR="$1"; shift
+
+ source "${CONFIG_PATH}"
+
+ echo "-- Found circle-inspect: ${CIRCLE_INSPECT_PATH}"
+ echo "-- Found circle-verify: ${CIRCLE_VERIFY_PATH}"
+ echo "-- Found circle2circle: ${CIRCLE2CIRCLE_PATH}"
+-echo "-- Found workdir: ${WORKDIR}"
++echo "-- Found common-artifacts: ${RESOURCE_DIR}"
+
+ TESTED=()
+ PASSED=()
+ FAILED=()
+
+-pushd "${WORKDIR}"
++pushd ${WORKDIR}
+ while [[ $# -ne 0 ]]; do
+ PREFIX="$1"; shift
+
+@@ -40,7 +41,7 @@ while [[ $# -ne 0 ]]; do
+ cat > "${PREFIX}.log" <(
+ exec 2>&1
+
+- echo "-- Found tflite: ${PREFIX}.tflite"
++ echo "-- Found circle: ${PREFIX}.opt.circle"
+
+ # Exit immediately if any command fails
+ set -e
+@@ -55,7 +56,7 @@ while [[ $# -ne 0 ]]; do
+ set +x
+
+ # (COMPILED_FILE, INSPECT_PROG_PATH, VERIFY_PROG_PATH, ERROR_LOG) must be set for rule-lib.sh
+- COMPILED_FILE="${WORKDIR}/${PREFIX}.opt.circle"
++ COMPILED_FILE="${PREFIX}.opt.circle"
+ INSPECT_PROG_PATH=${CIRCLE_INSPECT_PATH}
+ VERIFY_PROG_PATH=${CIRCLE_VERIFY_PATH}
+ ERROR_LOG="${PREFIX}.error"
+@@ -66,7 +67,7 @@ while [[ $# -ne 0 ]]; do
+ trap 'echo "** ERROR **" ; cat "${ERROR_LOG}"' ERR
+
+ source rule-lib.sh
+- source "${PREFIX}.rule"
++ source "${RESOURCE_DIR}/${PREFIX}.rule"
+
+ # unset
+ trap - ERR
+diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt
+index 7b2bf9b..f60c896 100644
+--- a/compiler/circle2circle/CMakeLists.txt
++++ b/compiler/circle2circle/CMakeLists.txt
+@@ -19,6 +19,7 @@ target_link_libraries(circle2circle luci_service)
+ target_link_libraries(circle2circle luci_pass)
+ target_link_libraries(circle2circle luci_export)
+ target_link_libraries(circle2circle arser)
++target_link_libraries(circle2circle vconone)
+
+ install(TARGETS circle2circle DESTINATION bin)
+
+@@ -44,3 +45,4 @@ target_link_libraries(circle2circle_test luci_service)
+ target_link_libraries(circle2circle_test luci_pass)
+ target_link_libraries(circle2circle_test luci_export)
+ target_link_libraries(circle2circle_test arser)
++target_link_libraries(circle2circle_test vconone)
+diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake
+index 8cbb90d..36a9efd 100644
+--- a/compiler/circle2circle/requires.cmake
++++ b/compiler/circle2circle/requires.cmake
+@@ -9,3 +9,4 @@ require("hermes")
+ require("hermes-std")
+ require("luci")
+ require("arser")
++require("vconone")
+diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
+index 6888d26..849597b 100644
+--- a/compiler/circle2circle/src/Circle2Circle.cpp
++++ b/compiler/circle2circle/src/Circle2Circle.cpp
+@@ -26,6 +26,7 @@
+
+ #include <oops/InternalExn.h>
+ #include <arser/arser.h>
++#include <vconone/vconone.h>
+
+ #include <functional>
+ #include <iostream>
+@@ -34,6 +35,12 @@
+ using Algorithms = luci::CircleOptimizer::Options::Algorithm;
+ using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
+
++void print_version(void)
++{
++ std::cout << "circle2circle version " << vconone::get_string() << std::endl;
++ std::cout << vconone::get_copyright() << std::endl;
++}
++
+ int entry(int argc, char **argv)
+ {
+ // Simple argument parser (based on map)
+@@ -44,6 +51,13 @@ int entry(int argc, char **argv)
+
+ arser::Arser arser("circle2circle provides circle model optimization and transformations");
+
++ arser.add_argument("--version")
++ .nargs(0)
++ .required(false)
++ .default_value(false)
++ .help("Show version information and exit")
++ .exit_with(print_version);
++
+ arser.add_argument("--all").nargs(0).required(false).default_value(false).help(
+ "Enable all optimize options");
+
+diff --git a/compiler/circlechef/CMakeLists.txt b/compiler/circlechef/CMakeLists.txt
+index cba7d0a..3e2ddcb 100644
+--- a/compiler/circlechef/CMakeLists.txt
++++ b/compiler/circlechef/CMakeLists.txt
+@@ -18,4 +18,6 @@ add_subdirectory(core)
+ add_subdirectory(circle)
+ # Tools
+ add_subdirectory(tools)
+-add_subdirectory(tests)
++if(ENABLE_TEST)
++ add_subdirectory(tests)
++endif(ENABLE_TEST)
+diff --git a/compiler/circlechef/circle/src/RecipeChef.cpp b/compiler/circlechef/circle/src/RecipeChef.cpp
+index 17ef1be..51326c7 100644
+--- a/compiler/circlechef/circle/src/RecipeChef.cpp
++++ b/compiler/circlechef/circle/src/RecipeChef.cpp
+@@ -181,6 +181,8 @@ std::unique_ptr<ModelRecipe> generate_recipe(const circle::Model *model)
+ for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
+ chef_quant->add_zero_point(quant->zero_point()->Get(idx));
+ }
++ circlechef::TensorQuantization *chef_quant = operand->mutable_quant();
++ chef_quant->set_quantized_dimension(quant->quantized_dimension());
+ }
+ }
+
+diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp
+index 76aeacd..d81467d 100644
+--- a/compiler/circlechef/core/src/ModelChef.cpp
++++ b/compiler/circlechef/core/src/ModelChef.cpp
+@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
+ quant_builder.add_min(quant_min);
+ quant_builder.add_scale(quant_scale);
+ quant_builder.add_zero_point(quant_zero_point);
++ quant_builder.add_quantized_dimension(quant.quantized_dimension());
+
+ // Update QuantizationParameters Index
+ quant_index = quant_builder.Finish();
+diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto
+index b8c009b..3e5e6b1 100644
+--- a/compiler/circlechef/proto/circlechef.proto
++++ b/compiler/circlechef/proto/circlechef.proto
+@@ -35,6 +35,7 @@ message TensorQuantization {
+ repeated float max = 2;
+ repeated float scale = 3;
+ repeated int64 zero_point = 4;
++ optional int32 quantized_dimension = 5 [default = 0];
+ }
+
+ message Operand {
+diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp
+index a15da40..bcc0c7a 100644
+--- a/compiler/circlechef/tools/file/Driver.cpp
++++ b/compiler/circlechef/tools/file/Driver.cpp
+@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ int32_t model_version = 1;
+diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp
+index 9c0b9ea..8a2b85f 100644
+--- a/compiler/circlechef/tools/reverse/Driver.cpp
++++ b/compiler/circlechef/tools/reverse/Driver.cpp
+@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ std::string circle_path = arser.get<std::string>("circle");
+diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp
+index b8f561f..657f24f 100644
+--- a/compiler/circledump/driver/Driver.cpp
++++ b/compiler/circledump/driver/Driver.cpp
+@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << '\n';
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ std::string circle_path = arser.get<std::string>("circle");
+diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
+index 2c03203..5aa5d51 100644
+--- a/compiler/circledump/src/OpPrinter.cpp
++++ b/compiler/circledump/src/OpPrinter.cpp
+@@ -593,6 +593,20 @@ public:
+ }
+ };
+
++class UniquePrinter : public OpPrinter
++{
++public:
++ void options(const circle::Operator *op, std::ostream &os) const override
++ {
++ if (auto *params = op->builtin_options_as_UniqueOptions())
++ {
++ os << " ";
++ os << "idx_out_type(" << EnumNameTensorType(params->idx_out_type()) << ") ";
++ os << std::endl;
++ }
++ }
++};
++
+ class WhilePrinter : public OpPrinter
+ {
+ public:
+@@ -744,6 +758,7 @@ OpPrinterRegistry::OpPrinterRegistry()
+ _op_map[circle::BuiltinOperator_SUM] = make_unique<ReducerPrinter>();
+ _op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
+ // There is no Option for TOPK_V2
++ _op_map[circle::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
+ _op_map[circle::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
+ _op_map[circle::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
+
+diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
+index b614b71..d3f5601 100644
+--- a/compiler/common-artifacts/exclude.lst
++++ b/compiler/common-artifacts/exclude.lst
+@@ -5,9 +5,12 @@
+
+ #[[ optimize : Exclude from circle optimization(circle2circle) ]]
+ ## TensorFlowLiteRecipes
+-optimize(ReLU6_000)
+-optimize(Where_000)
+-optimize(Where_001)
++optimize(Unique_000)
++optimize(Unique_001)
++optimize(Unique_002)
++optimize(Unique_003)
++optimize(Unique_U8_000)
++optimize(Unique_U8_001)
+
+ ## CircleRecipes
+
+@@ -46,6 +49,7 @@ tcgenerate(DepthToSpace_000)
+ tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation
+ tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation
+ tcgenerate(DepthwiseConv2D_U8_000)
++tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet
+ tcgenerate(Div_000)
+ tcgenerate(ELU_000)
+ tcgenerate(Equal_000)
+@@ -96,7 +100,7 @@ tcgenerate(Neg_000)
+ tcgenerate(Net_Dangle_001)
+ tcgenerate(Net_InstanceNorm_001)
+ tcgenerate(Net_InstanceNorm_002)
+-tcgenerate(Net_ZeroDim_001) # fix luci
++tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim
+ tcgenerate(NotEqual_000)
+ tcgenerate(OneHot_000)
+ tcgenerate(OneHot_001)
+@@ -120,9 +124,9 @@ tcgenerate(ReduceProd_001)
+ tcgenerate(ReduceProd_002)
+ tcgenerate(ReduceProd_003)
+ tcgenerate(ReLU_000)
+-tcgenerate(ReLU6_000) # luci NYI
++tcgenerate(ReLU6_000)
+ tcgenerate(ReLUN1To1_000)
+-tcgenerate(Reshape_003) # fix luci
++tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option
+ tcgenerate(Reshape_U8_000)
+ tcgenerate(ResizeBilinear_000)
+ tcgenerate(ResizeNearestNeighbor_000)
+@@ -148,7 +152,7 @@ tcgenerate(SpaceToBatchND_002)
+ tcgenerate(SpaceToBatchND_003)
+ tcgenerate(SpaceToDepth_000)
+ tcgenerate(SparseToDense_000)
+-tcgenerate(SplitV_000) # fix luci
++tcgenerate(SplitV_000)
+ tcgenerate(Sqrt_000)
+ tcgenerate(Square_000)
+ tcgenerate(SquaredDifference_000)
+@@ -164,22 +168,21 @@ tcgenerate(Sum_001)
+ tcgenerate(Tanh_000)
+ tcgenerate(Tile_000)
+ tcgenerate(Tile_U8_000)
+-tcgenerate(TopKV2_000) # fix luci
+-tcgenerate(TopKV2_001) # fix luci
+-tcgenerate(TransposeConv_000) # fix interpreter
++tcgenerate(TopKV2_000)
++tcgenerate(TopKV2_001)
+ tcgenerate(Unique_000)
+ tcgenerate(Unique_001)
+ tcgenerate(Unique_002)
+ tcgenerate(Unique_003)
+ tcgenerate(Unique_U8_000)
+ tcgenerate(Unique_U8_001)
+-tcgenerate(Where_000) # luci NYI
+-tcgenerate(Where_001) # luci NYI
+-tcgenerate(While_000) # fix luci
++tcgenerate(Where_000)
++tcgenerate(Where_001)
++tcgenerate(While_000)
+ tcgenerate(While_001)
+ tcgenerate(While_002)
+ tcgenerate(While_003)
+-tcgenerate(YUV_TO_RGB_000) # fix luci
++tcgenerate(YUV_TO_RGB_000)
+ tcgenerate(YUV_TO_RGB_U8_000)
+ tcgenerate(ZerosLike_000)
+
+diff --git a/compiler/hermes/src/hermes.test.cpp b/compiler/hermes/src/hermes.test.cpp
+index 2cbc093..ea7ef65 100644
+--- a/compiler/hermes/src/hermes.test.cpp
++++ b/compiler/hermes/src/hermes.test.cpp
+@@ -18,7 +18,28 @@
+
+ #include <gtest/gtest.h>
+
+-TEST(HermesTest, simple_usecase)
++namespace
+ {
+- // TO BE FILLED
++
++class Logger final : public hermes::Source
++{
++public:
++ Logger(hermes::Context *ctx);
++ ~Logger();
++};
++
++Logger::Logger(hermes::Context *ctx) { activate(ctx->sources(), ctx->bus()); }
++Logger::~Logger() { deactivate(); }
++
++} // namespace
++
++TEST(HermesTest, logger_constructor_NEG)
++{
++ hermes::Context context;
++ // we expect a segmentation fault from nullptr->sources()
++ ASSERT_DEATH(Logger logger(&context), "");
++
++ SUCCEED();
+ }
++
++// TODO add HermesTest simple_usecase
+diff --git a/compiler/locomotiv/src/Node/BiasEncode.test.cpp b/compiler/locomotiv/src/Node/BiasEncode.test.cpp
+index cdb255c..4680f5c 100644
+--- a/compiler/locomotiv/src/Node/BiasEncode.test.cpp
++++ b/compiler/locomotiv/src/Node/BiasEncode.test.cpp
+@@ -90,6 +90,16 @@ template <typename T> void test()
+ }
+ } // namespace
+
+-TEST(NodeExecution_BiasEncode, s32) { test<int32_t>(); }
++TEST(NodeExecution_BiasEncode, s32)
++{
++ test<int32_t>();
++
++ SUCCEED();
++}
+
+-TEST(NodeExecution_BiasEncode, f32) { test<float>(); }
++TEST(NodeExecution_BiasEncode, f32)
++{
++ test<float>();
++
++ SUCCEED();
++}
+diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp
+index f1f3a52..7d942e1 100644
+--- a/compiler/locomotiv/src/Node/MatMul.test.cpp
++++ b/compiler/locomotiv/src/Node/MatMul.test.cpp
+@@ -142,6 +142,8 @@ TEST(NodeExecution_MatMul, f32_2x3_3x3)
+ };
+
+ run_test<float>(lhs, rhs, out, Shape{2, 3}, Shape{3, 3}, Shape{2, 3}, loco::DataType::FLOAT32);
++
++ SUCCEED();
+ }
+
+ /* from the code below:
+@@ -183,6 +185,8 @@ TEST(NodeExecution_MatMul, s32_4x2_2x6)
+ };
+
+ run_test<int32_t>(lhs, rhs, out, Shape{4, 2}, Shape{2, 6}, Shape{4, 6}, loco::DataType::S32);
++
++ SUCCEED();
+ }
+
+ // clang-format on
+diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp
+index c9808d3..aff9ebe 100644
+--- a/compiler/locop/src/FormattedGraph.test.cpp
++++ b/compiler/locop/src/FormattedGraph.test.cpp
+@@ -28,6 +28,8 @@ TEST(LinearV1FormatterTest, simple)
+
+ // TODO Validate the output (when the implementation becomes stable)
+ std::cout << locop::fmt<locop::LinearV1>(g) << std::endl;
++
++ SUCCEED();
+ }
+
+ TEST(LinearV1FormatterTest, user_defined_node_summary_builder)
+diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp
+index 0f0017a..fc85df3 100644
+--- a/compiler/locop/src/FormattedTensorShape.test.cpp
++++ b/compiler/locop/src/FormattedTensorShape.test.cpp
+@@ -30,4 +30,6 @@ TEST(FormattedTensorShapeTest, BracketFormat)
+ tensor_shape->dim(0) = 4;
+
+ std::cout << fmt<TensorShapeFormat::Bracket>(tensor_shape.get()) << std::endl;
++
++ SUCCEED();
+ }
+diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
+index 9987898..4ac3d86 100644
+--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
++++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
+@@ -79,12 +79,11 @@ private:
+ //
+ // Note that due to historical and performance reasons, per-tensor quantization uses unsigned
+ // integer types, while per-channel uses signed types assuming 'zero_point' == 0.
+-//
+-// TODO Add 'quantized_dimension' field for per-channel case when IR provides it.
+ struct AffineQuantization
+ {
+ std::vector<float> scale;
+ std::vector<int32_t> zero_point;
++ int32_t quantized_dimension;
+ };
+
+ class Tensor
+@@ -108,6 +107,12 @@ public:
+ return _quantization.zero_point[0];
+ }
+
++ const std::vector<float> &scales() const { return _quantization.scale; }
++
++ const std::vector<int32_t> &zero_points() const { return _quantization.zero_point; }
++
++ int32_t quantized_dimension() const { return _quantization.quantized_dimension; }
++
+ template <typename T> const T *data() const { return reinterpret_cast<const T *>(_data.get()); }
+
+ template <typename T> T *data() { return reinterpret_cast<T *>(_data.get()); }
+diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
+index a32e0d4..65d1197 100644
+--- a/compiler/luci-interpreter/src/core/KernelParams.h
++++ b/compiler/luci-interpreter/src/core/KernelParams.h
+@@ -56,6 +56,11 @@ struct Conv2DParams
+ Activation activation;
+ };
+
++struct DepthToSpaceParams
++{
++ int block_size;
++};
++
+ struct DepthwiseConv2DParams
+ {
+ Padding padding;
+diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
+index fe36231..a1fd1de 100644
+--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt
++++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
+@@ -12,6 +12,8 @@ set(SOURCES
+ Concatenation.cpp
+ Conv2D.h
+ Conv2D.cpp
++ DepthToSpace.h
++ DepthToSpace.cpp
+ DepthwiseConv2D.h
+ DepthwiseConv2D.cpp
+ Elu.h
+@@ -40,6 +42,10 @@ set(SOURCES
+ Pad.cpp
+ Reshape.h
+ Reshape.cpp
++ Reverse.h
++ Reverse.cpp
++ Slice.h
++ Slice.cpp
+ Softmax.h
+ Softmax.cpp
+ SpaceToDepth.h
+@@ -77,6 +83,7 @@ set(TEST_SOURCES
+ AveragePool2D.test.cpp
+ Concatenation.test.cpp
+ Conv2D.test.cpp
++ DepthToSpace.test.cpp
+ DepthwiseConv2D.test.cpp
+ Elu.test.cpp
+ FullyConnected.test.cpp
+@@ -91,6 +98,8 @@ set(TEST_SOURCES
+ Mul.test.cpp
+ Pad.test.cpp
+ Reshape.test.cpp
++ Reverse.test.cpp
++ Slice.test.cpp
+ Softmax.test.cpp
+ SpaceToDepth.test.cpp
+ Split.test.cpp
+diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
+new file mode 100644
+index 0000000..cab63e2
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
+@@ -0,0 +1,90 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "DepthToSpace.h"
++#include "Utils.h"
++#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
++
++namespace luci_interpreter
++{
++namespace kernels
++{
++
++DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params)
++ : KernelWithParams<DepthToSpaceParams>({input}, {output}, params)
++{
++}
++
++void DepthToSpace::configure()
++{
++ if (input()->shape().num_dims() != 4)
++ {
++ throw std::runtime_error("Invalid input num_dims.");
++ }
++ if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 &&
++ output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 &&
++ output()->element_type() != DataType::S64)
++ {
++ throw std::runtime_error("Invalid output type");
++ }
++ if (input()->element_type() != output()->element_type())
++ {
++ throw std::runtime_error("Type mismatch on input and output.");
++ }
++ const int block_size = params().block_size;
++ const int32_t input_height = input()->shape().dim(1);
++ const int32_t input_width = input()->shape().dim(2);
++ const int32_t input_channels = input()->shape().dim(3);
++ int32_t output_height = input_height * block_size;
++ int32_t output_width = input_width * block_size;
++ int32_t output_channels = input_channels / block_size / block_size;
++
++ assert(input_height == output_height / block_size);
++ assert(input_width == output_width / block_size);
++ assert(input_channels == output_channels * block_size * block_size);
++
++ Shape output_shape(4);
++ output_shape.dim(0) = input()->shape().dim(0);
++ output_shape.dim(1) = output_height;
++ output_shape.dim(2) = output_width;
++ output_shape.dim(3) = output_channels;
++
++ output()->resize(output_shape);
++}
++
++void DepthToSpace::execute() const
++{
++ tflite::DepthToSpaceParams op_params;
++ op_params.block_size = params().block_size;
++ switch (input()->element_type())
++ {
++ case DataType::FLOAT32:
++ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
++ getTensorData<float>(input()), getTensorShape(output()),
++ getTensorData<float>(output()));
++ break;
++ case DataType::U8:
++ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
++ getTensorData<uint8_t>(input()), getTensorShape(output()),
++ getTensorData<uint8_t>(output()));
++ break;
++ default:
++ throw std::runtime_error("Unsupported Type.");
++ }
++}
++
++} // namespace kernels
++} // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-interpreter/src/kernels/DepthToSpace.h
+new file mode 100644
+index 0000000..63ce376
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.h
+@@ -0,0 +1,45 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
++#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
++
++#include "core/Kernel.h"
++#include "core/KernelParams.h"
++
++#include <vector>
++
++namespace luci_interpreter
++{
++namespace kernels
++{
++
++class DepthToSpace : public KernelWithParams<DepthToSpaceParams>
++{
++public:
++ DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params);
++
++ const Tensor *input() const { return _inputs[0]; }
++ Tensor *output() const { return _outputs[0]; }
++
++ void configure() override;
++ void execute() const override;
++};
++
++} // namespace kernels
++} // namespace luci_interpreter
++
++#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
+diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
+new file mode 100644
+index 0000000..1b80570
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
+@@ -0,0 +1,60 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "kernels/DepthToSpace.h"
++#include "kernels/TestUtils.h"
++
++namespace luci_interpreter
++{
++namespace kernels
++{
++namespace
++{
++
++using namespace testing;
++
++template <typename T> class DepthToSpaceTest : public ::testing::Test
++{
++};
++
++using DataTypes = ::testing::Types<float, uint8_t>;
++TYPED_TEST_CASE(DepthToSpaceTest, DataTypes);
++
++TYPED_TEST(DepthToSpaceTest, SimpleCase)
++{
++ std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8};
++ Shape input_shape{1, 1, 2, 4};
++ std::vector<TypeParam> output_data{1, 2, 5, 6, 3, 4, 7, 8};
++ std::vector<int32_t> output_shape{1, 2, 4, 1};
++
++ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
++ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
++
++ DepthToSpaceParams params{};
++ params.block_size = 2;
++
++ DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
++ kernel.configure();
++ kernel.execute();
++
++ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
++ ::testing::ElementsAreArray(output_data));
++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
++}
++
++} // namespace
++} // namespace kernels
++} // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
+index fad450d..f53eaca 100644
+--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
++++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
+@@ -45,12 +45,9 @@ TEST(L2NormalizeTest, Float)
+ ElementsAreArray(ArrayFloatNear(ref_output_data)));
+ }
+
+-TEST(L2NormalizeTest, Uint8Quantized)
+-{
+- // TODO
+- // Implement GetDequantizedOutput Function.
+- // Create Test for Uint8 Case
+-}
++// TODO Uint8Quantized
++// Implement GetDequantizedOutput Function.
++// Create Test for Uint8 Case
+
+ } // namespace
+ } // namespace kernels
+diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
+index b0c06e7..c79d3d6 100644
+--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
++++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
+@@ -61,15 +61,14 @@ TEST(LeakReluTest, FloatSimple)
+ 1.0f, -0.5f, -1.0f, // Row 2
+ },
+ /*alpha=*/0.5f, getElementType<float>());
+-}
+
+-TEST(LeakReluTest, Uint8Simple)
+-{
+- // TODO
+- // Implement GetDequantizedOutput Function.
+- // Create Test for Uint8 Case
++ SUCCEED();
+ }
+
++// TODO Uint8Simple
++// Implement GetDequantizedOutput Function.
++// Create Test for Uint8 Case
++
+ } // namespace
+ } // namespace kernels
+ } // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
+index 17456a4..00feddf 100644
+--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
++++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
+@@ -49,10 +49,8 @@ TEST(LogisticTest, Float)
+ // TODO make a Shape checking of output_tensor.
+ }
+
+-TEST(LogisticTest, Uint8)
+-{
+- // Need to Implement GetDequantizedOutput Function.
+-}
++// TODO Uint8
++// Need to Implement GetDequantizedOutput Function.
+
+ } // namespace
+ } // namespace kernels
+diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp
+new file mode 100644
+index 0000000..a463084
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp
+@@ -0,0 +1,81 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "kernels/Reverse.h"
++#include "kernels/Utils.h"
++#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
++
++namespace luci_interpreter
++{
++
++namespace kernels
++{
++
++Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output)
++ : Kernel({input, axes}, {output})
++{
++}
++
++void Reverse::configure()
++{
++ assert(axes()->shape().num_dims() == 1);
++ assert(input()->shape().num_dims() >= axes()->shape().num_elements());
++ if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 &&
++ input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 &&
++ input()->element_type() != DataType::S64)
++ {
++ throw std::runtime_error("Unsupported input type.");
++ }
++ if (axes()->element_type() != DataType::S32)
++ {
++ throw std::runtime_error("Unsupported axes type.");
++ }
++ if (axes()->shape().num_elements() > 1)
++ {
++ throw std::runtime_error("Current implementation does not support more than 1 axis.");
++ }
++ int axis_value = getTensorData<int32_t>(axes())[0];
++ if (axis_value < 0 || axis_value >= input()->shape().num_dims())
++ {
++ throw std::runtime_error("Invalid axes value");
++ }
++ assert(input()->element_type() == output()->element_type());
++
++ output()->resize(input()->shape());
++}
++
++void Reverse::execute() const
++{
++ int axis_value = getTensorData<int32_t>(axes())[0];
++ switch (output()->element_type())
++ {
++ case DataType::FLOAT32:
++ tflite::reference_ops::Reverse<float>(axis_value, getTensorShape(input()),
++ getTensorData<float>(input()), getTensorShape(output()),
++ getTensorData<float>(output()));
++ break;
++ case DataType::U8:
++ tflite::reference_ops::Reverse<uint8_t>(
++ axis_value, getTensorShape(input()), getTensorData<uint8_t>(input()),
++ getTensorShape(output()), getTensorData<uint8_t>(output()));
++ break;
++ default:
++ throw std::runtime_error("Unsupported output type");
++ }
++}
++
++} // namespace kernels
++} // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/kernels/Reverse.h b/compiler/luci-interpreter/src/kernels/Reverse.h
+new file mode 100644
+index 0000000..3489dae
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/Reverse.h
+@@ -0,0 +1,43 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H
++#define LUCI_INTERPRETER_KERNELS_REVERSE_H
++
++#include "core/Kernel.h"
++
++namespace luci_interpreter
++{
++namespace kernels
++{
++
++class Reverse : public Kernel
++{
++public:
++ Reverse(const Tensor *input, const Tensor *axes, Tensor *output);
++
++ const Tensor *input() const { return _inputs[0]; }
++ const Tensor *axes() const { return _inputs[1]; }
++ Tensor *output() const { return _outputs[0]; }
++
++ void configure() override;
++ void execute() const override;
++};
++
++} // namespace kernels
++} // namespace luci_interpreter
++
++#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H
+diff --git a/compiler/luci-interpreter/src/kernels/Reverse.test.cpp b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp
+new file mode 100644
+index 0000000..5475a8b
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp
+@@ -0,0 +1,66 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "kernels/Reverse.h"
++#include "kernels/TestUtils.h"
++
++namespace luci_interpreter
++{
++namespace kernels
++{
++namespace
++{
++
++using namespace testing;
++
++template <typename T> class ReverseTest : public ::testing::Test
++{
++};
++
++using DataTypes = ::testing::Types<float, uint8_t>;
++TYPED_TEST_CASE(ReverseTest, DataTypes);
++
++TYPED_TEST(ReverseTest, MultiDimensions)
++{
++ // TypeParam
++ std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
++ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
++ Shape input_shape{4, 3, 2};
++ std::vector<int32_t> axis_data{1};
++ Shape axis_shape{1};
++
++ std::vector<TypeParam> output_data{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8,
++ 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20};
++ std::vector<int32_t> output_shape{4, 3, 2};
++
++ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
++ Tensor axis_tensor = makeInputTensor<DataType::S32>(axis_shape, axis_data);
++
++ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
++
++ Reverse kernel = Reverse(&input_tensor, &axis_tensor, &output_tensor);
++ kernel.configure();
++ kernel.execute();
++
++ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
++ ::testing::ElementsAreArray(output_data));
++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
++}
++
++} // namespace
++} // namespace kernels
++} // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp
+new file mode 100644
+index 0000000..c4bc3c5
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/Slice.cpp
+@@ -0,0 +1,149 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "kernels/Slice.h"
++#include "Utils.h"
++#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
++
++#include <cassert>
++#include <cstring>
++
++namespace luci_interpreter
++{
++
++namespace kernels
++{
++const int max_dim = 4;
++
++Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output)
++ : Kernel({input, begin, size}, {output})
++{
++}
++
++template <typename T>
++Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size)
++{
++ Shape output_shape = Shape(input->shape().num_dims());
++ for (int idx = 0; idx < input->shape().num_dims(); idx++)
++ {
++ T size_value = getTensorData<T>(size)[idx];
++ if (size_value < 0)
++ {
++ if (size_value != -1)
++ {
++ throw std::runtime_error("Invalid size.");
++ }
++ size_value = input->shape().dim(idx) - getTensorData<T>(begin)[idx];
++ }
++ else
++ {
++ if (input->shape().dim(idx) < getTensorData<T>(begin)[idx] + size_value)
++ {
++ throw std::runtime_error("Invalid begin and size.");
++ }
++ }
++ output_shape.dim(idx) = static_cast<int>(size_value);
++ }
++ return output_shape;
++}
++
++template <typename T>
++void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size,
++ std::vector<int> *begins, std::vector<int> *sizes)
++{
++ for (int idx = dimensions - 1; idx >= 0; --idx)
++ {
++ begins->push_back(getTensorData<T>(begin)[idx]);
++ sizes->push_back(getTensorData<T>(size)[idx]);
++ }
++}
++
++void Slice::configure()
++{
++ assert(input()->element_type() == output()->element_type());
++ assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64);
++ assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64);
++ assert(begin()->shape().num_dims() == 1);
++ assert(size()->shape().num_dims() == 1);
++ assert(input()->shape().num_dims() <= max_dim);
++
++ if (begin()->element_type() == DataType::S32)
++ {
++ output()->resize(calculateOutputShape<int32_t>(input(), begin(), size()));
++ }
++ else if (begin()->element_type() == DataType::S64)
++ {
++ output()->resize(calculateOutputShape<int64_t>(input(), begin(), size()));
++ }
++ else
++ {
++ throw std::runtime_error("Unsupported type.");
++ }
++}
++
++void Slice::execute() const
++{
++ std::vector<int> begins;
++ begins.reserve(max_dim);
++ std::vector<int> sizes;
++ sizes.reserve(max_dim);
++ if (begin()->element_type() == DataType::S32)
++ {
++ getBeginAndSizeVectors<int32_t>(input()->shape().num_dims(), begin(), size(), &begins, &sizes);
++ }
++ else if (begin()->element_type() == DataType::S64)
++ {
++ getBeginAndSizeVectors<int64_t>(input()->shape().num_dims(), begin(), size(), &begins, &sizes);
++ }
++ else
++ {
++ throw std::runtime_error("Unsupported begin type.");
++ }
++ for (int i = input()->shape().num_dims(); i < max_dim; ++i)
++ {
++ begins.push_back(0);
++ sizes.push_back(1);
++ }
++
++ assert(begins.size() == 4);
++ assert(sizes.size() == 4);
++ tflite::SliceParams op_params{};
++ op_params.begin_count = 4;
++ op_params.size_count = 4;
++ for (int i = 0; i < 4; i++)
++ {
++ op_params.begin[i] = begins[3 - i];
++ op_params.size[i] = sizes[3 - i];
++ }
++ switch (input()->element_type())
++ {
++ case DataType::FLOAT32:
++ tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
++ getTensorData<float>(input()), getTensorShape(output()),
++ getTensorData<float>(output()));
++ break;
++ case DataType::U8:
++ tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
++ getTensorData<uint8_t>(input()), getTensorShape(output()),
++ getTensorData<uint8_t>(output()));
++ break;
++ default:
++ throw std::runtime_error("Unsupported input type.");
++ }
++}
++
++} // namespace kernels
++} // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/kernels/Slice.h b/compiler/luci-interpreter/src/kernels/Slice.h
+new file mode 100644
+index 0000000..23c3596
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/Slice.h
+@@ -0,0 +1,44 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H
++#define LUCI_INTERPRETER_KERNELS_SLICE_H
++
++#include "core/Kernel.h"
++
++namespace luci_interpreter
++{
++namespace kernels
++{
++
++class Slice : public Kernel
++{
++public:
++ Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output);
++
++ const Tensor *input() const { return _inputs[0]; }
++ const Tensor *begin() const { return _inputs[1]; }
++ const Tensor *size() const { return _inputs[2]; }
++ Tensor *output() const { return _outputs[0]; }
++
++ void configure() override;
++ void execute() const override;
++};
++
++} // namespace kernels
++} // namespace luci_interpreter
++
++#endif // LUCI_INTERPRETER_KERNELS_SLICE_H
+diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
+new file mode 100644
+index 0000000..a360a29
+--- /dev/null
++++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
+@@ -0,0 +1,64 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "kernels/Slice.h"
++#include "kernels/TestUtils.h"
++
++namespace luci_interpreter
++{
++namespace kernels
++{
++namespace
++{
++
++using namespace testing;
++
++template <typename T> class SliceTest : public ::testing::Test
++{
++};
++
++using DataTypes = ::testing::Types<float, uint8_t>;
++TYPED_TEST_CASE(SliceTest, DataTypes);
++
++TYPED_TEST(SliceTest, SimpleTest)
++{
++ std::vector<TypeParam> input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
++ Shape input_shape{3, 2, 3, 1};
++ std::vector<int32_t> begin_data{1, 0, 0, 0};
++ Shape begin_shape{4};
++ std::vector<int32_t> size_data{2, 1, -1, 1};
++ Shape size_shape{4};
++ std::vector<TypeParam> output_data{3, 3, 3, 5, 5, 5};
++ std::vector<int32_t> output_shape{2, 1, 3, 1};
++
++ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
++ Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
++ Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
++
++ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
++
++ Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor);
++ kernel.configure();
++ kernel.execute();
++
++ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
++ ::testing::ElementsAreArray(output_data));
++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
++}
++
++} // namespace
++} // namespace kernels
++} // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
+index 3386d36..b8c0ac4 100644
+--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
++++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
+@@ -68,6 +68,8 @@ TEST(TransposeConvTest, FloatSimple)
+ /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365},
+ /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
+ getElementType<float>());
++
++ SUCCEED();
+ }
+
+ TEST(TransposeConvTest, FloatTwoFiltersTest)
+@@ -82,21 +84,18 @@ TEST(TransposeConvTest, FloatTwoFiltersTest)
+ 3352, 3652, 2760},
+ /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
+ getElementType<float>());
+-}
+
+-TEST(TransposeConvTest, Uint8Simple)
+-{
+- // TODO
+- // Implement GetDequantizedOutput Function.
+- // Create Test for Uint8 Case
+-}
+-TEST(TransposeConvTest, Uint8FiltersTest)
+-{
+- // TODO
+- // Implement GetDequantizedOutput Function.
+- // Create Test for Uint8 Case
++ SUCCEED();
+ }
+
++// TODO Uint8Simple
++// Implement GetDequantizedOutput Function.
++// Create Test for Uint8 Case
++
++// TODO Uint8FiltersTest
++// Implement GetDequantizedOutput Function.
++// Create Test for Uint8 Case
++
+ } // namespace
+ } // namespace kernels
+ } // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt
+index fb36c4a..d99485d 100644
+--- a/compiler/luci-interpreter/src/loader/CMakeLists.txt
++++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt
+@@ -1,3 +1,5 @@
++nnas_find_package(GTest REQUIRED)
++
+ set(SOURCES
+ GraphLoader.h
+ GraphLoader.cpp
+@@ -13,3 +15,8 @@ target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SO
+ target_link_libraries(luci_interpreter_loader
+ PUBLIC luci_lang luci_interpreter_core
+ PRIVATE luci_interpreter_kernels nncc_common)
++
++set(TEST_SOURCES KernelBuilder.test.cpp)
++
++GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES})
++target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader)
+diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+index 779fa06..6ebf979 100644
+--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
++++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+@@ -16,7 +16,6 @@
+
+ #include "loader/GraphLoader.h"
+
+-#include "loader/ModuleLoader.h"
+ #include "loader/KernelBuilder.h"
+
+ #include <loco/IR/Algorithm.h>
+@@ -71,6 +70,7 @@ bool isExecutableNode(const luci::CircleNode *node)
+ {
+ // These nodes denote inputs / outputs of a graph.
+ case luci::CircleOpcode::CONST:
++ case luci::CircleOpcode::CIRCLECONST:
+ case luci::CircleOpcode::CIRCLEINPUT:
+ case luci::CircleOpcode::CIRCLEOUTPUT:
+ // The following nodes denote outputs of multiple-output nodes.
+@@ -102,11 +102,12 @@ bool isTensorProducingNode(const luci::CircleNode *node)
+
+ } // namespace
+
+-GraphLoader::GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph,
+- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+- std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+- : _module_loader(module_loader), _graph(graph), _runtime_graph(runtime_graph),
+- _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor)
++GraphLoader::GraphLoader(
++ const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
++ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
++ std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
++ : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir),
++ _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
+ {
+ }
+
+@@ -136,6 +137,7 @@ void GraphLoader::loadTensors()
+ const luci::CircleQuantParam *params = node->quantparam();
+ quantization.scale.assign(params->scale.cbegin(), params->scale.cend());
+ quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend());
++ quantization.quantized_dimension = params->quantized_dimension;
+ }
+
+ auto tensor = std::make_unique<Tensor>(node->dtype(), std::move(shape), std::move(quantization),
+@@ -178,7 +180,7 @@ void GraphLoader::initInputOutputTensors() const
+
+ void GraphLoader::loadOperators()
+ {
+- KernelBuilder kernel_builder(_module_loader, *this);
++ KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor);
+
+ // Create kernels for executable nodes. This has to be done in execution order.
+ for (const loco::Node *loco_node :
+@@ -195,11 +197,4 @@ void GraphLoader::loadOperators()
+ }
+ }
+
+-void GraphLoader::load()
+-{
+- loadTensors();
+- initInputOutputTensors();
+- loadOperators();
+-}
+-
+ } // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h
+index e0adc0f..89c5bca 100644
+--- a/compiler/luci-interpreter/src/loader/GraphLoader.h
++++ b/compiler/luci-interpreter/src/loader/GraphLoader.h
+@@ -27,29 +27,23 @@
+ namespace luci_interpreter
+ {
+
+-class ModuleLoader;
+-
+ class GraphLoader
+ {
+ public:
+- GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph,
+- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
++ GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
++ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+ std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor);
+
+- void load();
+-
+- Tensor *getTensorForNode(const loco::Node *node) const { return _node_to_tensor.at(node); }
+-
+-private:
+- void loadOperators();
+- void initInputOutputTensors() const;
+ void loadTensors();
++ void initInputOutputTensors() const;
++ void loadOperators();
+
+- const ModuleLoader &_module_loader;
++private:
+ const loco::Graph *_graph;
+ RuntimeGraph *_runtime_graph;
+ RuntimeToIR &_runtime_to_ir;
+
++ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
+ std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
+ };
+
+diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
+index 56da961..c19f897 100644
+--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
+@@ -21,6 +21,7 @@
+ #include "kernels/AveragePool2D.h"
+ #include "kernels/Concatenation.h"
+ #include "kernels/Conv2D.h"
++#include "kernels/DepthToSpace.h"
+ #include "kernels/DepthwiseConv2D.h"
+ #include "kernels/Elu.h"
+ #include "kernels/FullyConnected.h"
+@@ -35,6 +36,8 @@
+ #include "kernels/Mul.h"
+ #include "kernels/Pad.h"
+ #include "kernels/Reshape.h"
++#include "kernels/Reverse.h"
++#include "kernels/Slice.h"
+ #include "kernels/Softmax.h"
+ #include "kernels/SpaceToDepth.h"
+ #include "kernels/Split.h"
+@@ -43,8 +46,6 @@
+ #include "kernels/Unpack.h"
+ #include "kernels/Transpose.h"
+ #include "kernels/TransposeConv.h"
+-#include "loader/GraphLoader.h"
+-#include "loader/ModuleLoader.h"
+
+ #include <stdexcept>
+
+@@ -68,7 +69,7 @@ static std::vector<const loco::Node *> collectOutputNodes(const luci::CircleNode
+
+ const Tensor *KernelBuilder::getInputTensor(const loco::Node *node) const
+ {
+- const Tensor *tensor = _graph_loader.getTensorForNode(node);
++ const Tensor *tensor = _node_to_tensor.at(node);
+ assert(tensor != nullptr);
+ return tensor;
+ }
+@@ -81,7 +82,7 @@ const Tensor *KernelBuilder::getOptionalInputTensor(const loco::Node *node) cons
+
+ Tensor *KernelBuilder::getOutputTensor(const loco::Node *node) const
+ {
+- Tensor *tensor = _graph_loader.getTensorForNode(node);
++ Tensor *tensor = _node_to_tensor.at(node);
+ assert(tensor != nullptr);
+ return tensor;
+ }
+@@ -98,7 +99,7 @@ KernelBuilder::getOutputTensors(const std::vector<const loco::Node *> &nodes) co
+
+ RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const
+ {
+- RuntimeGraph *runtime_graph = _module_loader.getRuntimeGraph(graph);
++ RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph);
+ assert(runtime_graph != nullptr);
+ return runtime_graph;
+ }
+@@ -120,14 +121,14 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAdd *node)
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleArgMax *node)
+ {
+ assert(node->arity() == 2);
+- const Tensor *input1 = getInputTensor(node->input());
+- const Tensor *input2 = getInputTensor(node->dimension());
++ const Tensor *input = getInputTensor(node->input());
++ const Tensor *axis = getInputTensor(node->dimension());
+ Tensor *output = getOutputTensor(node);
+
+ ArgMaxParams params{};
+ params.output_type = node->output_type();
+
+- return std::make_unique<kernels::ArgMax>(input1, input2, output, params);
++ return std::make_unique<kernels::ArgMax>(input, axis, output, params);
+ }
+
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAveragePool2D *node)
+@@ -188,6 +189,19 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleConv2D *node)
+ return std::make_unique<kernels::Conv2D>(input, filter, bias, output, params);
+ }
+
++std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDepthToSpace *node)
++{
++ assert(node->arity() == 1);
++
++ const Tensor *input = getInputTensor(node->input());
++ Tensor *output = getOutputTensor(node);
++
++ DepthToSpaceParams params{};
++ params.block_size = node->block_size();
++
++ return std::make_unique<kernels::DepthToSpace>(input, output, params);
++}
++
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDepthwiseConv2D *node)
+ {
+ assert(node->arity() == 3);
+@@ -224,14 +238,14 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleFullyConnected *n
+ assert(node->arity() == 3);
+
+ const Tensor *input = getInputTensor(node->input());
+- const Tensor *filter = getInputTensor(node->weights());
++ const Tensor *weights = getInputTensor(node->weights());
+ const Tensor *bias = getOptionalInputTensor(node->bias());
+ Tensor *output = getOutputTensor(node);
+
+ FullyConnectedParams params{};
+ params.activation = node->fusedActivationFunction();
+
+- return std::make_unique<kernels::FullyConnected>(input, filter, bias, output, params);
++ return std::make_unique<kernels::FullyConnected>(input, weights, bias, output, params);
+ }
+
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleIf *node)
+@@ -255,6 +269,11 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleIf *node)
+ else_graph);
+ }
+
++std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleInput *)
++{
++ throw std::runtime_error("Input node cannot be executed.");
++}
++
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleL2Normalize *node)
+ {
+ assert(node->arity() == 1);
+@@ -323,11 +342,6 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLogistic *node)
+ return std::make_unique<kernels::Logistic>(input, output);
+ }
+
+-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleInput *)
+-{
+- throw std::runtime_error("Input node cannot be executed.");
+-}
+-
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMaxPool2D *node)
+ {
+ assert(node->arity() == 1);
+@@ -402,6 +416,30 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReshape *node)
+ return std::make_unique<kernels::Reshape>(input, shape, output);
+ }
+
++std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReverseV2 *node)
++{
++ assert(node->arity() == 2);
++
++ const Tensor *input = getInputTensor(node->tensor());
++ const Tensor *axes = getInputTensor(node->axis());
++ Tensor *output = getOutputTensor(node);
++
++ return std::make_unique<kernels::Reverse>(input, axes, output);
++}
++
++std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSlice *node)
++{
++ assert(node->arity() == 3);
++
++ const Tensor *input = getInputTensor(node->input());
++ const Tensor *begin = getInputTensor(node->begin());
++ const Tensor *size = getInputTensor(node->size());
++
++ Tensor *output = getOutputTensor(node);
++
++ return std::make_unique<kernels::Slice>(input, begin, size, output);
++}
++
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSoftmax *node)
+ {
+ assert(node->arity() == 1);
+@@ -442,6 +480,19 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSplit *node)
+ return std::make_unique<kernels::Split>(axis, input, std::move(outputs));
+ }
+
++std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqueeze *node)
++{
++ assert(node->arity() == 1);
++
++ const Tensor *input = getInputTensor(node->input());
++ Tensor *output = getOutputTensor(node);
++
++ SqueezeParams params{};
++ params.squeeze_dims = node->squeeze_dims();
++
++ return std::make_unique<kernels::Squeeze>(input, output, params);
++}
++
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleStridedSlice *node)
+ {
+ assert(node->arity() == 4);
+@@ -463,21 +514,15 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleStridedSlice *nod
+ return std::make_unique<kernels::StridedSlice>(input, begin, end, strides, output, params);
+ }
+
+-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqueeze *node)
++std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTranspose *node)
+ {
+- assert(node->arity() == 1);
++ assert(node->arity() == 2);
+
+- const Tensor *input = getInputTensor(node->input());
++ const Tensor *input = getInputTensor(node->a());
++ const Tensor *perm = getInputTensor(node->perm());
+ Tensor *output = getOutputTensor(node);
+
+- SqueezeParams params{};
+- assert(node->squeeze_dims().size() <= 4);
+- for (size_t i = 0; i < node->squeeze_dims().size(); i++)
+- {
+- params.squeeze_dims.push_back(node->squeeze_dims().at(i));
+- }
+-
+- return std::make_unique<kernels::Squeeze>(input, output, params);
++ return std::make_unique<kernels::Transpose>(input, perm, output);
+ }
+
+ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTransposeConv *node)
+@@ -515,15 +560,4 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleUnpack *node)
+ return std::make_unique<kernels::Unpack>(input, std::move(outputs), params);
+ }
+
+-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTranspose *node)
+-{
+- assert(node->arity() == 2);
+-
+- const Tensor *input = getInputTensor(node->a());
+- const Tensor *perm = getInputTensor(node->perm());
+- Tensor *output = getOutputTensor(node);
+-
+- return std::make_unique<kernels::Transpose>(input, perm, output);
+-}
+-
+ } // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h
+index 7e30d39..d5c5a4b 100644
+--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h
++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h
+@@ -24,18 +24,18 @@
+
+ #include <memory>
+ #include <vector>
++#include <unordered_map>
+
+ namespace luci_interpreter
+ {
+
+-class GraphLoader;
+-class ModuleLoader;
+-
+ class KernelBuilder : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>
+ {
+ public:
+- KernelBuilder(const ModuleLoader &module_loader, const GraphLoader &graph_loader)
+- : _module_loader(module_loader), _graph_loader(graph_loader)
++ KernelBuilder(
++ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
++ const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
++ : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
+ {
+ }
+
+@@ -45,6 +45,7 @@ public:
+ std::unique_ptr<Kernel> visit(const luci::CircleConcatenation *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleConv2D *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleConst *node) override;
++ std::unique_ptr<Kernel> visit(const luci::CircleDepthToSpace *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleDepthwiseConv2D *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleElu *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleFullyConnected *node) override;
+@@ -61,6 +62,8 @@ public:
+ std::unique_ptr<Kernel> visit(const luci::CircleOutput *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CirclePad *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleReshape *node) override;
++ std::unique_ptr<Kernel> visit(const luci::CircleReverseV2 *node) override;
++ std::unique_ptr<Kernel> visit(const luci::CircleSlice *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSoftmax *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSpaceToDepth *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSplit *node) override;
+@@ -82,8 +85,8 @@ private:
+ RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const;
+
+ private:
+- const ModuleLoader &_module_loader;
+- const GraphLoader &_graph_loader;
++ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
++ const std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
+ };
+
+ } // namespace luci_interpreter
+diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
+new file mode 100644
+index 0000000..33bc8ec
+--- /dev/null
++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
+@@ -0,0 +1,743 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "loader/GraphLoader.h"
++#include "loader/KernelBuilder.h"
++
++#include <kernels/Add.h>
++#include <kernels/ArgMax.h>
++#include <kernels/AveragePool2D.h>
++#include <kernels/Concatenation.h>
++#include <kernels/Conv2D.h>
++#include <kernels/DepthToSpace.h>
++#include <kernels/DepthwiseConv2D.h>
++#include <kernels/Elu.h>
++#include <kernels/FullyConnected.h>
++#include <kernels/L2Normalize.h>
++#include <kernels/L2Pool2D.h>
++#include <kernels/LeakyRelu.h>
++#include <kernels/LocalResponseNormalization.h>
++#include <kernels/Logistic.h>
++#include <kernels/MaxPool2D.h>
++#include <kernels/Mean.h>
++#include <kernels/Mul.h>
++#include <kernels/Pad.h>
++#include <kernels/Reshape.h>
++#include <kernels/Reverse.h>
++#include <kernels/Slice.h>
++#include <kernels/Softmax.h>
++#include <kernels/SpaceToDepth.h>
++#include <kernels/Split.h>
++#include <kernels/Squeeze.h>
++#include <kernels/StridedSlice.h>
++#include <kernels/Transpose.h>
++#include <kernels/TransposeConv.h>
++#include <kernels/Unpack.h>
++
++#include <gmock/gmock.h>
++
++namespace luci_interpreter
++{
++namespace
++{
++
++using namespace testing;
++
++class KernelBuilderTest : public Test
++{
++protected:
++ luci::CircleInput *createInputNode() { return createNode<luci::CircleInput>(); }
++
++ template <typename NodeT, typename... Args> NodeT *createNode(Args &&... args)
++ {
++ auto *node = _graph.nodes()->create<NodeT>(std::forward<Args>(args)...);
++ // The actual type does not matter for the purpose of the tests.
++ // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry
++ // actual output types).
++ node->dtype(loco::DataType::FLOAT32);
++ return node;
++ }
++
++ template <typename NodeOutT> NodeOutT *createNodeOut(loco::Node *node, int index)
++ {
++ auto *node_out = createNode<NodeOutT>();
++ node_out->input(node);
++ node_out->index(index);
++ return node_out;
++ }
++
++ template <typename KernelT> std::unique_ptr<KernelT> buildKernel(const luci::CircleNode *op)
++ {
++ std::unordered_map<const loco::Graph *, RuntimeGraph *> graph_to_runtime_graph;
++
++ RuntimeGraph runtime_graph(nullptr);
++ RuntimeToIR runtime_to_ir;
++ GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph,
++ _node_to_tensor);
++ graph_loader.loadTensors();
++
++ KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor);
++
++ auto kernel = op->accept(&kernel_builder);
++ return std::unique_ptr<KernelT>(dynamic_cast<KernelT *>(kernel.release()));
++ }
++
++ void checkTensor(const Tensor *tensor, const loco::Node *node)
++ {
++ EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node)));
++ }
++
++private:
++ loco::Graph _graph;
++ std::unordered_map<const loco::Node *, Tensor *> _node_to_tensor;
++};
++
++TEST_F(KernelBuilderTest, Add)
++{
++ auto *input1 = createInputNode();
++ auto *input2 = createInputNode();
++
++ auto *op = createNode<luci::CircleAdd>();
++ op->x(input1);
++ op->y(input2);
++
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::Add>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input1(), input1);
++ checkTensor(kernel->input2(), input2);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, ArgMax)
++{
++ auto *input = createInputNode();
++ auto *axis = createInputNode();
++
++ auto *op = createNode<luci::CircleArgMax>();
++ op->input(input);
++ op->dimension(axis);
++
++ op->output_type(loco::DataType::FLOAT32);
++
++ auto kernel = buildKernel<kernels::ArgMax>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->axis(), axis);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().output_type, Eq(op->output_type()));
++}
++
++TEST_F(KernelBuilderTest, AveragePool2D)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleAveragePool2D>();
++ op->value(input);
++
++ op->padding(luci::Padding::SAME);
++ op->filter()->h(11);
++ op->filter()->w(13);
++ op->stride()->h(17);
++ op->stride()->w(19);
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::AveragePool2D>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, Concatenation)
++{
++ auto *input1 = createInputNode();
++ auto *input2 = createInputNode();
++
++ auto *op = createNode<luci::CircleConcatenation>(2);
++ op->values(0, input1);
++ op->values(1, input2);
++ op->axis(11);
++
++ auto kernel = buildKernel<kernels::Concatenation>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(0), input1);
++ checkTensor(kernel->input(1), input2);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
++}
++
++TEST_F(KernelBuilderTest, Conv2D)
++{
++ auto *input = createInputNode();
++ auto *filter = createInputNode();
++ auto *bias = createInputNode();
++
++ auto *op = createNode<luci::CircleConv2D>();
++ op->input(input);
++ op->filter(filter);
++ op->bias(bias);
++
++ op->padding(luci::Padding::SAME);
++ op->stride()->h(11);
++ op->stride()->w(13);
++ op->dilation()->h(17);
++ op->dilation()->w(19);
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::Conv2D>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->filter(), filter);
++ checkTensor(kernel->bias(), bias);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
++ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h()));
++ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w()));
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, DepthToSpace)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleDepthToSpace>();
++ op->input(input);
++
++ op->block_size(11);
++
++ auto kernel = buildKernel<kernels::DepthToSpace>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().block_size, Eq(op->block_size()));
++}
++
++TEST_F(KernelBuilderTest, DepthwiseConv2D)
++{
++ auto *input = createInputNode();
++ auto *filter = createInputNode();
++ auto *bias = createInputNode();
++
++ auto *op = createNode<luci::CircleDepthwiseConv2D>();
++ op->input(input);
++ op->filter(filter);
++ op->bias(bias);
++
++ op->padding(luci::Padding::SAME);
++ op->depthMultiplier(11);
++ op->stride()->h(13);
++ op->stride()->w(17);
++ op->dilation()->h(19);
++ op->dilation()->w(23);
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::DepthwiseConv2D>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->filter(), filter);
++ checkTensor(kernel->bias(), bias);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
++ EXPECT_THAT(kernel->params().depth_multiplier, Eq(op->depthMultiplier()));
++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
++ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h()));
++ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w()));
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, Elu)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleElu>();
++ op->features(input);
++
++ auto kernel = buildKernel<kernels::Elu>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++}
++
++TEST_F(KernelBuilderTest, FullyConnected)
++{
++ auto *input = createInputNode();
++ auto *weights = createInputNode();
++ auto *bias = createInputNode();
++
++ auto *op = createNode<luci::CircleFullyConnected>();
++ op->input(input);
++ op->weights(weights);
++ op->bias(bias);
++
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::FullyConnected>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->weights(), weights);
++ checkTensor(kernel->bias(), bias);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, L2Normalize)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleL2Normalize>();
++ op->x(input);
++
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::L2Normalize>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, L2Pool2D)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleL2Pool2D>();
++ op->value(input);
++
++ op->padding(luci::Padding::SAME);
++ op->filter()->h(11);
++ op->filter()->w(13);
++ op->stride()->h(17);
++ op->stride()->w(19);
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::L2Pool2D>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, LeakyRelu)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleLeakyRelu>();
++ op->features(input);
++
++ op->alpha(11.0f);
++
++ auto kernel = buildKernel<kernels::LeakyRelu>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
++}
++
++TEST_F(KernelBuilderTest, LocalResponseNormalization)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleLocalResponseNormalization>();
++ op->input(input);
++
++ op->radius(11);
++ op->bias(13.0f);
++ op->alpha(15.0f);
++ op->beta(17.0f);
++
++ auto kernel = buildKernel<kernels::LocalResponseNormalization>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().radius, Eq(op->radius()));
++ EXPECT_THAT(kernel->params().bias, Eq(op->bias()));
++ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
++ EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
++}
++
++TEST_F(KernelBuilderTest, Logistic)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleLogistic>();
++ op->x(input);
++
++ auto kernel = buildKernel<kernels::Logistic>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++}
++
++TEST_F(KernelBuilderTest, MaxPool2D)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleMaxPool2D>();
++ op->value(input);
++
++ op->padding(luci::Padding::SAME);
++ op->filter()->h(11);
++ op->filter()->w(13);
++ op->stride()->h(17);
++ op->stride()->w(19);
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::MaxPool2D>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, Mean)
++{
++ auto *input = createInputNode();
++ auto *axes = createInputNode();
++
++ auto *op = createNode<luci::CircleMean>();
++ op->input(input);
++ op->reduction_indices(axes);
++
++ op->keep_dims(true);
++
++ auto kernel = buildKernel<kernels::Mean>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->axes(), axes);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims()));
++}
++
++TEST_F(KernelBuilderTest, Mul)
++{
++ auto *input1 = createInputNode();
++ auto *input2 = createInputNode();
++
++ auto *op = createNode<luci::CircleMul>();
++ op->x(input1);
++ op->y(input2);
++
++ op->fusedActivationFunction(luci::FusedActFunc::RELU);
++
++ auto kernel = buildKernel<kernels::Mul>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input1(), input1);
++ checkTensor(kernel->input2(), input2);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
++}
++
++TEST_F(KernelBuilderTest, Pad)
++{
++ auto *input = createInputNode();
++ auto *paddings = createInputNode();
++
++ auto *op = createNode<luci::CirclePad>();
++ op->input(input);
++ op->paddings(paddings);
++
++ auto kernel = buildKernel<kernels::Pad>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->paddings(), paddings);
++ checkTensor(kernel->output(), op);
++}
++
++TEST_F(KernelBuilderTest, Reshape)
++{
++ auto *input = createInputNode();
++ auto *shape = createInputNode();
++
++ auto *op = createNode<luci::CircleReshape>();
++ op->tensor(input);
++ op->shape(shape);
++
++ auto kernel = buildKernel<kernels::Reshape>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->shape(), shape);
++ checkTensor(kernel->output(), op);
++}
++
++TEST_F(KernelBuilderTest, ReverseV2)
++{
++ auto *input = createInputNode();
++ auto *axes = createInputNode();
++
++ auto *op = createNode<luci::CircleReverseV2>();
++ op->tensor(input);
++ op->axis(axes);
++
++ auto kernel = buildKernel<kernels::Reverse>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->axes(), axes);
++ checkTensor(kernel->output(), op);
++}
++
++TEST_F(KernelBuilderTest, Slice)
++{
++ auto *input = createInputNode();
++ auto *begin = createInputNode();
++ auto *size = createInputNode();
++
++ auto *op = createNode<luci::CircleSlice>();
++ op->input(input);
++ op->begin(begin);
++ op->size(size);
++
++ auto kernel = buildKernel<kernels::Slice>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->begin(), begin);
++ checkTensor(kernel->size(), size);
++ checkTensor(kernel->output(), op);
++}
++
++TEST_F(KernelBuilderTest, Softmax)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleSoftmax>();
++ op->logits(input);
++
++ op->beta(11.0f);
++
++ auto kernel = buildKernel<kernels::Softmax>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
++}
++
++TEST_F(KernelBuilderTest, SpaceToDepth)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleSpaceToDepth>();
++ op->input(input);
++
++ op->block_size(11);
++
++ auto kernel = buildKernel<kernels::SpaceToDepth>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().block_size, Eq(op->block_size()));
++}
++
++TEST_F(KernelBuilderTest, Split)
++{
++ auto *axis = createInputNode();
++ auto *input = createInputNode();
++ auto *op = createNode<luci::CircleSplit>();
++ auto *output1 = createNodeOut<luci::CircleSplitOut>(op, 0);
++ auto *output2 = createNodeOut<luci::CircleSplitOut>(op, 1);
++
++ op->split_dim(axis);
++ op->input(input);
++
++ op->num_split(2);
++
++ auto kernel = buildKernel<kernels::Split>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->axis(), axis);
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(0), output1);
++ checkTensor(kernel->output(1), output2);
++}
++
++TEST_F(KernelBuilderTest, Squeeze)
++{
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleSqueeze>();
++ op->input(input);
++
++ op->squeeze_dims({11, 13});
++
++ auto kernel = buildKernel<kernels::Squeeze>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims()));
++}
++
++TEST_F(KernelBuilderTest, StridedSlice)
++{
++ auto *input = createInputNode();
++ auto *begin = createInputNode();
++ auto *end = createInputNode();
++ auto *strides = createInputNode();
++
++ auto *op = createNode<luci::CircleStridedSlice>();
++ op->input(input);
++ op->begin(begin);
++ op->end(end);
++ op->strides(strides);
++
++ op->begin_mask(11);
++ op->ellipsis_mask(13);
++ op->end_mask(17);
++ op->new_axis_mask(19);
++ op->shrink_axis_mask(23);
++
++ auto kernel = buildKernel<kernels::StridedSlice>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->begin(), begin);
++ checkTensor(kernel->end(), end);
++ checkTensor(kernel->strides(), strides);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask()));
++ EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask()));
++ EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask()));
++ EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask()));
++ EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask()));
++}
++
++TEST_F(KernelBuilderTest, Transpose)
++{
++ auto *input = createInputNode();
++ auto *perm = createInputNode();
++
++ auto *op = createNode<luci::CircleTranspose>();
++ op->a(input);
++ op->perm(perm);
++
++ auto kernel = buildKernel<kernels::Transpose>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->perm(), perm);
++ checkTensor(kernel->output(), op);
++}
++
++TEST_F(KernelBuilderTest, TransposeConv)
++{
++ auto *output_shape = createInputNode();
++ auto *filter = createInputNode();
++ auto *input = createInputNode();
++
++ auto *op = createNode<luci::CircleTransposeConv>();
++ op->inputSizes(output_shape);
++ op->filter(filter);
++ op->outBackprop(input);
++
++ op->padding(luci::Padding::SAME);
++ op->stride()->h(11);
++ op->stride()->w(13);
++
++ auto kernel = buildKernel<kernels::TransposeConv>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->output_shape(), output_shape);
++ checkTensor(kernel->filter(), filter);
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(), op);
++ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
++}
++
++TEST_F(KernelBuilderTest, Unpack)
++{
++ auto *input = createInputNode();
++ auto *op = createNode<luci::CircleUnpack>();
++ auto *output1 = createNodeOut<luci::CircleUnpackOut>(op, 0);
++ auto *output2 = createNodeOut<luci::CircleUnpackOut>(op, 1);
++
++ op->value(input);
++
++ op->num(2);
++ op->axis(11);
++
++ auto kernel = buildKernel<kernels::Unpack>(op);
++ ASSERT_THAT(kernel, NotNull());
++
++ checkTensor(kernel->input(), input);
++ checkTensor(kernel->output(0), output1);
++ checkTensor(kernel->output(1), output2);
++ EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
++}
++
++TEST_F(KernelBuilderTest, NonExisting1_NEG)
++{
++ auto *op = createNode<luci::CircleConst>();
++ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
++}
++
++TEST_F(KernelBuilderTest, NonExisting2_NEG)
++{
++ auto *op = createNode<luci::CircleInput>();
++ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
++}
++
++TEST_F(KernelBuilderTest, NonExisting3_NEG)
++{
++ auto *op = createNode<luci::CircleOutput>();
++ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
++}
++
++} // namespace
++} // namespace luci_interpreter
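The tests above all follow one shape: build a luci node, run it through buildKernel, then compare the kernel's tensors and params against the node. Purely as an illustration of that pattern (reusing the fixture helpers from this file), a test for another two-input arithmetic kernel could read as below; kernels::Add, luci::CircleAdd and their registration in the builder are assumed here, not verified against this revision.

// Hypothetical test mirroring the Mul case above; the Add-specific names are assumptions.
TEST_F(KernelBuilderTest, Add)
{
  auto *input1 = createInputNode();
  auto *input2 = createInputNode();

  auto *op = createNode<luci::CircleAdd>();
  op->x(input1);
  op->y(input2);

  op->fusedActivationFunction(luci::FusedActFunc::RELU);

  auto kernel = buildKernel<kernels::Add>(op);
  ASSERT_THAT(kernel, NotNull());

  checkTensor(kernel->input1(), input1);
  checkTensor(kernel->input2(), input2);
  checkTensor(kernel->output(), op);
  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
}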
+diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
+index 7780a61..b9a2ae0 100644
+--- a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
++++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
+@@ -41,8 +41,11 @@ void ModuleLoader::load()
+ {
+ const loco::Graph *graph = _module->graph(i);
+ RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph);
+- GraphLoader loader(*this, graph, runtime_graph, _runtime_to_ir, _node_to_tensor);
+- loader.load();
++ GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph,
++ _node_to_tensor);
++ loader.loadTensors();
++ loader.initInputOutputTensors();
++ loader.loadOperators();
+ }
+ }
+
+diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h
+index 954dbfb..1af0ed7 100644
+--- a/compiler/luci-interpreter/src/loader/ModuleLoader.h
++++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h
+@@ -36,11 +36,6 @@ public:
+
+ void load();
+
+- RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const
+- {
+- return _graph_to_runtime_graph.at(graph);
+- }
+-
+ private:
+ const luci::Module *_module;
+ RuntimeModule *_runtime_module;
+diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh
+index dfd55a6..12c9a45 100755
+--- a/compiler/luci-value-test/evalverify.sh
++++ b/compiler/luci-value-test/evalverify.sh
+@@ -4,8 +4,10 @@
+ #
+ # HOW TO USE
+ #
+-# ./evalverify.sh <path/to/work_dir> <TEST 1> <TEST 2> ...
+-# work_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
++# ./evalverify.sh <path/to/bin_dir> <path/to/work_dir> <path/to/venv_dir> <TEST 1> <TEST 2> ...
++# bin_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
++# work_dir : artifacts directory where test materials exist
++# venv_dir : python virtual environment home directory
+
+ VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py"
+diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst
+index 6a332f9..364d881 100644
+--- a/compiler/luci-value-test/test.lst
++++ b/compiler/luci-value-test/test.lst
+@@ -1,6 +1,8 @@
+ #addeval(Abs_000)
+ addeval(Add_000)
++#addeval(Add_001)
+ addeval(Add_U8_000)
++#addeval(AddN_000)
+ #addeval(ArgMax_000)
+ #addeval(ArgMax_001)
+ #addeval(ArgMax_002)
+@@ -9,73 +11,173 @@ addeval(Add_U8_000)
+ #addeval(ArgMax_U8_001)
+ #addeval(ArgMax_U8_002)
+ #addeval(ArgMax_U8_003)
++#addeval(ArgMin_000)
++#addeval(ArgMin_001)
++#addeval(ArgMin_002)
++#addeval(ArgMin_003)
++#addeval(ArgMin_U8_000)
++#addeval(ArgMin_U8_001)
++#addeval(ArgMin_U8_002)
++#addeval(ArgMin_U8_003)
+ addeval(AveragePool2D_000)
++#addeval(BatchMatMul_000)
+ #addeval(BatchMatMulV2_000)
+ #addeval(BatchMatMulV2_001)
+ #addeval(BatchToSpaceND_000)
+ #addeval(Cast_000)
++#addeval(Cast_001)
++#addeval(Ceil_000)
+ addeval(Concatenation_000)
+ addeval(Concatenation_U8_000)
+ addeval(Conv2D_000)
+ addeval(Conv2D_001)
+ addeval(Conv2D_002)
++#addeval(Conv2D_003)
+ addeval(Conv2D_U8_000)
+ addeval(Conv2D_U8_001)
+ #addeval(Cos_000)
++#addeval(DepthToSpace_000)
+ addeval(DepthwiseConv2D_000)
+ addeval(DepthwiseConv2D_U8_000)
++#addeval(DepthwiseConv2D_U8_001)
++addeval(DepthwiseConv2D_001)
+ #addeval(Div_000)
++#addeval(ELU_000)
+ #addeval(Equal_000)
+ #addeval(Exp_000)
++#addeval(ExpandDims_000)
++#addeval(ExpandDims_001)
++#addeval(ExpandDims_002)
++#addeval(ExpandDims_003)
++#addeval(Fill_000)
++#addeval(Fill_001)
++#addeval(Floor_000)
++#addeval(FloorDiv_000)
++#addeval(FloorDiv_001)
++#addeval(FloorMod_000)
++#addeval(FloorMod_001)
+ addeval(FullyConnected_000)
+ addeval(FullyConnected_001)
+ #addeval(FullyConnected_002)
+ #addeval(FullyConnected_U8_000)
+ #addeval(Gather_000)
++#addeval(GatherNd_000)
++#addeval(Greater_000)
++#addeval(GreaterEqual_000)
+ #addeval(If_000)
+ #addeval(If_001)
++addeval(L2Normalize_000)
++addeval(L2Pool2D_000)
++#addeval(L2Pool2D_U8_000)
++#addeval(LeakyRelu_000)
++#addeval(Less_000)
++#addeval(LessEqual_000)
++#addeval(LocalResponseNormalization_000)
++#addeval(Log_000)
++#addeval(LogicalAnd_000)
+ #addeval(LogicalNot_000)
+ #addeval(LogicalOr_000)
+-#addeval(Logistic_000)
++addeval(Logistic_000)
++#addeval(LogSoftmax_000)
++#addeval(MatMul_000)
++#addeval(MatrixDiag_000)
++#addeval(MatrixSetDiag_000)
++#addeval(Maximum_000)
+ addeval(MaxPool2D_000)
+ addeval(MaxPool2D_U8_000)
+ addeval(Mean_000)
+ addeval(Mean_001)
+ addeval(Mean_U8_000)
++#addeval(Minimum_000)
++#addeval(MirrorPad_000)
+ addeval(Mul_000)
+ #addeval(Mul_U8_000)
++#addeval(Neg_000)
++#addeval(NotEqual_000)
++#addeval(OneHot_000)
++#addeval(OneHot_001)
++#addeval(OneHot_002)
++#addeval(OneHot_003)
+ #addeval(Pack_000)
+ #addeval(Pack_U8_000)
+ addeval(Pad_000)
+ addeval(Pad_U8_000)
++#addeval(Pow_000)
++#addeval(PRelu_000)
++#addeval(Range_000)
++#addeval(Rank_000)
++#addeval(ReduceAny_000)
++#addeval(ReduceAny_001)
++#addeval(ReduceAny_002)
++#addeval(ReduceAny_003)
++#addeval(ReduceMax_000)
++#addeval(ReduceMin_000)
+ #addeval(ReduceProd_000)
+ #addeval(ReduceProd_001)
+ #addeval(ReduceProd_002)
+ #addeval(ReduceProd_003)
+ #addeval(ReLU_000)
++#addeval(ReLU6_000)
++#addeval(ReLUN1To1_000)
+ addeval(Reshape_000)
+ addeval(Reshape_001)
+ addeval(Reshape_002)
+ #addeval(Reshape_003)
+ addeval(Reshape_U8_000)
++#addeval(ResizeBilinear_000)
++#addeval(ResizeNearestNeighbor_000)
++#addeval(ReverseSequence_000)
++#addeval(ReverseV2_000)
++#addeval(Round_000)
+ #addeval(Rsqrt_000)
++#addeval(ScatterNd_000)
++#addeval(SegmentSum_000)
++#addeval(Select_000)
++#addeval(Select_001)
++#addeval(Select_002)
++#addeval(SelectV2_000)
++#addeval(SelectV2_001)
++#addeval(SelectV2_002)
++#addeval(Shape_000)
+ #addeval(Sin_000)
++addeval(Slice_000)
+ addeval(Softmax_000)
+ #addeval(Softmax_U8_000)
+ #addeval(SpaceToBatchND_000)
+ #addeval(SpaceToBatchND_001)
+ #addeval(SpaceToBatchND_002)
+ #addeval(SpaceToBatchND_003)
+-#addeval(StridedSlice_000)
+-#addeval(StridedSlice_001)
++#addeval(SpaceToDepth_000)
++#addeval(SparseToDense_000)
++#addeval(Split_000)
++#addeval(SplitV_000)
++#addeval(Sqrt_000)
++#addeval(Square_000)
++#addeval(SquaredDifference_000)
++addeval(Squeeze_000)
++addeval(StridedSlice_000)
++addeval(StridedSlice_001)
++addeval(StridedSlice_002)
+ #addeval(Sub_000)
+ #addeval(Sub_U8_000)
++#addeval(Sum_000)
++#addeval(Sum_001)
+ #addeval(Tanh_000)
+ #addeval(Tile_000)
+ #addeval(Tile_U8_000)
+-#addeval(Transpose_000)
++#addeval(TopKV2_000)
++#addeval(TopKV2_001)
++addeval(Transpose_000)
++#addeval(TransposeConv_000)
+ #addeval(Unpack_000)
+ #addeval(Unpack_001)
+ #addeval(Unpack_002)
++addeval(Unpack_003)
++#addeval(Where_000)
++#addeval(Where_001)
+ #addeval(While_000)
+ #addeval(While_001)
++#addeval(While_002)
++#addeval(While_003)
++#addeval(YUV_TO_RGB_U8_000)
++#addeval(ZerosLike_000)
+diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp
+index 3c01b67..344c99f 100644
+--- a/compiler/luci/export/src/CircleOperationExporter.cpp
++++ b/compiler/luci/export/src/CircleOperationExporter.cpp
+@@ -890,7 +890,7 @@ void OperationExporter::visit(luci::CircleSpaceToDepth *node)
+ {
+ export_simple(node, circle::BuiltinOperator_SPACE_TO_DEPTH,
+ circle::BuiltinOptions_SpaceToDepthOptions,
+- CreateSpaceToDepthOptions(builder).Union());
++ CreateSpaceToDepthOptions(builder, node->block_size()).Union());
+ }
+
+ void OperationExporter::visit(luci::CircleSparseToDense *node)
+diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
+index 5cad392..dc8c2fb 100644
+--- a/compiler/luci/export/src/CircleTensorExporter.cpp
++++ b/compiler/luci/export/src/CircleTensorExporter.cpp
+@@ -302,7 +302,10 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam
+ scale = builder.CreateVector(quantparam->scale);
+ zero_point = builder.CreateVector(quantparam->zerop);
+ }
+- return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point);
++ // Note: QuantizationDetails is not supported
++ return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point,
++ circle::QuantizationDetails::QuantizationDetails_NONE,
++ 0, quantparam->quantized_dimension);
+ }
+
+ void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder,
+diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp
+index 81e945d..bc7f397 100644
+--- a/compiler/luci/import/src/CircleReader.cpp
++++ b/compiler/luci/import/src/CircleReader.cpp
+@@ -156,6 +156,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
+ const auto &max = quantization->max;
+ const auto &scale = quantization->scale;
+ const auto &zero_point = quantization->zero_point;
++ const auto &quantized_dimension = quantization->quantized_dimension;
+
+ if ((!min.empty() && !max.empty()) || (!scale.empty() && !zero_point.empty()))
+ {
+@@ -165,6 +166,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
+ quantparam->max = max;
+ quantparam->scale = scale;
+ quantparam->zerop = zero_point;
++ quantparam->quantized_dimension = quantized_dimension;
+
+ return quantparam;
+ }
+diff --git a/compiler/luci/import/src/Importer.test.cpp b/compiler/luci/import/src/Importer.test.cpp
+index 4426e15..8366546 100644
+--- a/compiler/luci/import/src/Importer.test.cpp
++++ b/compiler/luci/import/src/Importer.test.cpp
+@@ -20,4 +20,9 @@
+
+ #include <gtest/gtest.h>
+
+-TEST(TensorFlowLiteImport, Dummy) { luci::Importer import; }
++TEST(TensorFlowLiteImport, Dummy)
++{
++ luci::Importer import;
++
++ SUCCEED();
++}
+diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+index 85e7e55..c77c55e 100644
+--- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp
++++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+@@ -32,21 +32,7 @@ bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const
+ if (outputs.size() != 1)
+ return false;
+
+- // Must be one of the following types
+- // float16, float32, float64, complex64, or complex128
+ const auto &tensors = args.reader.tensors();
+- const auto &tensor = tensors.at(inputs[0]);
+- switch (tensor->type)
+- {
+- case circle::TensorType_FLOAT16:
+- case circle::TensorType_FLOAT32:
+- case circle::TensorType_FLOAT64:
+- case circle::TensorType_COMPLEX64:
+- break;
+- default:
+- return false;
+- }
+-
+ if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type)
+ return false;
+
+diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+index 7bdf46d..eb0956c 100644
+--- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
++++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+@@ -30,6 +30,24 @@ bool CircleTransposeConvGraphBuilder::validate(const ValidateArgs &args) const
+ if (args.op.inputs.size() != 3)
+ return false;
+
++ const auto &inputs = args.op.inputs;
++ const auto &tensors = args.reader.tensors();
++ const auto &filter_tensor = tensors.at(inputs[1]);
++ const auto &filter_shape = filter_tensor.get()->shape;
++ const auto &ifm_tensor = tensors.at(inputs[2]);
++ const auto &ifm_shape = ifm_tensor.get()->shape;
++
++ // ifm and filters must be 4-D tensor
++ if (ifm_shape.size() != 4)
++ return false;
++ if (filter_shape.size() != 4)
++ return false;
++
++ // input shape : [batch, height, width, in_channels]
++ // filters shape : [output_channels, height, width, in_channels]
++ if (ifm_tensor.get()->shape.at(3) != filter_tensor.get()->shape.at(3))
++ return false;
++
+ return true;
+ }
+
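
The added validation boils down to a simple shape rule: both the input feature map and the filter must be 4-D, and their innermost (in_channels) dimensions must agree. A minimal standalone restatement of that rule, with illustrative names only:

#include <cstdint>
#include <vector>

// ifm    : [batch, height, width, in_channels]
// filter : [output_channels, height, width, in_channels]
bool transpose_conv_shapes_ok(const std::vector<int32_t> &ifm_shape,
                              const std::vector<int32_t> &filter_shape)
{
  if (ifm_shape.size() != 4 || filter_shape.size() != 4)
    return false;
  // in_channels of the input must match in_channels of the filter
  return ifm_shape.at(3) == filter_shape.at(3);
}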
+diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+index 488dcfb..acd7921 100644
+--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
++++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+@@ -120,6 +120,7 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected)
+ CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather)
+ CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm)
+ // Virtual node(s)
++CIRCLE_NODE(CIRCLECONST, void)
+ CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput)
+ CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput)
+ CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy)
+diff --git a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
+index 7253e65..6944373 100644
+--- a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
++++ b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
+@@ -29,6 +29,7 @@ struct CircleQuantParam
+ std::vector<float> max;
+ std::vector<float> scale;
+ std::vector<int64_t> zerop;
++ int32_t quantized_dimension{0};
+ };
+
+ } // namespace luci
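
The new quantized_dimension field records which axis the scale/zerop vectors are indexed by when a tensor is quantized per channel. A hypothetical way of filling the struct for a symmetric, per-channel weight tensor — the choice of axis 0 as the channel axis is an assumption about the weight layout:

#include <luci/IR/CircleQuantParam.h>
#include <memory>

std::unique_ptr<luci::CircleQuantParam> make_per_channel_qparam()
{
  auto qparam = std::make_unique<luci::CircleQuantParam>();
  qparam->scale = {0.02f, 0.015f, 0.03f}; // one scale per channel
  qparam->zerop = {0, 0, 0};              // symmetric quantization -> zero points stay 0
  qparam->quantized_dimension = 0;        // axis 0 assumed to be the channel axis
  return qparam;
}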
+diff --git a/compiler/luci/lang/src/Module.test.cpp b/compiler/luci/lang/src/Module.test.cpp
+index 26bf073..a5973e5 100644
+--- a/compiler/luci/lang/src/Module.test.cpp
++++ b/compiler/luci/lang/src/Module.test.cpp
+@@ -22,7 +22,7 @@ TEST(ModuleTest, consturctor)
+ {
+ auto gs = luci::make_module();
+
+- GTEST_SUCCEED();
++ SUCCEED();
+ }
+
+ TEST(ModuleTest, add)
+diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
+index 74ea82c..c07268c 100644
+--- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
++++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
+@@ -35,7 +35,12 @@ TEST(CircleCustomTest, constructor)
+ ASSERT_EQ(0, custom_node.custom_code().size());
+ }
+
+-TEST(CircleCustomTest, constructor_NEG) { ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); }
++TEST(CircleCustomTest, constructor_NEG)
++{
++ ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, "");
++
++ SUCCEED();
++}
+
+ TEST(CircleCustomTest, invalidIndex_NEG)
+ {
+diff --git a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
+index e3c8c9f..35f28e9 100644
+--- a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
++++ b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
+@@ -41,11 +41,15 @@ TEST(CircleIfTest, constructor)
+ TEST(CircleIfTestDeath, invalid_arity_NEG)
+ {
+ ASSERT_DEBUG_DEATH(luci::CircleIf very_long_name_if_node(0, 1), "");
++
++ SUCCEED();
+ }
+
+ TEST(CircleIfTestDeath, invalid_output_count_NEG)
+ {
+ ASSERT_DEBUG_DEATH(luci::CircleIf if_node(2, 0), "");
++
++ SUCCEED();
+ }
+
+ TEST(CircleIfTestDeath, invalid_input_get_index_NEG)
+diff --git a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
+index 19290c0..913686f 100644
+--- a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
++++ b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
+@@ -41,11 +41,15 @@ TEST(CircleWhileTest, constructor)
+ TEST(CircleWhileTestDeath, invalid_arity_NEG)
+ {
+ ASSERT_DEBUG_DEATH(luci::CircleWhile very_long_name_while_node(0, 1), "");
++
++ SUCCEED();
+ }
+
+ TEST(CircleWhileTestDeath, invalid_output_count_NEG)
+ {
+ ASSERT_DEBUG_DEATH(luci::CircleWhile while_node(2, 0), "");
++
++ SUCCEED();
+ }
+
+ TEST(CircleWhileTestDeath, invalid_input_get_index_NEG)
+diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
+index 90fbe90..2edf7a9 100644
+--- a/compiler/luci/pass/src/CircleOptimizer.cpp
++++ b/compiler/luci/pass/src/CircleOptimizer.cpp
+@@ -145,7 +145,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
+ {
+ static const std::vector<std::string> fakeq_supported_input_dtype{"float32"};
+ static const std::vector<std::string> fakeq_supported_output_dtype{"uint8"};
+- static const std::vector<std::string> fakeq_supported_granularity{"layer"};
++ static const std::vector<std::string> fakeq_supported_granularity{"layer", "channel"};
+
+ auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
+ auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
+@@ -173,7 +173,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
+ {
+ static const std::vector<std::string> qwmm_supported_input_dtype{"float32"};
+ static const std::vector<std::string> qwmm_supported_output_dtype{"uint8"};
+- static const std::vector<std::string> qwmm_supported_granularity{"layer"};
++ static const std::vector<std::string> qwmm_supported_granularity{"layer", "channel"};
+
+ auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
+ auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
+diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp
+index b81db88..edbaefa 100644
+--- a/compiler/luci/pass/src/FuseBCQPass.cpp
++++ b/compiler/luci/pass/src/FuseBCQPass.cpp
+@@ -67,14 +67,190 @@ const std::string node_name_prefix(luci::NodeName node_name)
+ return prefix;
+ }
+
++/**
++ * @brief Create a CircleOutputExclude operation that has the same shape and dtype as
++ * the original circle_node.
++ */
++luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node)
++{
++ auto graph = circle_node->graph();
++ auto noOp = graph->nodes()->create<luci::CircleOutputExclude>();
++
++ if (circle_node->shape_status() == luci::ShapeStatus::VALID)
++ {
++ noOp->dtype(circle_node->dtype());
++ noOp->rank(circle_node->rank());
++ for (uint32_t i = 0; i < circle_node->rank(); ++i)
++ noOp->dim(i) = circle_node->dim(i);
++ }
++ else
++ {
++ // For type inference
++ noOp->dtype(loco::DataType::FLOAT32);
++ }
++
++ return noOp;
++};
++
+ } // namespace
+
+ namespace
+ {
+
+-class BCQConverter final
++// V means the version of BCQ.
++template <int32_t V> class BCQFuser;
++
++template <> class BCQFuser<1>
+ {
+ public:
++ bool fuseBCQ(loco::Graph *g)
++ {
++ bool changed = false;
++
++ for (auto node : loco::all_nodes(g))
++ {
++ if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
++ {
++ add_BCQ_info_node(circle_const);
++ }
++ }
++
++ if (!is_bcqinfo_valid())
++ return false;
++
++ for (auto node : loco::active_nodes(loco::output_nodes(g)))
++ {
++ if (auto gather = dynamic_cast<luci::CircleGather *>(node))
++ {
++ auto params = dynamic_cast<luci::CircleConst *>(gather->params());
++ if (params != nullptr && has_BCQ_info(params))
++ {
++ auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
++
++ bcq_gather->op_version(1);
++ bcq_gather->input_scales(get_alpha(params));
++ bcq_gather->input_binary(get_packed_binary_code(params));
++ bcq_gather->indices(gather->indices());
++ bcq_gather->input_clusters(packed_clusters(params));
++
++ // input_binary shape : [output_size, hidden_size]
++ const auto binary_hidden_size =
++ loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32;
++ bcq_gather->input_hidden_size(binary_hidden_size);
++
++ if (do_w_x(params))
++ {
++ bcq_gather->axis(gather->axis());
++ }
++ else
++ {
++ const auto axis_transpose = (gather->axis() == 0) ? 1 : 0;
++ bcq_gather->axis(axis_transpose);
++ }
++
++ loco::replace(gather).with(bcq_gather);
++
++ changed = true;
++ }
++ }
++ else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
++ {
++ auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights());
++ if (weights != nullptr && has_BCQ_info(weights))
++ {
++ auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
++
++ bcq_fc->op_version(1);
++ bcq_fc->weights_scales(get_alpha(weights));
++ bcq_fc->weights_binary(get_packed_binary_code(weights));
++ bcq_fc->bias(fully_connected->bias());
++ bcq_fc->weights_clusters(packed_clusters(weights));
++ bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
++
++ loco::Node *bcq_input = fully_connected->input();
++ int32_t batch_rank = 0;
++
++ // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2
++ const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input());
++ if (original_input->shape_status() == luci::ShapeStatus::VALID &&
++ original_input->rank() > 2)
++ {
++ auto new_shape = g->nodes()->create<luci::CircleConst>();
++ new_shape->dtype(loco::DataType::S32);
++ new_shape->size<loco::DataType::S32>(2);
++ new_shape->rank(1);
++ new_shape->dim(0) = 2;
++
++ auto batch_size = 1;
++ for (uint32_t i = 0; i < original_input->rank() - 1; ++i)
++ batch_size *= original_input->dim(i).value();
++
++ new_shape->at<loco::DataType::S32>(0) = batch_size;
++ new_shape->at<loco::DataType::S32>(1) =
++ original_input->dim(original_input->rank() - 1).value();
++ new_shape->shape_status(luci::ShapeStatus::VALID);
++
++ auto reshape = g->nodes()->create<luci::CircleReshape>();
++ reshape->tensor(original_input);
++ reshape->shape(new_shape);
++
++ bcq_input = reshape;
++ batch_rank = original_input->rank() - 2;
++ }
++
++ // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected
++ if (do_w_x(weights))
++ {
++ const auto binary_hidden_size =
++ loco::must_cast<luci::CircleNode *>(fully_connected->input())
++ ->dim(batch_rank)
++ .value();
++ bcq_fc->weights_hidden_size(binary_hidden_size);
++ bcq_fc->input(bcq_input);
++ loco::replace(fully_connected).with(bcq_fc);
++ }
++ else
++ {
++ const auto binary_hidden_size =
++ loco::must_cast<luci::CircleNode *>(fully_connected->input())
++ ->dim(1 + batch_rank)
++ .value();
++ bcq_fc->weights_hidden_size(binary_hidden_size);
++
++ auto perm = g->nodes()->create<luci::CircleConst>();
++ perm->dtype(loco::DataType::S32);
++ perm->size<loco::DataType::S32>(2);
++ perm->rank(1);
++ perm->dim(0) = 2;
++ perm->at<loco::DataType::S32>(0) = 1;
++ perm->at<loco::DataType::S32>(1) = 0;
++ perm->shape_status(luci::ShapeStatus::VALID);
++
++ auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
++ input_transpose->a(bcq_input);
++ input_transpose->perm(perm);
++
++ bcq_fc->input(input_transpose);
++
++ auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
++ output_transpose->a(bcq_fc);
++ output_transpose->perm(perm);
++
++ loco::replace(fully_connected).with(output_transpose);
++ }
++
++ changed = true;
++ }
++ }
++ }
++
++ if (changed)
++ clear_BCQ_nodes();
++
++ return changed;
++ }
++
++private:
+ void add_BCQ_info_node(luci::CircleConst *node)
+ {
+ const auto node_name = node->name();
+@@ -119,16 +295,65 @@ public:
+ return has_info;
+ }
+
++ /**
++ * @brief Exclude BCQ information nodes, which are used for fusing BCQ operations,
++ * from the graph output by replacing them with CircleOutputExclude
++ */
++ void clear_BCQ_nodes()
++ {
++ auto clear_nodes = [](std::map<std::string, luci::CircleConst *> &nodes) {
++ for (auto &n : nodes)
++ {
++ auto node = n.second;
++
++ for (auto s : loco::succs(node))
++ {
++ if (auto outnode = dynamic_cast<luci::CircleOutput *>(s))
++ {
++ outnode->from(createNoOp(node));
++ }
++ else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s))
++ {
++ for (auto o : loco::succs(reshape_node))
++ {
++ auto circle_output = loco::must_cast<luci::CircleOutput *>(o);
++ circle_output->from(createNoOp(reshape_node));
++ }
++ }
++ }
++ }
++ };
++
++ clear_nodes(_do_w_x);
++ clear_nodes(_alpha);
++ clear_nodes(_packed_binary_code);
++ clear_nodes(_number_of_clusters);
++ clear_nodes(_size_of_clusters);
++ clear_nodes(_qbits_of_clusters);
++ clear_nodes(_dequant_weight);
++ }
++
++ bool is_bcqinfo_valid()
++ {
++ // do_w_x should be int32 or bool type
++ for (auto n : _do_w_x)
++ {
++ if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32)
++ return false;
++ }
++
++ return true;
++ }
++
++private:
+ bool do_w_x(luci::CircleConst *node)
+ {
+ const auto prefix = node_name_prefix(node->name());
+
+ if (_do_w_x[prefix]->dtype() == loco::DataType::S32)
+ return _do_w_x[prefix]->at<loco::DataType::S32>(0) == 1;
+- else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL)
+- return _do_w_x[prefix]->at<loco::DataType::BOOL>(0);
+ else
+- throw std::runtime_error("do_w_x should be int or bool");
++ return _do_w_x[prefix]->at<loco::DataType::BOOL>(0);
+ }
+
+ luci::CircleConst *get_alpha(luci::CircleConst *node)
+@@ -187,64 +412,6 @@ public:
+ return packed_clusters;
+ }
+
+- /**
+- * @brief Exclude BCQ information nodes which are used for fusing BCQ operations
+- * from graph output by using CircleOutputExclude
+- */
+- void clear_BCQ_nodes()
+- {
+- auto createNoOp = [](luci::CircleNode *circle_node) {
+- auto graph = circle_node->graph();
+- auto noOp = graph->nodes()->create<luci::CircleOutputExclude>();
+-
+- if (circle_node->shape_status() == luci::ShapeStatus::VALID)
+- {
+- noOp->dtype(circle_node->dtype());
+- noOp->rank(circle_node->rank());
+- for (uint32_t i = 0; i < circle_node->rank(); ++i)
+- noOp->dim(i) = circle_node->dim(i);
+- }
+- else
+- {
+- // For type inference
+- noOp->dtype(loco::DataType::FLOAT32);
+- }
+-
+- return noOp;
+- };
+-
+- auto clear_nodes = [createNoOp](std::map<std::string, luci::CircleConst *> &nodes) {
+- for (auto &n : nodes)
+- {
+- auto node = n.second;
+-
+- for (auto s : loco::succs(node))
+- {
+- if (auto outnode = dynamic_cast<luci::CircleOutput *>(s))
+- {
+- outnode->from(createNoOp(node));
+- }
+- else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s))
+- {
+- for (auto o : loco::succs(reshape_node))
+- {
+- auto circle_output = loco::must_cast<luci::CircleOutput *>(o);
+- circle_output->from(createNoOp(reshape_node));
+- }
+- }
+- }
+- }
+- };
+-
+- clear_nodes(_do_w_x);
+- clear_nodes(_alpha);
+- clear_nodes(_packed_binary_code);
+- clear_nodes(_number_of_clusters);
+- clear_nodes(_size_of_clusters);
+- clear_nodes(_qbits_of_clusters);
+- clear_nodes(_dequant_weight);
+- }
+-
+ private:
+ std::map<std::string, luci::CircleConst *> _do_w_x;
+ std::map<std::string, luci::CircleConst *> _alpha;
+@@ -262,142 +429,9 @@ namespace luci
+
+ bool FuseBCQPass::run(loco::Graph *g)
+ {
+- BCQConverter converter;
+-
+ bool changed = false;
+
+- for (auto node : loco::all_nodes(g))
+- {
+- if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
+- {
+- converter.add_BCQ_info_node(circle_const);
+- }
+- }
+-
+- for (auto node : loco::active_nodes(loco::output_nodes(g)))
+- {
+- if (auto gather = dynamic_cast<luci::CircleGather *>(node))
+- {
+- auto params = dynamic_cast<luci::CircleConst *>(gather->params());
+- if (params != nullptr && converter.has_BCQ_info(params))
+- {
+- auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
+-
+- bcq_gather->input_scales(converter.get_alpha(params));
+- bcq_gather->input_binary(converter.get_packed_binary_code(params));
+- bcq_gather->indices(gather->indices());
+- bcq_gather->input_clusters(converter.packed_clusters(params));
+-
+- const auto binary_hidden_size =
+- loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32;
+- bcq_gather->input_hidden_size(binary_hidden_size);
+-
+- if (converter.do_w_x(params))
+- {
+- bcq_gather->axis(gather->axis());
+- }
+- else
+- {
+- const auto axis_transpose = (gather->axis() == 0) ? 1 : 0;
+- bcq_gather->axis(axis_transpose);
+- }
+-
+- loco::replace(gather).with(bcq_gather);
+-
+- changed = true;
+- }
+- }
+- else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
+- {
+- auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights());
+- if (weights != nullptr && converter.has_BCQ_info(weights))
+- {
+- auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
+-
+- bcq_fc->weights_scales(converter.get_alpha(weights));
+- bcq_fc->weights_binary(converter.get_packed_binary_code(weights));
+- bcq_fc->bias(fully_connected->bias());
+- bcq_fc->weights_clusters(converter.packed_clusters(weights));
+- bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
+-
+- loco::Node *bcq_input = fully_connected->input();
+- int32_t batch_rank = 0;
+-
+- // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2
+- const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input());
+- if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2)
+- {
+- auto new_shape = g->nodes()->create<luci::CircleConst>();
+- new_shape->dtype(loco::DataType::S32);
+- new_shape->size<loco::DataType::S32>(2);
+- new_shape->rank(1);
+- new_shape->dim(0) = 2;
+-
+- auto batch_size = 1;
+- for (uint32_t i = 0; i < original_input->rank() - 1; ++i)
+- batch_size *= original_input->dim(i).value();
+-
+- new_shape->at<loco::DataType::S32>(0) = batch_size;
+- new_shape->at<loco::DataType::S32>(1) =
+- original_input->dim(original_input->rank() - 1).value();
+- new_shape->shape_status(ShapeStatus::VALID);
+-
+- auto reshape = g->nodes()->create<luci::CircleReshape>();
+- reshape->tensor(original_input);
+- reshape->shape(new_shape);
+-
+- bcq_input = reshape;
+- batch_rank = original_input->rank() - 2;
+- }
+-
+- // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected
+- if (converter.do_w_x(weights))
+- {
+- const auto binary_hidden_size =
+- loco::must_cast<luci::CircleNode *>(fully_connected->input())
+- ->dim(batch_rank)
+- .value();
+- bcq_fc->weights_hidden_size(binary_hidden_size);
+- bcq_fc->input(bcq_input);
+- loco::replace(fully_connected).with(bcq_fc);
+- }
+- else
+- {
+- const auto binary_hidden_size =
+- loco::must_cast<luci::CircleNode *>(fully_connected->input())
+- ->dim(1 + batch_rank)
+- .value();
+- bcq_fc->weights_hidden_size(binary_hidden_size);
+-
+- auto perm = g->nodes()->create<luci::CircleConst>();
+- perm->dtype(loco::DataType::S32);
+- perm->size<loco::DataType::S32>(2);
+- perm->rank(1);
+- perm->dim(0) = 2;
+- perm->at<loco::DataType::S32>(0) = 1;
+- perm->at<loco::DataType::S32>(1) = 0;
+- perm->shape_status(ShapeStatus::VALID);
+-
+- auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
+- input_transpose->a(bcq_input);
+- input_transpose->perm(perm);
+-
+- bcq_fc->input(input_transpose);
+-
+- auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
+- output_transpose->a(bcq_fc);
+- output_transpose->perm(perm);
+-
+- loco::replace(fully_connected).with(output_transpose);
+- }
+-
+- changed = true;
+- }
+- }
+- }
+-
+- if (changed)
+- converter.clear_BCQ_nodes();
++ changed = BCQFuser<1>().fuseBCQ(g);
+
+ return changed;
+ }
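
In the fuser above, input_hidden_size / weights_hidden_size is recovered from the packed binary code as dim(1) * 32, i.e. the packing is assumed to store 32 one-bit weights per int32 element along the hidden axis. A small arithmetic sketch of that assumption:

#include <cstdint>

// Assumed packing: a binary matrix of shape [output_size, hidden_size] is stored
// as int32 data of shape [output_size, hidden_size / 32].
uint32_t packed_width(uint32_t hidden_size) { return (hidden_size + 31) / 32; }
uint32_t recovered_hidden_size(uint32_t packed_dim1) { return packed_dim1 * 32; }
// e.g. hidden_size 1024 -> packed width 32, and 32 * 32 recovers 1024.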
+diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp
+index 6726ce7..9c9e741 100644
+--- a/compiler/luci/pass/src/QuantizationUtils.cpp
++++ b/compiler/luci/pass/src/QuantizationUtils.cpp
+@@ -99,6 +99,13 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t
+ nudged_zero_point = static_cast<uint8_t>(std::round(zero_point_double));
+ }
+
++ // protect scale from being very low due to overflow
++ if (scale < 1e-5)
++ {
++ scale = 1e-5;
++ nudged_zero_point = static_cast<uint8_t>(std::round(qmin_double - rmin / scale));
++ }
++
+ nudged_min = static_cast<float>((qmin_double - nudged_zero_point) * scale);
+ nudged_max = static_cast<float>((qmax_double - nudged_zero_point) * scale);
+
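The new guard exists because a near-degenerate range (rmax close to rmin) drives the scale toward zero, and then rmin / scale blows up when the zero point is nudged. A self-contained sketch of the arithmetic being protected — the uint8 range and the simplified clamping are assumptions for illustration, not a copy of compute_asym_scale_zp:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Floor the scale at 1e-5 so the derived zero point stays finite.
void asym_scale_zp_sketch(float rmin, float rmax, float &scale, int64_t &zero_point)
{
  const double qmin = 0.0, qmax = 255.0; // uint8 range assumed
  rmin = std::min(rmin, 0.0f);           // the represented range must contain 0
  rmax = std::max(rmax, 0.0f);

  scale = static_cast<float>((rmax - rmin) / (qmax - qmin));
  if (scale < 1e-5f)
    scale = 1e-5f;

  const double zp = qmin - rmin / scale;
  zero_point = static_cast<int64_t>(std::round(std::min(qmax, std::max(qmin, zp))));
}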
+diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+index f8abee7..2264bd7 100644
+--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
++++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+@@ -138,7 +138,8 @@ bool is_quantized(const CircleNode *node)
+ node->dtype() == loco::DataType::S32; // bias
+ }
+
+-void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor)
++void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor,
++ int32_t &channel_dim_index)
+ {
+ assert(node->dtype() == loco::DataType::FLOAT32);
+
+@@ -153,7 +154,6 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_facto
+ uint32_t indices[4] = {
+ 0,
+ };
+- int channel_dim_index{0};
+
+ if (!get_channel_dim_index(node, dimension, channel_dim_index))
+ {
+@@ -189,7 +189,7 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_facto
+ }
+
+ void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
+- std::vector<float> &scaling_factor)
++ std::vector<float> &scaling_factor, int32_t &channel_dim_index)
+ {
+ assert(node->dtype() == loco::DataType::FLOAT32);
+
+@@ -204,7 +204,6 @@ void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
+ uint32_t indices[4] = {
+ 0,
+ };
+- int channel_dim_index{0};
+
+ if (!get_channel_dim_index(node, dimension, channel_dim_index))
+ {
+@@ -350,8 +349,8 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool>
+ circle_node->dtype(loco::DataType::S16);
+ }
+
+- circle_node->quantparam()->max[0] = nudged_max;
+- circle_node->quantparam()->min[0] = nudged_min;
++ circle_node->quantparam()->min.clear();
++ circle_node->quantparam()->max.clear();
+ circle_node->quantparam()->scale.push_back(scaling_factor);
+ circle_node->quantparam()->zerop.push_back(zp);
+ }
+@@ -472,15 +471,19 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
+ assert(quantparam != nullptr);
+ auto min = quantparam->min;
+ auto scaling_factor = quantparam->scale;
++ int32_t channel_dim_index = 0;
+
+ if (output_type == loco::DataType::U8)
+ {
+- asym_wquant_per_channel(circle_const, min, scaling_factor);
++ asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index);
+ }
+ else
+ {
+- sym_wquant_per_channel(circle_const, scaling_factor);
++ sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index);
+ }
++ quantparam->min.clear();
++ quantparam->max.clear();
++ quantparam->quantized_dimension = channel_dim_index;
+ }
+ // Find min/max per layer-wise
+ else
+@@ -493,6 +496,8 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
+ auto min = quantparam->min[0];
+ auto scaling_factor = quantparam->scale[0];
+ asym_wquant_per_layer(circle_const, min, scaling_factor);
++ quantparam->min.clear();
++ quantparam->max.clear();
+ }
+ }
+ }
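
With this change the per-channel weight paths hand the detected channel axis back through channel_dim_index, and the caller records it in quantparam->quantized_dimension while clearing the raw min/max. A much-simplified sketch of symmetric per-channel weight quantization over a flattened [channels, elements_per_channel] view — the real pass walks 4-D tensors and picks the output dtype; the 8-bit range here is only for illustration:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// data is laid out channel-major: data[c * per_channel + i].
std::vector<int8_t> sym_wquant_per_channel_sketch(const std::vector<float> &data,
                                                  uint32_t channels, uint32_t per_channel,
                                                  std::vector<float> &scale)
{
  scale.assign(channels, 1.0f);
  std::vector<int8_t> q(data.size(), 0);

  for (uint32_t c = 0; c < channels; ++c)
  {
    // scale_c = max(|w|) / 127, one value per channel
    float absmax = 0.0f;
    for (uint32_t i = 0; i < per_channel; ++i)
      absmax = std::max(absmax, std::fabs(data[c * per_channel + i]));
    scale[c] = (absmax == 0.0f) ? 1.0f : absmax / 127.0f;

    for (uint32_t i = 0; i < per_channel; ++i)
    {
      int v = static_cast<int>(std::round(data[c * per_channel + i] / scale[c]));
      q[c * per_channel + i] = static_cast<int8_t>(std::min(127, std::max(-127, v)));
    }
  }
  return q;
}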
+diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst
+index 188e298..3da3437 100644
+--- a/compiler/luci/tests/test.lst
++++ b/compiler/luci/tests/test.lst
+@@ -30,13 +30,16 @@ addread(Ceil_000)
+ addread(Concatenation_000)
+ addread(Concatenation_U8_000)
+ addread(Conv2D_000)
++addread(Conv2D_001)
+ addread(Conv2D_002)
+ addread(Conv2D_003)
+ addread(Conv2D_U8_000)
++addread(Conv2D_U8_001)
+ addread(Cos_000)
+ addread(DepthToSpace_000)
+ addread(DepthwiseConv2D_000)
+ addread(DepthwiseConv2D_U8_000)
++addread(DepthwiseConv2D_U8_001)
+ addread(DepthwiseConv2D_001)
+ addread(Div_000)
+ addread(ELU_000)
+@@ -84,6 +87,7 @@ addread(MaxPool2D_000)
+ addread(MaxPool2D_U8_000)
+ addread(Mean_000)
+ addread(Mean_001)
++addread(Mean_U8_000)
+ addread(Minimum_000)
+ addread(MirrorPad_000)
+ addread(Mul_000)
+@@ -97,6 +101,7 @@ addread(OneHot_003)
+ addread(Pack_000)
+ addread(Pack_U8_000)
+ addread(Pad_000)
++addread(Pad_U8_000)
+ addread(Pow_000)
+ addread(PRelu_000)
+ addread(Range_000)
+@@ -222,13 +227,16 @@ addwrite(Ceil_000)
+ addwrite(Concatenation_000)
+ addwrite(Concatenation_U8_000)
+ addwrite(Conv2D_000)
++addwrite(Conv2D_001)
+ addwrite(Conv2D_002)
+ addwrite(Conv2D_003)
+ addwrite(Conv2D_U8_000)
++addwrite(Conv2D_U8_001)
+ addwrite(Cos_000)
+ addwrite(DepthToSpace_000)
+ addwrite(DepthwiseConv2D_000)
+ addwrite(DepthwiseConv2D_U8_000)
++addwrite(DepthwiseConv2D_U8_001)
+ addwrite(DepthwiseConv2D_001)
+ addwrite(Div_000)
+ addwrite(ELU_000)
+@@ -276,6 +284,7 @@ addwrite(MaxPool2D_000)
+ addwrite(MaxPool2D_U8_000)
+ addwrite(Mean_000)
+ addwrite(Mean_001)
++addwrite(Mean_U8_000)
+ addwrite(Minimum_000)
+ addwrite(MirrorPad_000)
+ addwrite(Mul_000)
+diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen
+index 2c80664..820b6d8 100644
+--- a/compiler/one-cmds/one-codegen
++++ b/compiler/one-cmds/one-codegen
+@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+ function Usage()
+ {
+- echo "Usage: $0 [BACKEND] ..."
++ echo "Usage: one-codegen [BACKEND] ..."
+ echo "Available BACKEND drivers:"
+ backend_exist=0
+ for file in `find $DRIVER_PATH -name *-compile -type f`;
+@@ -33,23 +33,34 @@ function Usage()
+ if [ $backend_exist == 0 ]; then
+ echo " (There is no available backend drivers)"
+ fi
++
++ exit 255
+ }
+
+-# Get command from command-line
+-BACKEND=$1; shift
+-BACKEND_DRIVER="$BACKEND-compile"
++function version()
++{
++ $DRIVER_PATH/one-version one-codegen
++ exit 255
++}
+
+-if [[ -z "${BACKEND_DRIVER}" ]]; then
++# Get command from command-line
++BACKEND=$1
++if [[ -z ${BACKEND} ]]; then
+ Usage
+- exit 255
+ fi
++shift
++
++if [[ "${BACKEND}" == "--version" ]]; then
++ version
++fi
++
++BACKEND_DRIVER="${BACKEND}-compile"
+
+ BACKEND_DRIVER_CMD="${DRIVER_PATH}/${BACKEND_DRIVER}"
+
+ if [[ ! -f "${BACKEND_DRIVER_CMD}" ]]; then
+ echo "ERROR: '${BACKEND_DRIVER}' is not supported"
+ Usage
+- exit 255
+ fi
+
+ "${BACKEND_DRIVER_CMD}" "$@"
+diff --git a/compiler/one-cmds/one-import b/compiler/one-cmds/one-import
+index dbf4af5..b1dd8f4 100644
+--- a/compiler/one-cmds/one-import
++++ b/compiler/one-cmds/one-import
+@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+ function Usage()
+ {
+- echo "Usage: $0 [FRAMEWORK] ..."
++ echo "Usage: one-import [FRAMEWORK] ..."
+ echo "Available FRAMEWORK drivers:"
+ framework_exist=0
+ for file in "$DRIVER_PATH"/one-import-*;
+@@ -31,23 +31,34 @@ function Usage()
+ if [ $framework_exist == 0 ]; then
+ echo " (There is no available import drivers)"
+ fi
++
++ exit 255
+ }
+
+-# Get command from command-line
+-FRAMEWORK=$1; shift
+-FRAMEWORK_DRIVER="one-import-$FRAMEWORK"
++function version()
++{
++ $DRIVER_PATH/one-version one-import-tf
++ exit 255
++}
+
+-if [[ -z "${FRAMEWORK_DRIVER}" ]]; then
++# Get command from command-line
++FRAMEWORK=$1
++if [[ -z ${FRAMEWORK} ]]; then
+ Usage
+- exit 255
++fi
++shift
++
++if [ ${FRAMEWORK} = "--version" ]; then
++ version
+ fi
+
++FRAMEWORK_DRIVER="one-import-$FRAMEWORK"
++
+ FRAMEWORK_DRIVER_CMD="${DRIVER_PATH}/${FRAMEWORK_DRIVER}"
+
+ if [[ ! -f "${FRAMEWORK_DRIVER_CMD}" ]]; then
+ echo "ERROR: '${FRAMEWORK_DRIVER}' is not supported"
+ Usage
+- exit 255
+ fi
+
+ "${FRAMEWORK_DRIVER_CMD}" "$@"
+diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf
+index c048a4e..d59e1c5 100644
+--- a/compiler/one-cmds/one-import-tf
++++ b/compiler/one-cmds/one-import-tf
+@@ -22,14 +22,24 @@ usage()
+ {
+ echo "Convert TensorFlow model to circle."
+ echo "Usage: one-import-tf"
++ echo " --version Show version information and exit"
+ echo " --input_path <path/to/tfmodel>"
+ echo " --output_path <path/to/circle>"
+ echo " --input_arrays <names of the input arrays, comma-separated>"
+ echo " --input_shapes <input shapes, colon-separated>"
+ echo " --output_arrays <names of the output arrays, comma-separated>"
+- exit 0
++ echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)"
++ exit 255
+ }
+
++version()
++{
++ $DRIVER_PATH/one-version one-import-tf
++ exit 255
++}
++
++TF_INTERFACE="--v1"
++
+ # Parse command-line arguments
+ #
+ while [ "$#" -ne 0 ]; do
+@@ -39,6 +49,9 @@ while [ "$#" -ne 0 ]; do
+ '--help')
+ usage
+ ;;
++ '--version')
++ version
++ ;;
+ '--input_path')
+ export INPUT_PATH="$2"
+ shift 2
+@@ -59,6 +72,10 @@ while [ "$#" -ne 0 ]; do
+ export OUTPUT_ARRAYS="$2"
+ shift 2
+ ;;
++ '--v2')
++ TF_INTERFACE="--v2"
++ shift
++ ;;
+ *)
+ echo "Unknown parameter: ${CUR}"
+ shift
+@@ -92,14 +109,21 @@ fi
+ # remove previous log
+ rm -rf "${OUTPUT_PATH}.log"
+
++show_err_onexit()
++{
++ cat "${OUTPUT_PATH}.log"
++}
++
++trap show_err_onexit ERR
++
+ # generate temporary tflite file
+-echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \
++echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \
+ --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \
+ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
+ --output_arrays ${OUTPUT_ARRAYS} > "${OUTPUT_PATH}.log"
+ echo " " >> "${OUTPUT_PATH}.log"
+
+-python "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \
++python "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \
+ --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \
+ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
+ --output_arrays ${OUTPUT_ARRAYS} >> "${OUTPUT_PATH}.log" 2>&1
+diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite
+index 31ed5af..053489c 100644
+--- a/compiler/one-cmds/one-import-tflite
++++ b/compiler/one-cmds/one-import-tflite
+@@ -22,9 +22,16 @@ usage()
+ {
+ echo "Convert TensorFlow lite model to circle."
+ echo "Usage: one-import-tflite"
++ echo " --version Show version information and exit"
+ echo " --input_path <path/to/tflitemodel>"
+ echo " --output_path <path/to/circle>"
+- exit 0
++ exit 255
++}
++
++version()
++{
++ $DRIVER_PATH/one-version one-import-tflite
++ exit 255
+ }
+
+ # Parse command-line arguments
+@@ -36,6 +43,9 @@ while [ "$#" -ne 0 ]; do
+ '--help')
+ usage
+ ;;
++ '--version')
++ version
++ ;;
+ '--input_path')
+ export INPUT_PATH="$2"
+ shift 2
+@@ -55,12 +65,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
+ echo "Error: input model not found"
+ echo ""
+ usage
+- exit 2
+ fi
+
+ # remove previous log
+ rm -rf "${OUTPUT_PATH}.log"
+
++show_err_onexit()
++{
++ cat "${OUTPUT_PATH}.log"
++}
++
++trap show_err_onexit ERR
++
+ # convert .tflite to .circle
+ echo "${DRIVER_PATH}/tflite2circle" "${INPUT_PATH}" "${OUTPUT_PATH}" > "${OUTPUT_PATH}.log"
+
+diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize
+index 95384c1..17b6b98 100644
+--- a/compiler/one-cmds/one-optimize
++++ b/compiler/one-cmds/one-optimize
+@@ -22,6 +22,7 @@ usage()
+ {
+ echo "Optimize circle model."
+ echo "Usage: one-optimize"
++ echo " --version Show version information and exit"
+ echo " --all Enable all optimization algorithms"
+ echo " --fuse_bcq Enable FuseBCQ Pass"
+ echo " --fuse_instnorm Enable FuseInstanceNormalization Pass"
+@@ -33,7 +34,13 @@ usage()
+ echo " Enable ResolveCustomOpMatMulPass Pass"
+ echo " --input_path <path/to/input/circle>"
+ echo " --output_path <path/to/output/circle>"
+- exit 0
++ exit 255
++}
++
++version()
++{
++ $DRIVER_PATH/one-version one-optimize
++ exit 255
+ }
+
+ OPTIMIZE_all=0
+@@ -52,6 +59,9 @@ while [ "$#" -ne 0 ]; do
+ '--help')
+ usage
+ ;;
++ '--version')
++ version
++ ;;
+ '--all')
+ OPTIMIZE_all=1
+ shift
+@@ -96,7 +106,6 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
+ echo "Error: input model not found"
+ echo ""
+ usage
+- exit 2
+ fi
+
+ OPTIMIZE_OPTIONS=""
+@@ -123,6 +132,13 @@ fi
+ # remove previous log
+ rm -rf "${OUTPUT_PATH}.log"
+
++show_err_onexit()
++{
++ cat "${OUTPUT_PATH}.log"
++}
++
++trap show_err_onexit ERR
++
+ # NOTE do not wrap ${OPTIMIZE_OPTIONS} with ""
+ # optimize circle
+ echo "${DRIVER_PATH}/circle2circle" ${OPTIMIZE_OPTIONS} \
+diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack
+index 2bc4c60..9224b2c 100644
+--- a/compiler/one-cmds/one-pack
++++ b/compiler/one-cmds/one-pack
+@@ -22,9 +22,16 @@ usage()
+ {
+ echo "Package circle to nnpkg"
+ echo "Usage: one-pack"
++ echo " -v, --version Show version information and exit"
+ echo " -i <path/to/circle>"
+ echo " -o <path/to/nnpackage/folder>"
+- exit 0
++ exit 255
++}
++
++version()
++{
++ $DRIVER_PATH/one-version one-pack
++ exit 255
+ }
+
+ # Parse command-line arguments
+@@ -36,6 +43,12 @@ while [ "$#" -ne 0 ]; do
+ '--help')
+ usage
+ ;;
++ '-v')
++ version
++ ;;
++ '--version')
++ version
++ ;;
+ '-i')
+ export INPUT_PATH="$2"
+ shift 2
+@@ -55,12 +68,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
+ echo "Error: input model not found"
+ echo ""
+ usage
+- exit 2
+ fi
+
+ # remove previous log
+ rm -rf "${OUTPUT_PATH}.log"
+
++show_err_onexit()
++{
++ cat "${OUTPUT_PATH}.log"
++}
++
++trap show_err_onexit ERR
++
+ # Package circle model file to nnpkg
+ echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${OUTPUT_PATH}.log"
+
+diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize
+index ff9e266..c74b2c2 100644
+--- a/compiler/one-cmds/one-quantize
++++ b/compiler/one-cmds/one-quantize
+@@ -22,16 +22,23 @@ usage()
+ {
+ echo "Quantize circle model."
+ echo "Usage: one-quantize"
++ echo " --version Show version information and exit"
+ echo " --input_dtype Input data type (supported: float32, default=float32)"
+ echo " --quantized_dtype Output quantized data type (supported: uint8, default=uint8)"
+- echo " --granularity Quantize granularity (supported: layer, default=layer)"
++ echo " --granularity Quantize granularity (supported: layer, channel, default=layer)"
+ echo " --min_percentile Minimum percentile (0.0~100.0, default=1.0)"
+ echo " --max_percentile Maximum percentile (0.0~100.0, default=99.0)"
+ echo " --mode Record mode (supported: percentile/moving_average, default=percentile)"
+ echo " --input_path <path/to/input/circle>"
+ echo " --input_data <path/to/input/data>"
+ echo " --output_path <path/to/output/circle>"
+- exit 0
++ exit 255
++}
++
++version()
++{
++ $DRIVER_PATH/one-version one-quantize
++ exit 255
+ }
+
+ INPUT_DTYPE=float32
+@@ -50,6 +57,9 @@ while [ "$#" -ne 0 ]; do
+ '--help')
+ usage
+ ;;
++ '--version')
++ version
++ ;;
+
+ '--input_dtype')
+ INPUT_DTYPE="$2"
+@@ -100,13 +110,11 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
+ echo "Error: input model not found"
+ echo ""
+ usage
+- exit 2
+ fi
+ if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then
+ echo "Error: input data not found"
+ echo ""
+ usage
+- exit 2
+ fi
+
+ FILE_BASE=$(basename ${OUTPUT_PATH})
+@@ -118,6 +126,13 @@ trap "{ rm -rf $TMPDIR; }" EXIT
+ # remove previous log
+ rm -rf "${OUTPUT_PATH}.log"
+
++show_err_onexit()
++{
++ cat "${OUTPUT_PATH}.log"
++}
++
++trap show_err_onexit ERR
++
+ # quantize circle
+ echo "${DRIVER_PATH}/circle-quantizer" \
+ --quantize_dequantize_weights ${INPUT_DTYPE} ${QUANTIZED_DTYPE} ${GRANULARITY} \
+diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake
+index 9b858ad..812149c 100644
+--- a/compiler/one-cmds/requires.cmake
++++ b/compiler/one-cmds/requires.cmake
+@@ -3,3 +3,4 @@ require("tflite2circle")
+ require("circle2circle")
+ require("circle-quantizer")
+ require("record-minmax")
++require("vconone")
+diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt
+index 862660e..f8a165b 100644
+--- a/compiler/record-minmax/CMakeLists.txt
++++ b/compiler/record-minmax/CMakeLists.txt
+@@ -19,9 +19,14 @@ target_link_libraries(record-minmax safemain)
+ target_link_libraries(record-minmax luci_import)
+ target_link_libraries(record-minmax luci_export)
+ target_link_libraries(record-minmax luci_interpreter)
++target_link_libraries(record-minmax vconone)
+
+ install(TARGETS record-minmax DESTINATION bin)
+
++if(NOT ENABLE_TEST)
++ return()
++endif(NOT ENABLE_TEST)
++
+ nnas_find_package(GTest REQUIRED)
+ GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp")
+ target_include_directories(record_minmax_function_test PRIVATE include)
+diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp
+index ae4fcb7..8b09498 100644
+--- a/compiler/record-minmax/driver/Driver.cpp
++++ b/compiler/record-minmax/driver/Driver.cpp
+@@ -17,6 +17,13 @@
+ #include "RecordMinMax.h"
+
+ #include <arser/arser.h>
++#include <vconone/vconone.h>
++
++void print_version(void)
++{
++ std::cout << "record-minmax version " << vconone::get_string() << std::endl;
++ std::cout << vconone::get_copyright() << std::endl;
++}
+
+ int entry(const int argc, char **argv)
+ {
+@@ -25,6 +32,13 @@ int entry(const int argc, char **argv)
+ arser::Arser arser(
+ "Embedding min/max values of activations to the circle model for post-training quantization");
+
++ arser.add_argument("--version")
++ .nargs(0)
++ .required(false)
++ .default_value(false)
++ .help("Show version information and exit")
++ .exit_with(print_version);
++
+ arser.add_argument("--input_model")
+ .nargs(1)
+ .type(arser::DataType::STR)
+@@ -66,7 +80,7 @@ int entry(const int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ auto input_model_path = arser.get<std::string>("--input_model");
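The driver hunks in this commit all apply the same pattern: include vconone, register a `--version` flag through arser that exits via a print callback, and return 255 instead of 0 when argument parsing fails. A condensed, standalone sketch of that pattern follows (the tool name and `--input_model` argument here are illustrative, not taken from any single driver; the real drivers use `entry()` with safemain rather than `main()`):

    #include <arser/arser.h>
    #include <vconone/vconone.h>

    #include <iostream>
    #include <string>

    static void print_version(void)
    {
      // vconone supplies the shared version string and copyright banner
      std::cout << "example-driver version " << vconone::get_string() << std::endl;
      std::cout << vconone::get_copyright() << std::endl;
    }

    int main(int argc, char **argv)
    {
      arser::Arser arser("example driver");

      // --version takes no value and exits through the registered callback
      arser.add_argument("--version")
          .nargs(0)
          .required(false)
          .default_value(false)
          .help("Show version information and exit")
          .exit_with(print_version);

      arser.add_argument("--input_model").nargs(1).type(arser::DataType::STR);

      try
      {
        arser.parse(argc, argv);
      }
      catch (const std::runtime_error &err)
      {
        std::cout << err.what() << std::endl;
        std::cout << arser;
        return 255; // non-zero exit so callers and tests can detect usage errors
      }

      auto input_model_path = arser.get<std::string>("--input_model");
      std::cout << input_model_path << std::endl;
      return 0;
    }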
+diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake
+index 0545035..f6804ce 100644
+--- a/compiler/record-minmax/requires.cmake
++++ b/compiler/record-minmax/requires.cmake
+@@ -1,3 +1,4 @@
+ require("luci")
+ require("safemain")
+ require("arser")
++require("vconone")
+diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp
+index cf30cd8..a0e65ee 100644
+--- a/compiler/record-minmax/src/HDF5Importer.cpp
++++ b/compiler/record-minmax/src/HDF5Importer.cpp
+@@ -20,6 +20,7 @@
+
+ #include <string>
+ #include <cassert>
++#include <stdexcept>
+
+ using Shape = luci_interpreter::Shape;
+ using DataType = luci_interpreter::DataType;
+diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp
+index 45f0197..410ce3d 100644
+--- a/compiler/record-minmax/src/MinMaxObserver.cpp
++++ b/compiler/record-minmax/src/MinMaxObserver.cpp
+@@ -38,7 +38,8 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node,
+ assert(node->opcode() != luci::CircleOpcode::UNPACK);
+ assert(node->opcode() != luci::CircleOpcode::WHILE);
+
+- if (node->opcode() == luci::CircleOpcode::CONST)
++ if (node->opcode() == luci::CircleOpcode::CONST ||
++ node->opcode() == luci::CircleOpcode::CIRCLECONST)
+ {
+ // node is not activation. Do nothing.
+ return;
+diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp
+index d12a0d3..17c6aa6 100644
+--- a/compiler/record-minmax/src/RecordMinMax.cpp
++++ b/compiler/record-minmax/src/RecordMinMax.cpp
+@@ -158,7 +158,7 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input
+ auto node = iter->first;
+ auto minmax = iter->second;
+
+- float min, max;
++ float min{0.0f}, max{0.0f};
+ if (mode == "percentile")
+ {
+ min = getNthPercentile(minmax.min_vector, min_percentile);
+diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp
+index 13b464d..e2f135a 100644
+--- a/compiler/record-minmax/tests/RecordFunction.test.cpp
++++ b/compiler/record-minmax/tests/RecordFunction.test.cpp
+@@ -32,6 +32,8 @@ TEST(GetNthPercentileTest, Edge)
+
+ EXPECT_FLOAT_NEAR(0, getNthPercentile(input, 0));
+ EXPECT_FLOAT_NEAR(9, getNthPercentile(input, 100));
++
++ SUCCEED();
+ }
+
+ TEST(GetNthPercentileTest, Simple)
+@@ -47,6 +49,8 @@ TEST(GetNthPercentileTest, Simple)
+ {
+ EXPECT_FLOAT_NEAR(0.09 * std::floor(i) + 0.045, getNthPercentile(input, i));
+ }
++
++ SUCCEED();
+ }
+
+ TEST(GetNthPercentileTest, Float)
+@@ -61,6 +65,8 @@ TEST(GetNthPercentileTest, Float)
+ EXPECT_FLOAT_NEAR(2.799942346802177, getNthPercentile(input, 1));
+ EXPECT_FLOAT_NEAR(7.768503955476342, getNthPercentile(input, 3.14));
+ EXPECT_FLOAT_NEAR(99.40456084968194, getNthPercentile(input, 99));
++
++ SUCCEED();
+ }
+
+ TEST(GetNthPercentileTest, FloatWithNegative)
+@@ -75,6 +81,8 @@ TEST(GetNthPercentileTest, FloatWithNegative)
+ EXPECT_FLOAT_NEAR(-47.20005765319782, getNthPercentile(input, 1));
+ EXPECT_FLOAT_NEAR(-42.23149604452366, getNthPercentile(input, 3.14));
+ EXPECT_FLOAT_NEAR(49.40456084968194, getNthPercentile(input, 99));
++
++ SUCCEED();
+ }
+
+ TEST(GetNthPercentileTest, SigleElement)
+@@ -84,6 +92,8 @@ TEST(GetNthPercentileTest, SigleElement)
+ EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 0));
+ EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 50));
+ EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 100));
++
++ SUCCEED();
+ }
+
+ TEST(GetNthPercentileTest, OutOfBoundary_NEG)
+@@ -92,6 +102,8 @@ TEST(GetNthPercentileTest, OutOfBoundary_NEG)
+
+ EXPECT_THROW(getNthPercentile(input, -1), std::runtime_error);
+ EXPECT_THROW(getNthPercentile(input, 101), std::runtime_error);
++
++ SUCCEED();
+ }
+
+ TEST(GetNthPercentileTest, EmptyVector_NEG)
+@@ -99,6 +111,8 @@ TEST(GetNthPercentileTest, EmptyVector_NEG)
+ std::vector<float> input;
+
+ EXPECT_THROW(getNthPercentile(input, 10), std::runtime_error);
++
++ SUCCEED();
+ }
+
+ } // namespace record_minmax
+diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt
+index d33059f..4421a46 100644
+--- a/compiler/tfl-verify/CMakeLists.txt
++++ b/compiler/tfl-verify/CMakeLists.txt
+@@ -6,6 +6,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
+
+ add_executable(tfl-verify ${SOURCES})
+ target_include_directories(tfl-verify PRIVATE src)
++target_link_libraries(tfl-verify arser)
+ target_link_libraries(tfl-verify foder)
+ target_link_libraries(tfl-verify mio_tflite)
+ target_link_libraries(tfl-verify safemain)
+diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake
+index ed6b84d..79503f3 100644
+--- a/compiler/tfl-verify/requires.cmake
++++ b/compiler/tfl-verify/requires.cmake
+@@ -1,3 +1,4 @@
++require("arser")
+ require("foder")
+ require("mio-tflite")
+ require("safemain")
+diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp
+index 81f6d54..6d18976 100644
+--- a/compiler/tfl-verify/src/Driver.cpp
++++ b/compiler/tfl-verify/src/Driver.cpp
+@@ -16,22 +16,31 @@
+
+ #include "VerifyFlatBuffers.h"
+
++#include <arser/arser.h>
++
+ #include <iostream>
+ #include <memory>
+ #include <string>
+
+ int entry(int argc, char **argv)
+ {
+- if (argc != 2)
++ arser::Arser arser;
++ arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify");
++
++ try
+ {
+- std::cerr << "ERROR: Failed to parse arguments" << std::endl;
+- std::cerr << std::endl;
+- std::cerr << "USAGE: " << argv[0] << " [tflite]" << std::endl;
++ arser.parse(argc, argv);
++ }
++ catch (const std::runtime_error &err)
++ {
++ std::cout << err.what() << std::endl;
++ std::cout << arser;
+ return 255;
+ }
++
+ auto verifier = std::make_unique<VerifyFlatbuffers>();
+
+- std::string model_file = argv[argc - 1];
++ std::string model_file = arser.get<std::string>("tflite");
+
+ std::cout << "[ RUN ] Check " << model_file << std::endl;
+
+diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp
+index 932a649..692ce48 100644
+--- a/compiler/tflchef/core/src/ModelChef.cpp
++++ b/compiler/tflchef/core/src/ModelChef.cpp
+@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
+ quant_builder.add_min(quant_min);
+ quant_builder.add_scale(quant_scale);
+ quant_builder.add_zero_point(quant_zero_point);
++ quant_builder.add_quantized_dimension(quant.quantized_dimension());
+
+ // Update QuantizationParameters Index
+ quant_index = quant_builder.Finish();
+diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto
+index 792503b..55785c3 100644
+--- a/compiler/tflchef/proto/tflchef.proto
++++ b/compiler/tflchef/proto/tflchef.proto
+@@ -35,6 +35,7 @@ message TensorQuantization {
+ repeated float max = 2;
+ repeated float scale = 3;
+ repeated int64 zero_point = 4;
++ optional int32 quantized_dimension = 5 [default = 0];
+ }
+
+ message Operand {
+diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp
+index db62d0e..088961c 100644
+--- a/compiler/tflchef/tflite/src/RecipeChef.cpp
++++ b/compiler/tflchef/tflite/src/RecipeChef.cpp
+@@ -184,6 +184,8 @@ std::unique_ptr<ModelRecipe> generate_recipe(const tflite::Model *model)
+ for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
+ chef_quant->add_zero_point(quant->zero_point()->Get(idx));
+ }
++ tflchef::TensorQuantization *chef_quant = operand->mutable_quant();
++ chef_quant->set_quantized_dimension(quant->quantized_dimension());
+ }
+ }
+
+diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp
+index cecfeeb..46e5b55 100644
+--- a/compiler/tflchef/tools/file/Driver.cpp
++++ b/compiler/tflchef/tools/file/Driver.cpp
+@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ int32_t model_version = 1;
+diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp
+index 1116dec..4d795a3 100644
+--- a/compiler/tflchef/tools/reverse/Driver.cpp
++++ b/compiler/tflchef/tools/reverse/Driver.cpp
+@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ std::string tflite_path = arser.get<std::string>("tflite");
+diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp
+index 3961d2f..38c9c06 100644
+--- a/compiler/tfldump/driver/Driver.cpp
++++ b/compiler/tfldump/driver/Driver.cpp
+@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << '\n';
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ std::string tflite_path = arser.get<std::string>("tflite");
+diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt
+index a0a2e02..b1d1f61 100644
+--- a/compiler/tflite2circle/CMakeLists.txt
++++ b/compiler/tflite2circle/CMakeLists.txt
+@@ -14,5 +14,6 @@ target_link_libraries(tflite2circle arser)
+ target_link_libraries(tflite2circle safemain)
+ target_link_libraries(tflite2circle mio_tflite)
+ target_link_libraries(tflite2circle mio_circle)
++target_link_libraries(tflite2circle vconone)
+
+ install(TARGETS tflite2circle DESTINATION bin)
+diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp
+index 67b8e33..2f11e0a 100644
+--- a/compiler/tflite2circle/driver/Driver.cpp
++++ b/compiler/tflite2circle/driver/Driver.cpp
+@@ -24,10 +24,25 @@
+ #include "CircleModel.h"
+ #include "TFLModel.h"
+
++#include <vconone/vconone.h>
++
++void print_version(void)
++{
++ std::cout << "tflite2circle version " << vconone::get_string() << std::endl;
++ std::cout << vconone::get_copyright() << std::endl;
++}
++
+ int entry(int argc, char **argv)
+ {
+ arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"};
+
++ arser.add_argument("--version")
++ .nargs(0)
++ .required(false)
++ .default_value(false)
++ .help("Show version information and exit")
++ .exit_with(print_version);
++
+ arser.add_argument("tflite")
+ .nargs(1)
+ .type(arser::DataType::STR)
+@@ -42,7 +57,7 @@ int entry(int argc, char **argv)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+- return 0;
++ return 255;
+ }
+
+ std::string tfl_path = arser.get<std::string>("tflite");
+diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake
+index ff19b74..837c287 100644
+--- a/compiler/tflite2circle/requires.cmake
++++ b/compiler/tflite2circle/requires.cmake
+@@ -2,3 +2,4 @@ require("arser")
+ require("mio-tflite")
+ require("mio-circle")
+ require("safemain")
++require("vconone")
+diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt
+new file mode 100644
+index 0000000..b8cb793
+--- /dev/null
++++ b/compiler/vconone/CMakeLists.txt
+@@ -0,0 +1,31 @@
++if (NOT VCONONE_VERSION)
++ set(VCONONE_VERSION 0x0000000000080001)
++ # NOTE order is [build patch minor major]
++ # if VCONONE_VERSION is set with -D option, it will be cached
++ # you may have to remove cache file if you remove -D option
++endif()
++
++configure_file(version_cfg.h.in version_cfg.h @ONLY)
++
++set(DRIVER "driver/driver.cpp")
++
++file(GLOB_RECURSE SOURCES "src/*.cpp")
++file(GLOB_RECURSE TESTS "src/*.test.cpp")
++list(REMOVE_ITEM SOURCES ${TESTS})
++
++add_library(vconone STATIC ${SOURCES})
++target_include_directories(vconone PUBLIC include)
++target_include_directories(vconone PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
++
++add_executable(one-version ${DRIVER})
++target_link_libraries(one-version vconone)
++install(TARGETS one-version DESTINATION bin)
++
++if(NOT ENABLE_TEST)
++ return()
++endif(NOT ENABLE_TEST)
++
++nnas_find_package(GTest REQUIRED)
++
++GTest_AddTest(vconone_test ${TESTS})
++target_link_libraries(vconone_test vconone)
+diff --git a/compiler/vconone/README.md b/compiler/vconone/README.md
+new file mode 100644
+index 0000000..c08dd63
+--- /dev/null
++++ b/compiler/vconone/README.md
+@@ -0,0 +1,14 @@
++# vconone
++
++_vconone_ provides the version number and version strings for the one-* commands
++and command line tools.
++
++# Revising the version number
++
++To revise the version number, update `VCONONE_VERSION` in `CMakeLists.txt`
++or pass `-DVCONONE_VERSION=0x0000000100080001` at the CMake configure step.
++
++The value packs four 16-bit integers, `build`, `patch`, `minor` and `major`,
++in that order from the highest to the lowest bits. `build` is not used for now.
++
++For example, `0x0000000100080001` is interpreted as version `1.8.1`.
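Following the field order described in the README above (and the `version`/`four` union added in `vconone.h` below), the 64-bit value can be decoded with plain shifts. A small illustrative sketch, not part of this patch:

    #include <cstdint>
    #include <iostream>

    int main()
    {
      // Layout per the README: [build patch minor major], highest to lowest 16 bits
      uint64_t v = 0x0000000100080001ULL; // example value from the README

      uint16_t major = static_cast<uint16_t>(v & 0xFFFF);
      uint16_t minor = static_cast<uint16_t>((v >> 16) & 0xFFFF);
      uint16_t patch = static_cast<uint16_t>((v >> 32) & 0xFFFF);
      uint16_t build = static_cast<uint16_t>((v >> 48) & 0xFFFF);

      // Prints "1.8.1 (build 0)"; the major.minor.patch part is what
      // vconone::get_string() reports for this value
      std::cout << major << "." << minor << "." << patch
                << " (build " << build << ")" << std::endl;
      return 0;
    }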
+diff --git a/compiler/vconone/driver/driver.cpp b/compiler/vconone/driver/driver.cpp
+new file mode 100644
+index 0000000..12bd0ee
+--- /dev/null
++++ b/compiler/vconone/driver/driver.cpp
+@@ -0,0 +1,36 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include <vconone/vconone.h>
++
++#include <string>
++#include <iostream>
++
++int main(int argc, char *argv[])
++{
++ auto str = vconone::get_string();
++ if (argc >= 2)
++ {
++ for (int c = 1; c < argc; ++c)
++ std::cout << argv[c] << " ";
++ std::cout << "version " << str << std::endl;
++ std::cout << vconone::get_copyright() << std::endl;
++ }
++ else
++ std::cout << str;
++
++ return 0;
++}
+diff --git a/compiler/vconone/include/vconone/vconone.h b/compiler/vconone/include/vconone/vconone.h
+new file mode 100644
+index 0000000..a6a1998
+--- /dev/null
++++ b/compiler/vconone/include/vconone/vconone.h
+@@ -0,0 +1,61 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __VCON_ONE_H__
++#define __VCON_ONE_H__
++
++#include <cstdint>
++#include <string>
++
++namespace vconone
++{
++
++struct four
++{
++ uint16_t major;
++ uint16_t minor;
++ uint16_t patch;
++ uint16_t build; // build is not used for now
++};
++
++union version {
++ uint64_t v;
++ four f;
++};
++
++/**
++ * @brief get_number returns the version as a union structure
++ */
++version get_number(void);
++
++/**
++ * @brief get_string returns the version string as major.minor.patch (build is omitted)
++ */
++std::string get_string(void);
++
++/**
++ * @brief get_string4 returns the version string as major.minor.patch.build
++ */
++std::string get_string4(void);
++
++/**
++ * @brief get_copyright returns the copyright string
++ */
++std::string get_copyright(void);
++
++} // namespace vconone
++
++#endif // __VCON_ONE_H__
+diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp
+new file mode 100644
+index 0000000..9b693c6
+--- /dev/null
++++ b/compiler/vconone/src/version.cpp
+@@ -0,0 +1,63 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "vconone/vconone.h"
++
++#include "version_cfg.h"
++
++#include <sstream>
++
++namespace vconone
++{
++
++version get_number(void)
++{
++ version v;
++ v.v = VCONONE_VERSION;
++ return v;
++}
++
++std::string get_string4(void)
++{
++ std::ostringstream ss;
++
++ auto v = get_number();
++ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch) << "."
++ << unsigned(v.f.build);
++
++ return ss.str();
++}
++
++std::string get_string(void)
++{
++ std::ostringstream ss;
++
++ auto v = get_number();
++ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch);
++
++ return ss.str();
++}
++
++std::string get_copyright(void)
++{
++ std::string str;
++ str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n";
++ str += "Licensed under the Apache License, Version 2.0\r\n";
++ str += "https://github.com/Samsung/ONE";
++ return str;
++}
++
++} // namespace vconone
+diff --git a/compiler/vconone/src/version.test.cpp b/compiler/vconone/src/version.test.cpp
+new file mode 100644
+index 0000000..35a0647
+--- /dev/null
++++ b/compiler/vconone/src/version.test.cpp
+@@ -0,0 +1,49 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include <vconone/vconone.h>
++
++#include <gtest/gtest.h>
++
++TEST(vconone, version_number)
++{
++ auto v = vconone::get_number();
++
++ ASSERT_NE(0x0000000000000000ULL, v.v);
++}
++
++TEST(vconone, version_string)
++{
++ auto str = vconone::get_string();
++
++ ASSERT_NE("..", str);
++ ASSERT_NE("", str);
++}
++
++TEST(vconone, version_string4)
++{
++ auto str = vconone::get_string4();
++
++ ASSERT_NE("...", str);
++ ASSERT_NE("", str);
++}
++
++TEST(vconone, copyright)
++{
++ auto str = vconone::get_copyright();
++
++ ASSERT_NE("", str);
++}
+diff --git a/compiler/vconone/version_cfg.h.in b/compiler/vconone/version_cfg.h.in
+new file mode 100644
+index 0000000..aa3ad9e
+--- /dev/null
++++ b/compiler/vconone/version_cfg.h.in
+@@ -0,0 +1,22 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __VCON_ONE_VERSION_CFG_H__
++#define __VCON_ONE_VERSION_CFG_H__
++
++#define VCONONE_VERSION @VCONONE_VERSION@ULL
++
++#endif // __VCON_ONE_VERSION_CFG_H__
+diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
+deleted file mode 100644
+index 9699b5c..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
++++ /dev/null
+@@ -1,124 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-/**
+- * @file CLArgOperationKernel.h
+- * @brief This file defines CLArgOperationKernel
+- * @ingroup COM_AI_RUNTIME
+- */
+-
+-#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
+-#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
+-
+-#include "arm_compute/core/CL/ICLKernel.h"
+-#include "arm_compute/core/TypesEx.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/**
+- * @brief Class to define interface for the argop kernel.
+- */
+-class CLArgOperationKernel : public ICLKernel
+-{
+-public:
+- /**
+- * @brief Default constructor.
+- */
+- CLArgOperationKernel();
+- /**
+- * @brief Prevent instances of this class from being copied (As this class contains pointers).
+- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
+- */
+- CLArgOperationKernel(const CLArgOperationKernel &) = delete;
+- /**
+- * @brief Prevent instances of this class from being copied (As this class contains pointers).
+- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
+- * @return Reference of this instance
+- */
+- CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete;
+- /**
+- * @brief Allow instances of this class to be moved
+- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
+- */
+- CLArgOperationKernel(CLArgOperationKernel &&) = default;
+- /**
+- * @brief Allow instances of this class to be moved
+- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
+- * @return Reference of this instance
+- */
+- CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default;
+- /**
+- * @brief Initialise the kernel's input, output and border mode.
+- * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+- * @param[out] output The output tensor, Data types supported: S32.
+- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
+- * @param[in] op Arg operation to perform.
+- * return N/A
+- */
+- void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op);
+- /**
+- * @brief Static function to check if given info will lead to a valid configuration of @ref
+- * CLArgOperationKernel
+- * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
+- * @param[in] output The output tensor info, Data types supported: S32.
+- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
+- * @param[in] op Arg operation to perform.
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+- ArgOperation op);
+-
+- /*
+- * @brief Run CLArgOperationKernel op
+- * @param[in] window Window to be used for in_slice
+- * @param[in] queue cl::CommandQueue
+- * @return N/A
+- */
+- void run(const Window &window, cl::CommandQueue &queue) override;
+-
+-private:
+- const ICLTensor *_input;
+- ICLTensor *_output;
+- uint32_t _axis;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
+deleted file mode 100644
+index b0357fe..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
++++ /dev/null
+@@ -1,121 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-/**
+- * @file CLCastKernel.h
+- * @ingroup COM_AI_RUNTIME
+- * @brief This file defines CLCastKernel class
+- */
+-
+-#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
+-#define __ARM_COMPUTE_CLCASTKERNEL_H__
+-
+-#include "arm_compute/core/CL/ICLKernel.h"
+-#include "arm_compute/core/TypesEx.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/**
+- * @brief Class to define OpenCL kernel for cast operation
+- */
+-class CLCastKernel : public ICLKernel
+-{
+-public:
+- /**
+- * @brief Construct CLCastKernel object
+- */
+- CLCastKernel();
+-
+- /**
+- * @brief Prevent instances of this class from being copied (As this class contains pointers)
+- */
+- CLCastKernel(const CLCastKernel &) = delete;
+-
+- /**
+- * @brief Prevent instances of this class from being copied (As this class contains pointers)
+- */
+- CLCastKernel &operator=(const CLCastKernel &) = delete;
+-
+- /**
+- * @brief Construct CLCastKernel object using default move constructor
+- * @param[in] CLCastKernel object to move
+- */
+- CLCastKernel(CLCastKernel &&) = default;
+-
+- /**
+- * @brief Allow instances of this class to be moved
+- * @param[in] CLCastKernel object to move
+- */
+- CLCastKernel &operator=(CLCastKernel &&) = default;
+-
+- /**
+- * @brief Destruct this CLCastKernel object
+- */
+- ~CLCastKernel() = default;
+-
+- /**
+- * @brief Initialise the kernel's input and output.
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[in] input_subtype Sub data type of input.
+- * @return N/A
+- */
+- void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
+-
+- /**
+- * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+- * queue.
+- * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+- * been executed by the time this method returns.
+- * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+- * the window returned by window()).
+- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A
+- * @return N/A
+- */
+- void run(const Window &window, cl::CommandQueue &queue) override;
+-
+-private:
+- const ICLTensor *_input; /**< Source tensor */
+- ICLTensor *_output; /**< Destination tensor */
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
+deleted file mode 100644
+index 8615cf1..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
++++ /dev/null
+@@ -1,82 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+-#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+-
+-#include "arm_compute/core/CL/ICLKernel.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** OpenCL kernel to perform depthTospace operation */
+-class CLDepthToSpaceKernel : public ICLKernel
+-{
+-public:
+- /** Default constructor */
+- CLDepthToSpaceKernel();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete;
+- /** Allow instances of this class to be moved */
+- CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default;
+- /** Allow instances of this class to be moved */
+- CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default;
+- /** Default destructor */
+- ~CLDepthToSpaceKernel() = default;
+- /** Initialise the kernel's input and output.
+- *
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- */
+- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, cl::CommandQueue &queue) override;
+-
+-private:
+- const ICLTensor *_input; /**< Source tensor */
+- ICLTensor *_output; /**< Destination tensor */
+-};
+-
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h
+deleted file mode 100644
+index 9321c36..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h
++++ /dev/null
+@@ -1,117 +0,0 @@
+-/*
+- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__
+-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__
+-
+-#include "arm_compute/core/CL/ICLKernel.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** OpenCL kernel to multiply matrices
+- *
+- * @note This kernel should be used ONLY for Midgard architectures
+- *
+- * This kernel performs the following computation:
+- *
+- * -# Convert a values from int8 to int32
+- * -# Convert b values from int8 to int32
+- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
+- *
+- */
+-class CLGEMMLowpMatrixMultiplyKernelEx : public ICLKernel
+-{
+-public:
+- /** Default Constructor */
+- CLGEMMLowpMatrixMultiplyKernelEx();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLGEMMLowpMatrixMultiplyKernelEx(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLGEMMLowpMatrixMultiplyKernelEx &operator=(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete;
+- /** Allow instances of this class to be moved */
+- CLGEMMLowpMatrixMultiplyKernelEx(CLGEMMLowpMatrixMultiplyKernelEx &&) = default;
+- /** Allow instances of this class to be moved */
+- CLGEMMLowpMatrixMultiplyKernelEx &operator=(CLGEMMLowpMatrixMultiplyKernelEx &&) = default;
+- /** Initialise the kernel's input and output.
+- *
+- * @note This kernel should be used ONLY for Midgard architectures
+- *
+- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
+- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p
+- * input0
+- * @param[out] output Output tensor to store the result of matrix multiplication. Data type
+- * supported: S32
+- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of
+- * the input matrices
+- */
+- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
+- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * CLGEMMLowpMatrixMultiplyKernelEx
+- *
+- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
+- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p
+- * input0
+- * @param[in] output Output tensor to store the result of matrix multiplication. Data type
+- * supported: S32
+- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of
+- * the input matrices
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1,
+- const ITensorInfo *output,
+- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
+-
+- // Inherited methods overridden:
+- void run(const Window &window, cl::CommandQueue &queue) override;
+-
+-private:
+- const ICLTensor *_input0;
+- const ICLTensor *_input1;
+- ICLTensor *_output;
+- bool _slide_matrix_b;
+- bool _reinterpret_input_as_3d;
+- bool _reinterpret_output_as_3d;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__*/
+diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
+deleted file mode 100644
+index dd2dbf6..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
++++ /dev/null
+@@ -1,83 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
+-#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
+-
+-#include "arm_compute/core/CL/ICLKernel.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** OpenCL kernel to calculate PReLU*/
+-class CLPReLUKernel : public ICLKernel
+-{
+-public:
+- /** Default constructor */
+- CLPReLUKernel();
+- /** Prevent instances of this class from being copied (As this class contains pointers). */
+- CLPReLUKernel(const CLPReLUKernel &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers). */
+- CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
+- /** Allow instances of this class to be moved */
+- CLPReLUKernel(CLPReLUKernel &&) = default;
+- /** Allow instances of this class to be moved */
+- CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
+- /** Initialize the kernel's input, output.
+- *
+- * @param[in] input Source tensor1.
+- * @param[in] alpha Source tensor2.
+- * @param[out] output Output tensor.
+- */
+- void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, cl::CommandQueue &queue) override;
+-
+- BorderSize border_size() const override;
+-
+-private:
+- const ICLTensor *_input;
+- const ICLTensor *_alpha;
+- ICLTensor *_output;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
+deleted file mode 100644
+index 4c0a82c..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
++++ /dev/null
+@@ -1,82 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+-#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+-
+-#include "arm_compute/core/CL/ICLKernel.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** OpenCL kernel to perform spaceTodepth operation */
+-class CLSpaceToDepthKernel : public ICLKernel
+-{
+-public:
+- /** Default constructor */
+- CLSpaceToDepthKernel();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
+- /** Allow instances of this class to be moved */
+- CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
+- /** Allow instances of this class to be moved */
+- CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
+- /** Default destructor */
+- ~CLSpaceToDepthKernel() = default;
+- /** Initialise the kernel's input and output.
+- *
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- */
+- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, cl::CommandQueue &queue) override;
+-
+-private:
+- const ICLTensor *_input; /**< Source tensor */
+- ICLTensor *_output; /**< Destination tensor */
+-};
+-
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
+deleted file mode 100644
+index 9d174de..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
++++ /dev/null
+@@ -1,109 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
+-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
+-
+-#include "arm_compute/core/CL/ICLKernel.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL.
+- */
+-class CLTransposeConvLayerUpsampleKernel : public ICLKernel
+-{
+-public:
+- /** Constructor */
+- CLTransposeConvLayerUpsampleKernel();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLTransposeConvLayerUpsampleKernel &
+- operator=(const CLTransposeConvLayerUpsampleKernel &) = delete;
+- /** Default Move Constructor. */
+- CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default;
+- /** Default move assignment operator */
+- CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default;
+- /** Default destructor */
+- ~CLTransposeConvLayerUpsampleKernel() = default;
+-
+- /** Initialise the kernel's input and output.
+- *
+- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
+- * @param[out] output Destination tensor. Data types supported: same as @p input. All but
+- * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only
+- * performed within the XY-plane.
+- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be
+- * filled with zero.
+- * @param[in] info Contains padding and stride information described in @ref
+- * PadStrideInfo.
+- */
+- void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+- const PadStrideInfo &info);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * CLTransposeConvLayerUpsample
+- *
+- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
+- * @param[in] output Destination tensor info. Data types supported: same as @p input. All
+- * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is
+- * only performed within the XY-plane.
+- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled
+- * with zero.
+- * @param[in] info Contains padding and stride information described in @ref
+- * PadStrideInfo.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+- const BorderSize &inner_border, const PadStrideInfo &info);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, cl::CommandQueue &queue) override;
+-
+-private:
+- const ICLTensor *_input;
+- ICLTensor *_output;
+- BorderSize _inner_border;
+- PadStrideInfo _info;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */
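For context, the kernel deleted above was driven through the configure/validate/enqueue calls declared in its header. The sketch below is illustrative only, not part of this patch; the helper name, strides, and tensor shapes are assumptions, and it presumes the usual CLScheduler/CLTensor setup.

// Minimal sketch (not from this patch): driving the removed upsample kernel.
// Assumes CLScheduler::get().default_init() has run and both tensors are allocated
// with shapes that satisfy the XY-plane scaling described in the header above.
#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void upsample_sketch(CLTensor &input, CLTensor &output)
{
  const PadStrideInfo info(2, 2, 0, 0); // stride 2 in x and y, no padding
  const BorderSize inner_border(0, 0);  // no extra top/right zero rows or columns

  // validate() reports problems as a Status instead of asserting, so check it first.
  const Status status = CLTransposeConvLayerUpsampleKernel::validate(input.info(), output.info(),
                                                                     inner_border, info);
  if (status.error_code() != ErrorCode::OK)
    return;

  CLTransposeConvLayerUpsampleKernel kernel;
  kernel.configure(&input, &output, inner_border, info);

  CLScheduler::get().enqueue(kernel); // ICLKernel objects are enqueued; IFunction objects are run()
  CLScheduler::get().sync();
}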
+diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
+deleted file mode 100644
+index d4c9c61..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
++++ /dev/null
+@@ -1,88 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
+-#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
+-
+-#include "arm_compute/core/CPP/ICPPKernel.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** CPP kernel to perform tensor upsample.
+- *
+- */
+-class CPPUpsampleKernelEx : public ICPPKernel
+-{
+-public:
+- const char *name() const override { return "CPPUpsampleKernelEx"; }
+- /** Default constructor */
+- CPPUpsampleKernelEx();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete;
+- /** Allow instances of this class to be moved */
+- CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default;
+- /** Allow instances of this class to be moved */
+- CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default;
+- /** Default destructor */
+- ~CPPUpsampleKernelEx() = default;
+-
+- /** Set the input and output of the kernel.
+- *
+- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
+- * @param[out] output The output tensor. Data types supported: Same as @p input
+- * @param[in] info Padding info.
+- */
+- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, const ThreadInfo &info) override;
+- bool is_parallelisable() const override;
+-
+-private:
+- const ITensor *_input;
+- ITensor *_output;
+- PadStrideInfo _info;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */
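A comparable host-side sketch for the CPP kernel removed above; again this is not part of the patch, the helper name and PadStrideInfo values are placeholders, and scheduling via CPPScheduler is one plausible way to run a standalone ICPPKernel.

// Minimal sketch (not from this patch): host-side use of the removed CPP upsample kernel.
// The tensors are assumed to be allocated with shapes that match the PadStrideInfo below.
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void cpp_upsample_sketch(Tensor &input, Tensor &output)
{
  CPPUpsampleKernelEx kernel;
  kernel.configure(&input, &output, PadStrideInfo(2, 2, 0, 0)); // stride/padding of the transpose conv

  // Run on the host; Window::DimY is the customary split dimension in this code base.
  CPPScheduler::get().schedule(&kernel, Window::DimY);
}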
+diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
+deleted file mode 100644
+index 4e9f097..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
++++ /dev/null
+@@ -1,96 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NECASTKERNEL_H__
+-#define __ARM_COMPUTE_NECASTKERNEL_H__
+-
+-#include "arm_compute/core/NEON/INEKernel.h"
+-#include "arm_compute/core/TypesEx.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Interface for the cast layer kernel. */
+-class NECastKernel : public INEKernel
+-{
+-public:
+- const char *name() const override { return "NECastKernel"; }
+- /** Default constructor */
+- NECastKernel();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NECastKernel(const NECastKernel &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NECastKernel &operator=(const NECastKernel &) = delete;
+- /** Default Move Constructor. */
+- NECastKernel(NECastKernel &&) = default;
+- /** Default move assignment operator */
+- NECastKernel &operator=(NECastKernel &&) = default;
+- /** Default destructor */
+- ~NECastKernel() = default;
+- /** Set input, output tensors.
+- *
+- * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
+- * @param[out] output Destination tensor with the same dimensions as the input. Data type supported:
+- * U8/S8/QASYMM8/U32/S32/F32.
+- * @param[in] input_subtype Sub data type of input.
+- */
+- void configure(const ITensor *input, ITensor *output, SubDataType input_subtype);
+- /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel
+- *
+- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+- * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+- * @param[in] input_subtype Sub data type of input.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+- SubDataType input_subtype);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, const ThreadInfo &info) override;
+-
+-private:
+- const ITensor *_input;
+- ITensor *_output;
+- SubDataType _input_subtype;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */
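Since the cast kernel above exposes a static validate(), a caller could probe support before wiring it into a graph. The sketch below is not from the patch; the function name is hypothetical and the SubDataType value is deliberately left as a parameter because its enumerators live in TypesEx.h.

// Minimal sketch (not from this patch): checking whether a cast is supported before using it.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NECastKernel.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

bool cast_is_supported(const ITensorInfo *src, const ITensorInfo *dst, SubDataType subtype)
{
  // The sub data type further qualifies the input (see TypesEx.h) and depends on the model.
  return NECastKernel::validate(src, dst, subtype).error_code() == ErrorCode::OK;
}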
+diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
+deleted file mode 100644
+index b62897e..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
++++ /dev/null
+@@ -1,96 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
+-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
+-
+-#include "arm_compute/core/NEON/INEKernel.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Interface for the depth to space kernel */
+-class NEDepthToSpaceLayerKernelEx : public INEKernel
+-{
+-public:
+- const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; }
+- /** Default constructor */
+- NEDepthToSpaceLayerKernelEx();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete;
+- /** Allow instances of this class to be moved */
+- NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default;
+- /** Allow instances of this class to be moved */
+- NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default;
+- /** Default destructor */
+- ~NEDepthToSpaceLayerKernelEx() = default;
+- /** Initialise the kernel's inputs and output.
+- *
+- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[out] output Tensor output. Data types supported: same as @p input
+- * @param[in] block_shape Block shape x value.
+- */
+- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NEDepthToSpaceLayerKernelEx.
+- *
+- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] output Tensor output info. Data types supported: same as @p input
+- * @param[in] block_shape Block shape value.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, const ThreadInfo &info) override;
+-
+-private:
+- const ITensor *_input; /**< Source tensor */
+- ITensor *_output; /**< Destination tensor */
+- int32_t _block_shape; /**< Block shape */
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */
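To make the block_shape parameter above concrete, here is a shape-bookkeeping sketch (not part of the patch; the concrete shapes and NCHW-style ordering are assumptions for illustration).

// Minimal sketch (not from this patch): block_shape = 2 turns every group of 4 channels
// into a 2x2 spatial patch, e.g. (W, H, C, N) = (4, 4, 16, 1) -> (8, 8, 4, 1).
#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

Status check_depth_to_space()
{
  const TensorInfo in(TensorShape(4U, 4U, 16U, 1U), 1, DataType::F32);
  const TensorInfo out(TensorShape(8U, 8U, 4U, 1U), 1, DataType::F32);
  return NEDepthToSpaceLayerKernelEx::validate(&in, &out, /*block_shape=*/2);
}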
+diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
+deleted file mode 100644
+index 57de78d..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
++++ /dev/null
+@@ -1,118 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
+-#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
+-
+-#include "arm_compute/core/NEON/INEKernel.h"
+-#include "arm_compute/core/TypesEx.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Interface for an element-wise unary operation kernel
+- *
+- * Element-wise operation is computed by:
+- * @f[ output(x) = OP(input(x))@f]
+- *
+- */
+-class NEElementwiseUnaryKernelEx : public INEKernel
+-{
+-public:
+- const char *name() const override { return "NEElementwiseUnaryKernelEx"; }
+- /** Default constructor */
+- NEElementwiseUnaryKernelEx();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete;
+- /** Allow instances of this class to be moved */
+- NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default;
+- /** Allow instances of this class to be moved */
+- NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default;
+- /** Default destructor */
+- ~NEElementwiseUnaryKernelEx() = default;
+-
+- /** Initialise the kernel's input and output.
+- *
+- * @param[in] op Arithmetic operation to be executed.
+- * @param[in] input First tensor input. Data types supported: F16/F32/S32.
+- * @param[out] output Output tensor. Data types supported: Same as @p input.
+- */
+- void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output);
+-
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NEElementwiseUnaryKernelEx
+- *
+- * @param[in] op Arithmetic operation to be executed.
+- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
+- * @param[in] output Output tensor info. Data types supported: Same as @p input.
+- *
+- * @return a Status
+- */
+- static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input,
+- const ITensorInfo *output);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, const ThreadInfo &info) override;
+-
+- /** Common signature for all the specialised arithmetic functions
+- *
+- * @param[in] input An input tensor. Data types supported: F16/F32/S32.
+- * @param[out] output The output tensor. Data types supported: Same as @p input.
+- * @param[in] window Region on which to execute the kernel.
+- */
+- using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output,
+- const Window &window);
+-
+-protected:
+- // Inherited methods overridden:
+- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output);
+-
+- /** Function to use for the particular tensor types passed to configure() */
+- std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function;
+-
+- const ITensor *_input;
+- ITensor *_output;
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */
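A usage sketch for the unary kernel above (not part of the patch): the helper name is hypothetical, the tensors are assumed to be allocated by the caller, and the operation enumerator is passed in because it is defined in TypesEx.h.

// Minimal sketch (not from this patch): output(x) = OP(input(x)) on CPU tensors.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_unary_ex(ElementWiseUnaryEx op, Tensor &input, Tensor &output)
{
  if (NEElementwiseUnaryKernelEx::validate(op, input.info(), output.info()).error_code() !=
      ErrorCode::OK)
  {
    return;
  }

  NEElementwiseUnaryKernelEx kernel;
  kernel.configure(op, &input, &output);
  NEScheduler::get().schedule(&kernel, Window::DimY); // split along Y, the usual NEON choice
}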
+diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
+deleted file mode 100644
+index 722efd3..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
++++ /dev/null
+@@ -1,100 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__
+-#define __ARM_COMPUTE_NEPRELUKERNEL_H__
+-
+-#include "arm_compute/core/NEON/INEKernel.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Interface for the kernel to perform Parametric Rectified Linear Unit
+- *
+- * Result is computed by:
+- * @f[ output(x) = \begin{cases} \alpha \cdot x, & x < 0 \\ x, & x \ge 0 \end{cases} @f]
+- */
+-class NEPReLUKernel : public INEKernel
+-{
+-public:
+- const char *name() const override { return "NEPReLUKernel"; }
+- /** Default constructor */
+- NEPReLUKernel();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEPReLUKernel(const NEPReLUKernel &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEPReLUKernel &operator=(const NEPReLUKernel &) = delete;
+- /** Allow instances of this class to be moved */
+- NEPReLUKernel(NEPReLUKernel &&) = default;
+- /** Allow instances of this class to be moved */
+- NEPReLUKernel &operator=(NEPReLUKernel &&) = default;
+- /** Initialise the kernel's inputs and output
+- *
+- * @param[in] input Input tensor. Data type supported: QASYMM8/F32
+- * @param[in] alpha Alpha tensor. Data types supported: Same as @p input
+- * @param[out] output Output tensor. Data types supported: Same as @p input
+- */
+- void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, const ThreadInfo &info) override;
+-
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NEPReLUKernel
+- *
+- * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32.
+- * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input.
+- * @param[in] output Output tensor info. Data types supported: Same as @p input.
+- *
+- * @return a Status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha,
+- const ITensorInfo *output);
+- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
+- const ITensorInfo &output);
+-
+-private:
+- const ITensor *_input; /**< Source tensor */
+- const ITensor *_alpha; /**< Alpha tensor */
+- ITensor *_output; /**< Destination tensor */
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */
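Applying the PReLU formula above with a worked number and the declared configure() call; the sketch is illustrative only (helper name and tensor setup are assumptions, not part of the patch).

// Minimal sketch (not from this patch): PReLU with a per-element alpha tensor.
// Following the formula above, with alpha = 0.25: x = -2.0 -> -0.5, and x = 3.0 -> 3.0.
#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_prelu(Tensor &input, Tensor &alpha, Tensor &output)
{
  NEPReLUKernel prelu;
  prelu.configure(&input, &alpha, &output); // all three tensors share one data type (QASYMM8 or F32)
  NEScheduler::get().schedule(&prelu, Window::DimY);
}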
+diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
+deleted file mode 100644
+index 0ffcf6b..0000000
+--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
++++ /dev/null
+@@ -1,97 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
+-#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
+-
+-#include "arm_compute/core/NEON/INEKernel.h"
+-#include "arm_compute/core/Types.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Interface for the space to depth kernel */
+-class NESpaceToDepthLayerKernelEx : public INEKernel
+-{
+-public:
+- const char *name() const override { return "NESpaceToDepthLayerKernelEx"; }
+- /** Default constructor */
+- NESpaceToDepthLayerKernelEx();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete;
+- /** Allow instances of this class to be moved */
+- NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default;
+- /** Allow instances of this class to be moved */
+- NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default;
+- /** Default destructor */
+- ~NESpaceToDepthLayerKernelEx() = default;
+- /** Initialise the kernel's inputs and output.
+- *
+- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[out] output Tensor output. Data types supported: same as @p input
+- * @param[in] block_shape Block shape value
+- */
+- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NESpaceToDepthLayerKernelEx
+- *
+- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] output Tensor output info. Data types supported: same as @p input
+- * @param[in] block_shape Block shape value
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+-
+- // Inherited methods overridden:
+- void run(const Window &window, const ThreadInfo &info) override;
+-
+-private:
+- const ITensor *_input; /**< Source tensor */
+- ITensor *_output; /**< Destination tensor */
+- int32_t _block_shape; /**< Block shape */
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */
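Space-to-depth is the inverse of the depth-to-space case sketched earlier; the shapes below are placeholder values, not from the patch.

// Minimal sketch (not from this patch): block_shape = 2 folds each 2x2 spatial patch into the
// channel axis, e.g. (W, H, C, N) = (8, 8, 4, 1) -> (4, 4, 16, 1) in NCHW-style ordering.
#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

Status check_space_to_depth()
{
  const TensorInfo in(TensorShape(8U, 8U, 4U, 1U), 1, DataType::F32);
  const TensorInfo out(TensorShape(4U, 4U, 16U, 1U), 1, DataType::F32);
  return NESpaceToDepthLayerKernelEx::validate(&in, &out, /*block_shape=*/2);
}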
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+index 97bc4ce..cfbd134 100644
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+@@ -16,25 +16,14 @@
+ #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
+ #define __ARM_COMPUTE_CLFUNCTIONSEX_H__
+
+-#include <arm_compute/runtime/CL/functions/CLArgOperation.h>
+-#include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h>
+ #include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
+-#include <arm_compute/runtime/CL/functions/CLCast.h>
+-#include <arm_compute/runtime/CL/functions/CLDepthToSpace.h>
+ #include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
+ #include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
+ #include <arm_compute/runtime/CL/functions/CLGatherEx.h>
+ #include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
+ #include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
+-#include <arm_compute/runtime/CL/functions/CLLogicalNot.h>
+ #include <arm_compute/runtime/CL/functions/CLNeg.h>
+-#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h>
+-#include <arm_compute/runtime/CL/functions/CLPReLU.h>
+ #include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
+-#include <arm_compute/runtime/CL/functions/CLRNNLayerEx.h>
+-#include <arm_compute/runtime/CL/functions/CLSpaceToDepth.h>
+-#include <arm_compute/runtime/CL/functions/CLSplit.h>
+-#include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h>
+ #include <arm_compute/runtime/CL/functions/CLTopKV2.h>
+ #include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
+
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
+deleted file mode 100644
+index c37096f..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
++++ /dev/null
+@@ -1,129 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-/**
+- * @file CLArgOperation.h
+- * @ingroup COM_AI_RUNTIME
+- * @brief This file contains arm_compute::CLArgOperation class
+- */
+-
+-#ifndef __ARM_COMPUTE_CLARGOPERATION_H__
+-#define __ARM_COMPUTE_CLARGOPERATION_H__
+-
+-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+-#include "arm_compute/runtime/CL/CLTensor.h"
+-#include "arm_compute/runtime/IFunction.h"
+-#include "arm_compute/core/TypesEx.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/**
+- * @brief Class to execute CLArgOperation operation
+- */
+-class CLArgOperation : public IFunction
+-{
+-public:
+- /**
+- * @brief Construct a new CLArgOperation object
+- */
+- CLArgOperation();
+-
+- /**
+- * @brief Prevent instances of this class from being copied (As this class contains pointers)
+- */
+- CLArgOperation(const CLArgOperation &) = delete;
+-
+- /**
+- * @brief Prevent instances of this class from being copied (As this class contains pointers)
+- */
+- CLArgOperation &operator=(const CLArgOperation &) = delete;
+-
+- /**
+- * @brief Construct a new CLArgOperation object by using the move constructor
+- * @param[in] CLArgOperation object to move
+- */
+- CLArgOperation(CLArgOperation &&) = default;
+-
+- /**
+- * @brief Assign a CLArgOperation object.
+- * @param[in] CLArgOperation object to assign. This object will be moved.
+- */
+- CLArgOperation &operator=(CLArgOperation &&) = default;
+-
+- /**
+- * @brief Initialise the kernel's inputs and outputs.
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+- * @param[out] output The result of arg operation. Data types supported: S32.
+- * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+- * @param[in] op Arg operation to perform.
+- * @return N/A
+- */
+- void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, ArgOperation op);
+-
+- /**
+- * @brief Static function to check if given info will lead to a valid configuration
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+- * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+- * @param[out] output The result of arg operation. Data types supported: S32.
+- * @param[in] op Arg operation to perform.
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+- const ITensorInfo *output, ArgOperation op);
+- /**
+- * @brief Run the OpenCL kernel for this operation
+- * @return N/A
+- */
+- void run() override;
+-
+-private:
+- ICLTensor *_input{nullptr};
+- ICLTensor *_output{nullptr};
+- std::vector<uint32_t> _axis{};
+- ArgOperation _arg_op{ArgOperation::MAX};
+-
+- std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+- std::unique_ptr<CLArgOperationKernel[]> _argop_kernels{nullptr};
+- size_t _num_of_kernels{0};
+-};
+-}
+-#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */
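Unlike the raw kernels earlier in this patch, CLArgOperation is an IFunction, so it is configured and then run() directly. The sketch below is not part of the patch; the helper name, axis choice, and tensor setup are assumptions.

// Minimal sketch (not from this patch): arg-max over axis 0 with the function removed above.
// The caller is assumed to have allocated the input and an S32 output of the reduced shape.
#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void argmax_sketch(CLTensor &input, CLTensor &output)
{
  CLArgOperation argmax;
  argmax.configure(&input, &output, {0}, ArgOperation::MAX); // axes must be sorted, no duplicates

  argmax.run();              // IFunction objects are run(), unlike raw kernels
  CLScheduler::get().sync(); // wait for the queued OpenCL work to finish
}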
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
+deleted file mode 100644
+index eed5cb8..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
++++ /dev/null
+@@ -1,69 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+-#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+-
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** Basic function to run @ref CLBatchToSpaceNDKernel
+- *
+- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+- * @note The function converts the input tensor to the tensor of the output tensor's type.
+- */
+-class CLBatchToSpaceND : public ICLSimpleFunction
+-{
+-public:
+- /** Initialise the kernel's input and output.
+- *
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[in] block_size A pointer to an array of integer values specifying block sizes
+- * for the spatial dimensions.
+- */
+- void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size);
+-};
+-
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */
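A short sketch of the block_size argument above (not part of the patch; helper name and block values are placeholders, and the caller is assumed to have allocated tensors whose shapes already reflect the batch-to-space transform).

// Minimal sketch (not from this patch): batch-to-space with a 2x2 block using the removed function.
#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void batch_to_space_sketch(CLTensor &input, CLTensor &output)
{
  const int32_t block_size[] = {2, 2}; // one entry per spatial dimension

  CLBatchToSpaceND b2s;
  b2s.configure(&input, &output, block_size);

  b2s.run();
  CLScheduler::get().sync();
}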
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
+deleted file mode 100644
+index ebe0d8a..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
++++ /dev/null
+@@ -1,75 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-/**
+- * @file CLCast.h
+- * @ingroup COM_AI_RUNTIME
+- * @brief This file contains arm_compute::CLCast class
+- */
+-
+-#ifndef __ARM_COMPUTE_CLCAST_H__
+-#define __ARM_COMPUTE_CLCAST_H__
+-
+-#include "arm_compute/core/TypesEx.h"
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/**
+- * @brief Class to run @ref CLCastKernel.
+- * This converts the input tensor to the tensor of the output tensor's type.
+- */
+-class CLCast : public ICLSimpleFunction
+-{
+-public:
+- /**
+- * @brief Initialise the kernel's input and output
+- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * The input tensor is [in, out] because its TensorInfo might be
+- * modified inside the kernel.
+- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[in] input_subtype Sub data type of input.
+- */
+- void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
+-};
+-}
+-#endif /* __ARM_COMPUTE_CLCAST_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
+deleted file mode 100644
+index d52a538..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
++++ /dev/null
+@@ -1,68 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+-#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+-
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** Basic function to run @ref CLDepthToSpaceKernel
+- *
+- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+- * @note The function converts the input tensor to the tensor of the output tensor's type.
+- */
+-class CLDepthToSpace : public ICLSimpleFunction
+-{
+-public:
+- /** Initialise the kernel's input and output.
+- *
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[in] block_size Block size (integer only).
+- */
+- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+-};
+-} // namespace arm_compute
+-
+-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
+new file mode 100644
+index 0000000..409eaf5
+--- /dev/null
++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
+@@ -0,0 +1,201 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++/*
++ * Copyright (c) 2019-2020 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
++#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
++
++#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
++#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
++#include "arm_compute/runtime/CL/functions/CLReverse.h"
++#include "arm_compute/runtime/CL/functions/CLTranspose.h"
++
++#include "arm_compute/runtime/CL/CLTensor.h"
++#include "arm_compute/runtime/IFunction.h"
++#include "arm_compute/runtime/IMemoryManager.h"
++#include "arm_compute/runtime/MemoryGroup.h"
++
++#include <memory>
++
++namespace arm_compute
++{
++class ICLTensor;
++/** Function to run the deconvolution layer.
++ *
++ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
++ * depending on the stride and pad info and then perform a 1x1
++ * convolution pass. Input stride defines how many zeros we should put between each element of the
++ * input, and pad is the amount of padding.
++ *
++ * The relation between input to output is as follows:
++ * \f[
++ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
++ * \f]
++ * \f[
++ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
++ * \f]
++ *
++ * where:
++ * width_input is the size of the first input dimension.
++ * height_input is the size of the second input dimension.
++ * width_output is the size of the first output dimension.
++ * height_output is the size of the second output dimension.
++ * kernel_x and kernel_y are the convolution sizes in x and y.
++ * stride_x and stride_y are the input strides of the first and second dimensions.
++ *
++ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
++ * Therefore, it will be necessary to use the weights in the
++ * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse.
++ *
++ * This function calls the following OpenCL kernels/functions:
++ *
++ * -# @ref CLDeconvolutionLayerUpsample
++ * -# @ref CLConvolutionLayer
++ *
++ * And the following CPP kernels:
++ * -# @ref CLReverse
++ *
++ */
++class CLDirectTransposeConvLayer : public IFunction
++{
++public:
++ /** Constructor */
++ CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete;
++ /** Default move constructor */
++ CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default;
++ /** Prevent instances of this class from being copied (As this class contains pointers) */
++ CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete;
++ /** Default move assignment operator */
++ CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
++ /** Set the input, weights, biases and output tensors.
++ *
++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
++ * optional 4th dimension for batch of inputs.
++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
++ * supported: Same as @p input.
++ * @param[in] bias (Optional) The biases have one dimension.
++ * Data type supported: Should match @p input data type, except for
++ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
++ * @param[out] output Output tensor. The output has the same number of dimensions as the
++ * @p input.
++ * @param[in] info Contains padding and policies to be used in the deconvolution, this
++ * is described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
++ *
++ */
++ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
++ const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
++ const WeightsInfo &weights_info = WeightsInfo());
++ /** Set the input, weights, biases and output tensors.
++ *
++ * @param[in] compile_context The compile context to be used.
++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
++ * an optional 4th dimension for batch of inputs.
++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
++ * type supported: Same as @p input.
++ * @param[in] bias (Optional) The biases have one dimension.
++ * Data type supported: Should match @p input data type, except for
++ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
++ * @param[out] output Output tensor. The output has the same number of dimensions as
++ * the @p input.
++ * @param[in] info Contains padding and policies to be used in the deconvolution,
++ * this is described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
++ * @param[in] weights_info (Optional) Weights information needed for @ref
++ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
++ * CLWeightsReshapeKernel.
++ *
++ */
++ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
++ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
++ unsigned int invalid_right, unsigned int invalid_bottom,
++ const WeightsInfo &weights_info = WeightsInfo());
++ /** Static function to check if given info will lead to a valid configuration of @ref
++ * CLDirectTransposeConvLayer
++ *
++ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
++ * optional 4th dimension for batch of inputs.
++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
++ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
++ * type supported: Same as @p input.
++ * @param[in] bias (Optional) The biases have one dimension.
++ * Data type supported: Should match @p input data type, except for input
++ * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
++ * @param[in] output Output tensor info. The output has the same number of dimensions as the
++ * @p input.
++ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
++ * described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
++ *
++ * @return a status
++ */
++ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
++ const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
++ unsigned int invalid_right, unsigned int invalid_bottom,
++ const WeightsInfo &weights_info = WeightsInfo());
++
++ // Inherited methods overridden:
++ void run() override;
++ void prepare() override;
++
++private:
++ MemoryGroup _memory_group;
++ CLDeconvolutionLayerUpsample _scale_f;
++ CLConvolutionLayer _conv_f;
++ CLReverse _flip_weights;
++
++ CLTensor _scaled_output;
++ ICLTensor *_original_weights;
++ CLTensor _weights_flipped;
++ CLTensor _flip_axis;
++
++ bool _is_prepared;
++};
++} // namespace arm_compute
++#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */
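To make the output-size relation in the new header concrete, here is a worked number and a usage sketch; it is illustrative only (helper name, strides, and tensor setup are assumptions, not part of the patch).

// Minimal sketch (not from this patch): the size relation quoted above, made concrete.
// With width_input = 7, stride_x = 2, padding_x = 1 and kernel_x = 3:
//   width_output = (7 - 1) * 2 - 2 * 1 + 3 = 13   (and likewise for the height).
#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void transpose_conv_sketch(CLTensor &input, CLTensor &weights, CLTensor &bias, CLTensor &output)
{
  const PadStrideInfo info(2, 2, 1, 1); // stride 2 and padding 1 in both dimensions

  CLDirectTransposeConvLayer deconv;
  deconv.configure(&input, &weights, &bias, &output, info,
                   /*invalid_right=*/0, /*invalid_bottom=*/0);

  deconv.run();              // reverses the weights, upsamples, then runs CLConvolutionLayer
  CLScheduler::get().sync();
}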
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+index 1a0284a..f3266f6 100644
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+@@ -50,7 +50,7 @@
+ #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+ #include "arm_compute/runtime/MemoryGroup.h"
+ #include "arm_compute/runtime/CL/CLTensor.h"
+-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
++#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+
+ namespace arm_compute
+ {
+@@ -168,7 +168,7 @@ private:
+ CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel;
+ CLScaleFactorSymm8Kernel _scale_factor_kernel;
+ CLQuantizationSymmetricKernel _quant_input_kernel;
+- CLGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
++ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ CLMultiplyScaleFactorKernel _multiply_scale_kernel;
+ CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to
+ // add bias in
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h
+deleted file mode 100644
+index 68aba74..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h
++++ /dev/null
+@@ -1,142 +0,0 @@
+-/*
+- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+-
+-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
+-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+-#include "arm_compute/runtime/CL/CLTensor.h"
+-#include "arm_compute/runtime/IFunction.h"
+-#include "arm_compute/runtime/MemoryGroup.h"
+-
+-namespace arm_compute
+-{
+-class IMemoryManager;
+-class ICLTensor;
+-
+-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the
+- * following OpenCL kernels:
+- *
+- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of
+- * GEMMInfo is FALSE)
+- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
+- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
+- *
+-*/
+-class CLGEMMLowpMatrixMultiplyCoreEx : public IFunction
+-{
+-public:
+- /** Constructor */
+- CLGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLGEMMLowpMatrixMultiplyCoreEx(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete;
+- /** Default move constructor */
+- CLGEMMLowpMatrixMultiplyCoreEx(CLGEMMLowpMatrixMultiplyCoreEx &&) = default;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLGEMMLowpMatrixMultiplyCoreEx &operator=(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete;
+- /** Default move assignment operator */
+- CLGEMMLowpMatrixMultiplyCoreEx &operator=(CLGEMMLowpMatrixMultiplyCoreEx &&) = default;
+- /** Initialise the kernel's inputs, output
+- *
+- * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
+- * This kernel performs the following computations:
+- *
+- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+- * -# Compute the matrix product of the resulting a * b in int32.
+- * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
+- *
+- * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
+- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
+- * S32
+- * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if
+- * gemm_info.gemmlowp_output_stage != NONE
+- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+- * and
+- * if the reshape of matrix B should be executed only for the first run
+- */
+- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output,
+- const GEMMInfo &gemm_info = GEMMInfo());
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * CLGEMMLowpMatrixMultiplyCoreEx
+- *
+- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8.
+- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
+- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
+- * supported: S32
+- * @param[in] output Output tensor info. Data type supported: S32 or QASYMM8 if
+- * gemm_info.gemmlowp_output_stage != NONE
+- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+- * and
+- * if the reshape of matrix B should be executed only for the first run
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
+- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+-
+- // Inherited methods overridden:
+- void run() override;
+- void prepare() override;
+-
+-private:
+- MemoryGroup _memory_group;
+-
+- // Kernels used
+- CLGEMMLowpMatrixMultiplyKernelEx _mm_midgard_kernel;
+- CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+- CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+-
+- // Temporary tensors
+- CLTensor _vector_sum_col;
+- CLTensor _vector_sum_row;
+-
+- int32_t _a_offset;
+- int32_t _b_offset;
+- bool _reshape_b_only_on_first_run;
+- bool _is_prepared;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
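
The removed doc comment above describes the GEMMLowp core computation: widen each QASYMM8 value to int32, add the per-matrix offset, and accumulate the product in int32. A plain scalar reference of that arithmetic, with hypothetical row-major layouts and an invented function name, is sketched below; it illustrates the math only, not the deleted OpenCL implementation.

#include <cstdint>
#include <vector>

// Scalar reference of the GEMMLowp core step described above: (a + a_offset) * (b + b_offset)
// accumulated in int32. Matrix layouts and shapes are illustrative.
std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, // M x K, row-major
                                        const std::vector<uint8_t> &b, // K x N, row-major
                                        int M, int N, int K, int32_t a_offset, int32_t b_offset)
{
  std::vector<int32_t> out(static_cast<size_t>(M) * N, 0);
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n)
    {
      int32_t acc = 0;
      for (int k = 0; k < K; ++k)
        acc += (static_cast<int32_t>(a[m * K + k]) + a_offset) *
               (static_cast<int32_t>(b[k * N + n]) + b_offset);
      out[m * N + n] = acc; // quantization back to uint8 (if requested) would follow here
    }
  return out;
}
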
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
+deleted file mode 100644
+index 5121671..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
++++ /dev/null
+@@ -1,62 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__
+-#define __ARM_COMPUTE_CLLOGICALNOT_H__
+-
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-class CLLogicalNot : public ICLSimpleFunction
+-{
+-public:
+- /** Initialise the function's source and destination.
+- *
+- * @param[in] input Source tensor. Data types supported: QASYMM8.
+- * @param[out] output Output tensor. Data types supported: QASYMM8.
+- */
+- void configure(ICLTensor *input, ICLTensor *output);
+-};
+-
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
+deleted file mode 100644
+index 7fbe558..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
++++ /dev/null
+@@ -1,64 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLPRELU_H__
+-#define __ARM_COMPUTE_CLPRELU_H__
+-
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-class CLPReLU : public ICLSimpleFunction
+-{
+-public:
+- /** Initialise the function's source and destination.
+- *
+- * @param[in] input. Data types supported:
+- * QASYMM8/F16/F32.
+- * @param[in] alpha. Data types supported:
+- * QASYMM8/F16/F32.
+- * @param[out] output Output tensor. Data types supported: Same as @p input.
+- */
+- void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_CLPRELU_H__*/
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+deleted file mode 100644
+index e83fb01..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
++++ /dev/null
+@@ -1,103 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-/**
+- * @file CLPixelWiseDivision.h
+- * @ingroup COM_AI_RUNTIME
+- * @brief This file contains arm_compute::CLPixelWiseDivision class
+- */
+-#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+-#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+-
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/**
+- * @brief Class to run @ref CLPixelWiseDivisionKernel.
+- */
+-class CLPixelWiseDivision : public ICLSimpleFunction
+-{
+-public:
+- /**
+- * @brief Initialise the kernel's inputs, output and convertion policy.
+- * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
+- * The input tensor is [in, out] because its TensorInfo might be
+- * modified inside the kernel in case of broadcasting of dimension 0.
+- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+- * The input tensor is [in, out] because its TensorInfo might be
+- * modified inside the kernel in case of broadcasting of dimension 0.
+- * @param[out] output The output tensor, Data types supported: same as @p input1.
+- * Note: U8 requires both inputs to be U8.
+- * @param[in] scale Scale to apply after multiplication.
+- * Scale must be positive and its value must be either 1/255 or
+- * 1/2^n where n is between 0 and 15.
+- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
+- * even.
+- * @return N/A
+- */
+- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
+- ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+-
+- /**
+- * @brief Static function to check if given info will lead to a valid configuration of @ref
+- * CLPixelWiseDivision
+- * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
+- * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+- * @param[in] output The output tensor info, Data types supported: same as @p input1.
+- * Note: U8 requires both inputs to be U8.
+- * @param[in] scale Scale to apply after multiplication.
+- * Scale must be positive and its value must be either 1/255 or 1/2^n
+- * where n is between 0 and 15.
+- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+- const ITensorInfo *output, float scale = 1.f,
+- ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+-};
+-}
+-#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
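
As a small aside, the scale constraint quoted in the removed configure()/validate() comments (positive, and equal to 1/255 or 1/2^n with n between 0 and 15) can be written as a tiny helper; the function name and the exact float comparisons are illustrative assumptions only.

// Hypothetical checker for the scale constraint stated in the removed comment above.
bool is_valid_pixelwise_scale(float scale)
{
  if (scale == 1.f / 255.f)
    return true;
  for (int n = 0; n <= 15; ++n)
    if (scale == 1.f / static_cast<float>(1 << n)) // 1, 1/2, 1/4, ..., 1/32768
      return true;
  return false;
}
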
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
+deleted file mode 100644
+index b49cbd8..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
++++ /dev/null
+@@ -1,120 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__
+-#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__
+-
+-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** Basic function to run @ref CLRNNLayerEx */
+-class CLRNNLayerEx : public IFunction
+-{
+-public:
+- /** Default constructor */
+- CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+- /** Initialize the function
+- *
+- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+- * types supported: F16/F32
+- * @param[in] weights Weights tensor of shape [input_size, num_units] that
+- * multiplies the input. Data types supported: Same as @p input
+- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
+- * the current 'state'. Data types supported: Same as @p input
+- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
+- * as @p input
+- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in] info Activation layer parameter.
+- */
+- void configure(const ICLTensor *input, const ICLTensor *weights,
+- const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state,
+- ICLTensor *output, ActivationLayerInfo &info);
+- /** Initialize the function
+- *
+- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+- * types supported: F16/F32
+- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
+- * the input. Data types supported: Same as @p input
+- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
+- * current 'state'. Data types supported: Same as @p input
+- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
+- * input
+- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in] info Activation layer parameter.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+- const ITensorInfo *hidden_state, const ITensorInfo *output,
+- const ActivationLayerInfo &info);
+-
+- // Inherited methods overridden:
+- void run() override;
+- void prepare() override;
+-
+-private:
+- MemoryGroup _memory_group;
+- CLGEMM _gemm_state_f;
+- CLSaturatedArithmeticOperationKernel _add_kernel;
+- CLActivationLayerKernel _activation_kernel;
+- CLFullyConnectedLayer _fully_connected_kernel;
+- CLCopyKernel _copy_kernel;
+- CLTensor _fully_connected_out;
+- CLTensor _gemm_output;
+- CLTensor _add_output;
+- bool _is_prepared;
+-};
+-}
+-#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */
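
The shapes listed in the removed comments ([input_size, batch_size] input, [input_size, num_units] weights, [num_units, num_units] recurrent weights) correspond to a standard simple-RNN step. A scalar sketch for a single sample, assuming tanh as the activation and ignoring batching, is shown below; it illustrates the math implied by those shapes, not the deleted CL implementation.

#include <cmath>
#include <vector>

// One simple-RNN step: h_new = tanh(W * x + R * h + bias). W[u][i] is the weight from
// input i to unit u; layouts are transposed relative to the tensor shapes for clarity.
std::vector<float> rnn_step(const std::vector<float> &x,              // [input_size]
                            const std::vector<float> &h,              // [num_units]
                            const std::vector<std::vector<float>> &W, // [num_units][input_size]
                            const std::vector<std::vector<float>> &R, // [num_units][num_units]
                            const std::vector<float> &bias)           // [num_units]
{
  std::vector<float> h_new(h.size());
  for (size_t u = 0; u < h.size(); ++u)
  {
    float acc = bias[u];
    for (size_t i = 0; i < x.size(); ++i)
      acc += W[u][i] * x[i];
    for (size_t j = 0; j < h.size(); ++j)
      acc += R[u][j] * h[j];
    h_new[u] = std::tanh(acc); // both hidden_state and output take this value
  }
  return h_new;
}
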
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
+deleted file mode 100644
+index 2090b46..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
++++ /dev/null
+@@ -1,68 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+-#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+-
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** Basic function to run @ref CLSpaceToDepthKernel
+- *
+- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+- * @note The function converts the input tensor to the tensor of the output tensor's type.
+- */
+-class CLSpaceToDepth : public ICLSimpleFunction
+-{
+-public:
+- /** Initialise the kernel's input and output.
+- *
+- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+- * @param[block_size] block size integer only
+- */
+- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+-};
+-
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
+deleted file mode 100644
+index 03edd15..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
++++ /dev/null
+@@ -1,81 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-/**
+- * @file CLStridedSlice.h
+- * @ingroup COM_AI_RUNTIME
+- * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class
+- */
+-
+-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+-#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+-
+-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/**
+- * @brief Class to run @ref CLStridedSliceKernel
+- */
+-class CLStridedSliceEx : public ICLSimpleFunction
+-{
+-public:
+- /**
+- * @brief Initialise the kernel's inputs and outputs
+- * @param[in] input Tensor input. Data type supported:
+- * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+- * @param[out] output Output tensor. Data type supported: Same as @p input
+- * @param[in] beginData 'begin' vector of strided slice operation
+- * @param[in] endData 'end' vector of strided slice operation
+- * @param[in] stridesData 'strides' vector of strided slice operation
+- * @param[in] beginMask If the ith bit is set, begin[i] is ignored
+- * @param[in] endMask If the ith bit is set, end[i] is ignored
+- * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
+- * dimensionality by 1, taking on the value at index begin[i]
+- * @return N/A
+- */
+- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+- int32_t shrinkAxisMask);
+-};
+-}
+-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
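
To make the begin/end/strides and mask semantics in the removed comment concrete, here is a hypothetical 1-D reference (positive strides only): a set bit in begin_mask or end_mask makes the corresponding bound fall back to the full range.

#include <cstdint>
#include <vector>

// 1-D illustration of strided slice with begin/end masking; not the CL kernel.
std::vector<float> strided_slice_1d(const std::vector<float> &in, int32_t begin, int32_t end,
                                    int32_t stride, bool begin_masked, bool end_masked)
{
  const int32_t size = static_cast<int32_t>(in.size());
  const int32_t b = begin_masked ? 0 : begin;   // begin ignored when its mask bit is set
  const int32_t e = end_masked ? size : end;    // end ignored when its mask bit is set
  std::vector<float> out;
  for (int32_t i = b; i < e; i += stride)
    out.push_back(in[i]);
  return out;
}
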
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
+index 54a697e..5fb102e 100644
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
+@@ -15,7 +15,7 @@
+ */
+
+ /*
+- * Copyright (c) 2017-2018 ARM Limited.
++ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+@@ -37,16 +37,11 @@
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+-
+ #ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+ #define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+
+-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+-
+-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+-
+-#include "arm_compute/runtime/CL/CLTensor.h"
++#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
++#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
+ #include "arm_compute/runtime/IFunction.h"
+ #include "arm_compute/runtime/IMemoryManager.h"
+
+@@ -54,119 +49,102 @@
+
+ namespace arm_compute
+ {
+-class ICLTensor;
+-/** Function to run the transpose convolution layer.
+- *
+- * @note This layer was copied in order to fix a bug computing to wrong output dimensions.
+- *
+- * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input
+- * depending on the stride and pad info and then perform a 1x1
+- * convolution pass. Input stride defines how many zeroes we should put between each element of the
+- * input, pad is the amount of padding and finally a is a user
+- * specified value where a < stride - 1, that increases the padding top and right of the input
+- * image.
+- *
+- * The relation between input to output is as follows:
+- * \f[
+- * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x
+- * \f]
+- * \f[
+- * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y
+- * \f]
+- *
+- * where:
+- * width_input is the size of the first input dimension.
+- * height_input is the size of the second input dimension.
+- * width_output is the size of the first output dimension.
+- * height_output is the size of the second output dimension.
+- * kernel_x and kernel_y are the convolution sizes in x and y.
+- * stride_x and stride_y is the input stride of the first and second dimension.
+- *
+- * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+- * Therefore, it will be necessary to use the weights in the
+- * reverse order to perform an actual convolution. This is achieved by using the @ref
+- * CPPFlipWeightsKernel.
+- *
+- * This function calls the following OpenCL kernels/functions:
+- *
+- * -# @ref CLTransposeConvLayerUpsample
+- * -# @ref CLConvolutionLayer
++/** Basic function to compute the deconvolution layer. This function calls the following OpenCL
++ * kernels/functions:
+ *
++ * -# @ref CLGEMMDeconvolutionLayer
++ * -# @ref CLDirectTransposeConvLayer
+ */
+ class CLTransposeConvLayer : public IFunction
+ {
+ public:
+- /** Constructor */
++ /** Default constructor */
+ CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLTransposeConvLayer(const CLTransposeConvLayer &) = delete;
+- /** Default move constructor */
+- CLTransposeConvLayer(CLTransposeConvLayer &&) = default;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete;
+- /** Default move assignment operator */
+- CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default;
++
+ /** Set the input, weights, biases and output tensors.
+ *
+- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+- * and an optional 4th dimension for batch of inputs.
+- * Data types supported: QASYMM8/F16/F32.
+- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+- * Data type supported: Same as @p input.
+- * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+- * Same as @p input.
+- * @param[out] output Output tensor. The output has the same number of dimensions
+- * as the @p input.
+- * @param[in] info Contains padding and policies to be used in the
+- * transpose convolution, this is decribed in @ref PadStrideInfo.
+- * @param[in] invalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
+- * @param[in] weights_info (Optional) Weights information needed for @ref
+- * CLConvolutionLayer, specifies if the weights tensor has been
+- * reshaped with @ref CLWeightsReshapeKernel.
++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
++ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
++ * supported: Same as @p input.
++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same
++ * as @p input.
++ * @param[out] output Output tensor. The output has the same number of dimensions as the
++ * @p input.
++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this
++ * is described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
++ *
+ */
+ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+- const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
++ const PadStrideInfo &deconv_info, unsigned int invalid_right,
++ unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo());
++ /** Set the input, weights, biases and output tensors.
++ *
++ * @param[in] compile_context The compile context to be used.
++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
++ * an optional 4th dimension for batch of inputs. Data types supported:
++ * QASYMM8_SIGNED/QASYMM8/F16/F32.
++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
++ * type supported: Same as @p input.
++ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
++ * Same as @p input.
++ * @param[out] output Output tensor. The output has the same number of dimensions as
++ * the @p input.
++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution,
++ * this is described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
++ * @param[in] weights_info (Optional) Weights information needed for @ref
++ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
++ * CLWeightsReshapeKernel.
++ *
++ */
++ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
++ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
++ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+- * CLTransposeConvLayer
++ * CLTransposeConvLayer
++ *
++ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
++ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
++ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
++ * type supported: Same as @p input.
++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as
++ * @p input.
++ * @param[in] output Output tensor info. The output has the same number of dimensions as the
++ * @p input.
++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is
++ * described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+- * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
+- * and an optional 4th dimension for batch of inputs.
+- * Data types supported: QASYMM8/F16/F32.
+- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+- * Data type supported: Same as @p input.
+- * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+- * Same as @p input.
+- * @param[in] output Output tensor info. The output has the same number of dimensions
+- * as the @p input.
+- * @param[in] info Contains padding and policies to be used in the
+- * transpose convolution, this is decribed in @ref PadStrideInfo.
+- * @param[in] innvalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
+- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+- * specifies if the weights tensor has been reshaped with @ref
+- * CLWeightsReshapeKernel.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+- const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+- unsigned int innvalid_right, unsigned int invalid_bottom,
++ const ITensorInfo *bias, ITensorInfo *output,
++ const PadStrideInfo &deconv_info, unsigned int invalid_right,
++ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+
++ static DeconvolutionMethod
++ get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights,
++ const ITensorInfo *bias, ITensorInfo *output,
++ const PadStrideInfo &deconv_info, unsigned int invalid_right,
++ unsigned int invalid_bottom, const WeightsInfo &weights_info);
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+ private:
+- MemoryGroup _memory_group;
+- CLTransposeConvLayerUpsample _scale_f;
+- CLConvolutionLayer _conv_f;
+- CPPFlipWeightsKernel _flip_weights;
+- CLTensor _scaled_output;
+- ICLTensor *_original_weights;
+- CLTensor _weights_flipped;
+- bool _is_prepared;
++ std::shared_ptr<IMemoryManager> _memory_manager;
++ std::unique_ptr<IFunction> _function;
+ };
+-}
++} // namespace arm_compute
+ #endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
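
A minimal usage sketch of the reworked CLTransposeConvLayer interface above, assuming the standard arm_compute runtime pieces (CLScheduler, CLTensor, TensorInfo); the shapes, the stride and the zero invalid_right/invalid_bottom values are illustrative assumptions, not taken from this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"

using namespace arm_compute;

int main()
{
  CLScheduler::get().default_init(); // set up the default OpenCL context and queue

  CLTensor input, weights, bias, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(10U, 10U, 32U), 1, DataType::F32));

  CLTransposeConvLayer transpose_conv;
  transpose_conv.configure(&input, &weights, &bias, &output,
                           PadStrideInfo(1, 1, 0, 0), // deconv_info: stride 1, no padding
                           0 /*invalid_right*/, 0 /*invalid_bottom*/);

  input.allocator()->allocate();
  weights.allocator()->allocate();
  bias.allocator()->allocate();
  output.allocator()->allocate();

  // ... map the tensors and fill input/weights/bias here ...

  transpose_conv.run();
  CLScheduler::get().sync();
  return 0;
}
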
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
+deleted file mode 100644
+index 7570fe7..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
++++ /dev/null
+@@ -1,102 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+-
+-#include "arm_compute/runtime/IFunction.h"
+-
+-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/runtime/IFunction.h"
+-#include "arm_compute/runtime/IMemoryManager.h"
+-
+-namespace arm_compute
+-{
+-class ICLTensor;
+-
+-/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */
+-class CLTransposeConvLayerUpsample : public IFunction
+-{
+-public:
+- /** Default constructor */
+- CLTransposeConvLayerUpsample();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete;
+- /** Allow instances of this class to be moved */
+- CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default;
+- /** Allow instances of this class to be moved */
+- CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default;
+- /** Default destructor */
+- virtual ~CLTransposeConvLayerUpsample() = default;
+-
+- /** Initialize the function's source, destination, interpolation type and border_mode.
+- *
+- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32.
+- * @param[out] output Destination tensor. Data type supported: same as @p input.
+- * @param[in] inner_border The number of zeros added to right and top edges of the input.
+- * @param[in] info Contains padding and policies to be used in the deconvolution.
+- */
+- void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+- const PadStrideInfo &info);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * CLTransposeConvLayerUpsample
+- *
+- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+- * @param[in] output Destination tensor info. Data type supported: same as @p input.
+- * @param[in] inner_border The number of zeros added to right and top edges of the input.
+- * @param[in] info Contains padding and policies to be used in the deconvolution.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+- const BorderSize &inner_border, const PadStrideInfo &info);
+-
+- // Inherited methods overridden:
+- void run() override;
+-
+-private:
+- CLTransposeConvLayerUpsampleKernel _upsample;
+- ICLTensor *_output;
+-};
+-}
+-#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
+deleted file mode 100644
+index 666afef..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
++++ /dev/null
+@@ -1,65 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
+-#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
+-
+-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
+-
+-#include "arm_compute/core/Types.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Basic function to run @ref CPPUpsample */
+-class CPPUpsampleEx : public ICPPSimpleFunction
+-{
+-public:
+- /** Configure the upsample CPP kernel
+- *
+- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
+- * @param[out] output The output tensor. Data types supported: Same as @p input
+- * @param[in] info Padding information
+- */
+- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
+-};
+-}
+-#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+index 49504fd..3fad230 100644
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+@@ -18,20 +18,13 @@
+
+ #include <arm_compute/runtime/NEON/functions/NEActivationLayerEx.h>
+ #include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
+-#include <arm_compute/runtime/NEON/functions/NECast.h>
+-#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h>
+ #include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
+ #include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
+ #include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
+ #include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
+ #include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
+-#include <arm_compute/runtime/NEON/functions/NEPReLU.h>
+-#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h>
+ #include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
+-#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h>
+ #include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
+-#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h>
+-#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h>
+ #include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
+
+ #endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
+deleted file mode 100644
+index f0f0d81..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
++++ /dev/null
+@@ -1,79 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NECAST_H__
+-#define __ARM_COMPUTE_NECAST_H__
+-
+-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+-
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/TypesEx.h"
+-
+-namespace arm_compute
+-{
+-// Forward declarations
+-class ITensor;
+-
+-/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */
+-class NECast : public INESimpleFunctionNoBorder
+-{
+-public:
+- /** Configure the kernel.
+- *
+- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+- * @param[out] output Destination tensor with the same dimensions of input. Data type supported:
+- * U8/S8/QASYMM8/U32/S32/F32.
+- * @param[in] input_subtype Sub data type of input.
+- */
+- void configure(const ITensor *input, ITensor *output,
+- SubDataType input_subtype = SubDataType::NONE);
+- /** Static function to check if given info will lead to a valid configuration of @ref NECast
+- *
+- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+- * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
+- * @param[in] input_subtype Sub data type of input.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+- SubDataType input_subtype = SubDataType::NONE);
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NECAST_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
+deleted file mode 100644
+index 005d85a..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
++++ /dev/null
+@@ -1,78 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
+-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
+-
+-#include "arm_compute/runtime/IFunction.h"
+-
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */
+-class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder
+-{
+-public:
+- /** Set the input and output tensors.
+- *
+- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[out] output Tensor output. Data types supported: same as @p input
+- * @param[in] block_shape Block shape value.
+- */
+- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NEDepthToSpaceLayerEx.
+- *
+- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] output Tensor output info. Data types supported: same as @p input
+- * @param[in] block_shape Block shape x value.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */
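
A minimal usage sketch of the interface deleted above, under the usual arm_compute runtime conventions (Tensor/TensorInfo setup, validate before configure, allocate before run). The shapes and the block size of 2 are placeholders rather than values taken from this patch; a real call must keep the input depth divisible by block_shape * block_shape.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_depth_to_space_example()
    {
      Tensor input, output;
      // Assumed default NCHW ordering: TensorShape(width, height, channels, batches).
      input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U, 1U), 1, DataType::F32));

      NEDepthToSpaceLayerEx d2s;
      // validate() checks the same tensor info configure() would receive and returns a Status.
      if (bool(NEDepthToSpaceLayerEx::validate(input.info(), output.info(), 2)))
      {
        d2s.configure(&input, &output, 2);
        input.allocator()->allocate();
        output.allocator()->allocate();
        d2s.run();
      }
    }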
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
+deleted file mode 100644
+index 27a38e9..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
++++ /dev/null
+@@ -1,70 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
+-#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
+-
+-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Basic function to perform negative on an input tensor. */
+-class NENegLayer : public INESimpleFunction
+-{
+-public:
+- /** Initialize the function
+- *
+- * @param[in] input Input tensor. Data types supported: F16/F32/S32.
+- * @param[out] output Output tensor. Data types supported: same as @p input.
+- */
+- void configure(const ITensor *input, ITensor *output);
+- /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer
+- *
+- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
+- * @param[in] output Output tensor info. Data types supported: Same as @p input.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+index 39c57eb..56548a4 100644
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+@@ -46,7 +46,7 @@
+ #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+ #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+ #include "arm_compute/runtime/MemoryGroup.h"
+-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
++#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+ #include "arm_compute/runtime/Tensor.h"
+
+@@ -164,7 +164,7 @@ private:
+ MemoryGroup _memory_group;
+ NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
+ NEQuantizationSymmetricKernel _quant_input_kernel;
+- NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
++ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ NEMultiplyScaleFactorKernel _multiply_scale_kernel;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _reshape_weights_output;
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
+deleted file mode 100644
+index d844513..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
++++ /dev/null
+@@ -1,170 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+-#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+-
+-#include "arm_compute/core/NEON/INEKernel.h"
+-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+-#include "arm_compute/runtime/IFunction.h"
+-#include "arm_compute/runtime/IMemoryManager.h"
+-#include "arm_compute/runtime/MemoryGroup.h"
+-// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+-#include "arm_compute/runtime/Tensor.h"
+-
+-#include <memory>
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following
+- * NEON kernels if the DOT product instruction is not available:
+- *
+- * -# @ref NEGEMMInterleave4x4Kernel
+- * -# @ref NEGEMMTranspose1xWKernel
+- * -# @ref NEGEMMLowpMatrixMultiplyKernel
+- * -# @ref NEGEMMLowpOffsetContributionKernel
+- * -# @ref NEActivationLayer
+- *
+- * otherwise if the DOT product instruction is available:
+- *
+- * -# @ref NEGEMMLowpOffsetContributionKernel
+- *
+-*/
+-class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction
+-{
+-public:
+- /** Constructor */
+- NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
+- /** Default move constructor */
+- NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
+- /** Default move assignment operator */
+- NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
+- /** Initialise the kernel's inputs, output
+- *
+- * @note GEMM_LOWP: low precision GEMM kernel
+- * This kernel performs the following computations:
+- *
+- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+- * -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
+- * -# Compute the matrix product of the resulting a * b in int32.
+- *
+- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
+- * QASYMM8/QASYMM8_SIGNED otherwise
+- *
+- * @param[in] a First input tensor (Matrix A). Data type supported:
+- * QASYMM8/QASYMM8_SIGNED.
+- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
+- * S32
+- * @param[out] output Output tensor. Data type supported: Data type supported:
+- * S32/QASYMM8/QASYMM8_SIGNED
+- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+- * and
+- * if the reshape of matrix B should be executed only for the first run
+- */
+- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output,
+- const GEMMInfo &gemm_info = GEMMInfo());
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NEGEMMLowpMatrixMultiplyCoreEx
+- *
+- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
+- * QASYMM8/QASYMM8_SIGNED otherwise
+- *
+- * @param[in] a First input tensor info (Matrix A). Data type supported:
+- * QASYMM8/QASYMM8_SIGNED.
+- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
+- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
+- * supported: S32
+- * @param[in] output Output tensor info. Data type supported: Data type supported:
+- * S32/QASYMM8/QASYMM8_SIGNED
+- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+- * and
+- * if the reshape of matrix B should be executed only for the first run
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
+- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+-
+- // Inherited methods overridden
+- void run() override;
+- void prepare() override;
+-
+-private:
+- MemoryGroup _memory_group;
+- NEGEMMAssemblyDispatch _asm_glue;
+- std::unique_ptr<INEKernel> _mm_kernel;
+- std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
+- std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
+- NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+- NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+- NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
+- NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
+-
+- Tensor _vector_sum_col;
+- Tensor _vector_sum_row;
+- Tensor _tmp_a;
+- Tensor _tmp_b;
+- Tensor _mm_result_s32;
+- Tensor _signed_a;
+- Tensor _signed_output;
+- const ITensor *_original_b;
+- int32_t _a_offset;
+- int32_t _b_offset;
+-
+- bool _run_vector_matrix_multiplication;
+- bool _assembly_path;
+- bool _fused_assembly_path;
+- bool _reshape_b_only_on_first_run;
+- bool _is_prepared;
+- bool _fuse_output_stage;
+- bool _flip_signedness;
+-};
+-} // namespace arm_compute
+-#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
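
The Doxygen in the header removed above describes the low-precision GEMM as: widen the QASYMM8 values of a and b to int32, add a_offset and b_offset, and accumulate the products in int32 (the S32 output case with GEMMLowpOutputStageType::NONE). The standalone loop below restates that arithmetic as a plain reference; it is not the NEON/assembly path, and the row-major layout is an assumption for illustration.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Reference form of the GEMMLowp arithmetic documented above:
    // out[m][n] = sum_k (a[m][k] + a_offset) * (b[k][n] + b_offset), accumulated in int32.
    std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, // M x K, row-major
                                            const std::vector<uint8_t> &b, // K x N, row-major
                                            int M, int N, int K,
                                            int32_t a_offset, int32_t b_offset)
    {
      std::vector<int32_t> out(static_cast<std::size_t>(M) * N, 0);
      for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n)
          for (int k = 0; k < K; ++k)
            out[m * N + n] += (static_cast<int32_t>(a[m * K + k]) + a_offset) *
                              (static_cast<int32_t>(b[k * N + n]) + b_offset);
      return out;
    }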
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
+deleted file mode 100644
+index ca84133..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
++++ /dev/null
+@@ -1,63 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEPRELU_H__
+-#define __ARM_COMPUTE_NEPRELU_H__
+-
+-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Basic function to run @ref NEPReLUKernel */
+-class NEPReLU : public INESimpleFunctionNoBorder
+-{
+-public:
+- /** Initialise the kernel's inputs and output
+- *
+- * @param[in] input. Data types supported: QASYMM8/F32.
+- * @param[in] alpha. Data types supported: Same as @p input.
+- * @param[out] output Output tensor. Data types supported: Same as @p input.
+- */
+- void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NEPRELU_H__ */
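
Element-wise, the removed NEPReLU computes the standard parametric ReLU. A float-only reference follows; the QASYMM8 path additionally carries quantization scale/offset handling, and alpha broadcasting is simplified to per-element indexing here.

    #include <cstddef>

    // out[i] = in[i] when in[i] >= 0, otherwise alpha[i] * in[i].
    void prelu_reference(const float *input, const float *alpha, float *output, std::size_t n)
    {
      for (std::size_t i = 0; i < n; ++i)
        output[i] = input[i] >= 0.0f ? input[i] : alpha[i] * input[i];
    }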
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
+deleted file mode 100644
+index 8a7b179..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
++++ /dev/null
+@@ -1,130 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__
+-#define __ARM_COMPUTE_NERNNLAYER_EX_H__
+-
+-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+-
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+-
+-namespace arm_compute
+-{
+-// Forward declarations
+-class ITensor;
+-
+-/** Basic function to run @ref NERNNLayerEx */
+-class NERNNLayerEx : public IFunction
+-{
+-public:
+- /** Default constructor */
+- NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NERNNLayerEx(const NERNNLayerEx &) = delete;
+- /** Default move constructor */
+- NERNNLayerEx(NERNNLayerEx &&) = default;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NERNNLayerEx &operator=(const NERNNLayerEx &) = delete;
+- /** Default move assignment operator */
+- NERNNLayerEx &operator=(NERNNLayerEx &&) = default;
+- /** Initialize the function
+- *
+- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+- * types supported: F16/F32
+- * @param[in] weights Weights tensor of shape [input_size, num_units] that
+- * multiplies the input. Data types supported: Same as @p input
+- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
+- * the current 'state'. Data types supported: Same as @p input
+- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
+- * as @p input
+- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in] info Activation layer parameter.
+- */
+- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights,
+- const ITensor *bias, ITensor *hidden_state, ITensor *output,
+- ActivationLayerInfo &info);
+- /** Initialize the function
+- *
+- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+- * types supported: F16/F32
+- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
+- * the input. Data types supported: Same as @p input
+- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
+- * current 'state'. Data types supported: Same as @p input
+- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
+- * input
+- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+- * supported: Same as @p input
+- * @param[in] info Activation layer parameter.
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+- const ITensorInfo *hidden_state, const ITensorInfo *output,
+- const ActivationLayerInfo &info);
+-
+- // Inherited methods overridden:
+- void run() override;
+- void prepare() override;
+-
+-private:
+- MemoryGroup _memory_group;
+- NEGEMM _gemm_state_f;
+- NEArithmeticAdditionKernel _add_kernel;
+- NEActivationLayerKernel _activation_kernel;
+- NEFullyConnectedLayer _fully_connected_kernel;
+- NECopyKernel _copy_kernel;
+- Tensor _fully_connected_out;
+- Tensor _gemm_output;
+- Tensor _add_output;
+- bool _is_prepared;
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */
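
The tensors documented in the removed NERNNLayerEx::configure correspond to a basic (vanilla) RNN step. The sketch below is an assumed reference of that update only, with tanh standing in for whatever ActivationLayerInfo selects and a single batch element for brevity; it is not the code the member kernels actually run.

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // out = act(weights * input + recurrent_weights * hidden_state + bias);
    // hidden_state is then overwritten with out, matching its in/out role above.
    void rnn_step_reference(const std::vector<float> &x,      // [input_size]
                            const std::vector<float> &W,      // [num_units * input_size]
                            const std::vector<float> &R,      // [num_units * num_units]
                            const std::vector<float> &bias,   // [num_units]
                            std::vector<float> &hidden_state, // [num_units]
                            std::vector<float> &out)          // [num_units], pre-sized
    {
      const std::size_t num_units = bias.size();
      const std::size_t input_size = x.size();
      for (std::size_t u = 0; u < num_units; ++u)
      {
        float acc = bias[u];
        for (std::size_t i = 0; i < input_size; ++i)
          acc += W[u * input_size + i] * x[i];
        for (std::size_t j = 0; j < num_units; ++j)
          acc += R[u * num_units + j] * hidden_state[j];
        out[u] = std::tanh(acc);
      }
      hidden_state = out;
    }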
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
+deleted file mode 100644
+index 03ac457..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
++++ /dev/null
+@@ -1,99 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+-#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+-
+-#include "arm_compute/runtime/IFunction.h"
+-
+-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/runtime/MemoryGroup.h"
+-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Basic function to perform reduce operation */
+-class NEReduceMeanEx : public IFunction
+-{
+-public:
+- /** Constructor */
+- NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+- /** Configure kernel
+- *
+- * @note Supported tensor rank: up to 4
+- *
+- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+- * @param[in] reduction_axis Reduction axis vector.
+- * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+- * @param[out] output Destination tensor. Data type supported: Same as @p input
+- */
+- void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+- ITensor *output);
+-
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NEReduceMeanEx
+- *
+- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+- * @param[in] reduction_axis Reduction axis vector.
+- * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+- * @param[in] output Destination tensor. Data type supported: Same as @p input
+- *
+- * @return A status
+- */
+- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+- bool keep_dims, const ITensorInfo *output);
+-
+- // Inherited methods overridden:
+- void run() override;
+-
+-private:
+- MemoryGroup _memory_group;
+- std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr};
+- std::unique_ptr<Tensor[]> _reduced_outs{nullptr};
+- NEReshapeLayer _reshape;
+- unsigned int _reduction_ops;
+- bool _keep_dims;
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */
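
A small sketch of driving the removed NEReduceMeanEx with the configure signature shown above; the axes and keep_dims choice are placeholders, and tensor creation/allocation is assumed to happen elsewhere.

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"

    using namespace arm_compute;

    // Mean over axes 1 and 2 with keep_dims = true, so the reduced dimensions stay as length 1.
    void run_reduce_mean_example(ITensor *input, ITensor *output)
    {
      NEReduceMeanEx reduce;
      reduce.configure(input, Coordinates(1, 2), /* keep_dims */ true, output);
      reduce.run();
    }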
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
+deleted file mode 100644
+index 3b695fb..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
++++ /dev/null
+@@ -1,136 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
+-#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
+-
+-#include "arm_compute/runtime/IFunction.h"
+-
+-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
+-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+-#include "arm_compute/core/Types.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** Basic function to spatial divide a tensor. This function calls the following NEON
+- * kernels/functions:
+- *
+- * -# @ref NEMemsetKernel
+- * -# @ref NESpaceToBatchLayerKernel
+- */
+-class NESpaceToBatchLayerEx : public IFunction
+-{
+-public:
+- /** Default constructor */
+- NESpaceToBatchLayerEx();
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete;
+- /** Prevent instances of this class from being copied (As this class contains pointers) */
+- NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete;
+- /** Allow instances of this class to be moved */
+- NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default;
+- /** Allow instances of this class to be moved */
+- NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default;
+- /** Default destructor */
+- virtual ~NESpaceToBatchLayerEx() = default;
+- /** Set the input and output tensors.
+- *
+- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
+- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
+- * @param[out] output Tensor output. Data types supported: same as @p input
+- */
+- void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings,
+- ITensor *output);
+- /** Set the input and output tensors. (Static block shape and paddings)
+- *
+- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] block_shape_x Block shape x value.
+- * @param[in] block_shape_y Block shape y value.
+- * @param[in] padding_left The left padding of the output tensor.
+- * @param[in] padding_right The right padding of the output tensor.
+- * @param[out] output Tensor output. Data types supported: same as @p input
+- */
+- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y,
+- const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NESpaceToBatchLayerEx
+- *
+- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32
+- * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32
+- * @param[in] output Tensor output info. Data types supported: same as @p input
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape,
+- const ITensorInfo *paddings, const ITensorInfo *output);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NESpaceToBatchLayerEx (Static block shape and paddings)
+- *
+- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] block_shape_x Block shape x value.
+- * @param[in] block_shape_y Block shape y value.
+- * @param[in] padding_left The left padding of the output tensor.
+- * @param[in] padding_right The right padding of the output tensor.
+- * @param[in] output Tensor output info. Data types supported: same as @p input
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y,
+- const Size2D &padding_left, const Size2D &padding_right,
+- const ITensorInfo *output);
+-
+- // Inherited methods overridden:
+- void run() override;
+-
+-private:
+- NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
+- NEMemsetKernel _memset_kernel; /**< Memset kernel to run */
+- bool _has_padding; /**< Flag to check if the output has padding */
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */
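
Of the two configure overloads documented above, the static one takes the block shape and paddings as plain values rather than S32 tensors. A minimal sketch of that overload, with placeholder values and tensor setup assumed to be done by the caller:

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Size2D.h"
    #include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"

    using namespace arm_compute;

    // block_shape_x = block_shape_y = 2 with no extra padding on either edge.
    void run_space_to_batch_static(ITensor *input, ITensor *output)
    {
      NESpaceToBatchLayerEx s2b;
      s2b.configure(input, 2, 2, Size2D(0, 0) /* padding_left */, Size2D(0, 0) /* padding_right */,
                    output);
      s2b.run();
    }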
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
+deleted file mode 100644
+index 9f32616..0000000
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
++++ /dev/null
+@@ -1,79 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
+-#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
+-
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+-
+-namespace arm_compute
+-{
+-class ITensor;
+-
+-/** This function calls the following NEON kernels/functions:
+- *
+- * -# @ref NESpaceToDepthLayerKernelEx
+- */
+-class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder
+-{
+-public:
+- /** Set the input and output tensors.
+- *
+- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[out] output Tensor output. Data types supported: same as @p input
+- * @param[in] block_shape Block shape value
+- */
+- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+- /** Static function to check if given info will lead to a valid configuration of @ref
+- * NESpaceToDepthLayerEx (Static block shape and paddings)
+- *
+- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+- * @param[in] output Tensor output info. Data types supported: same as @p input
+- * @param[in] block_shape Block shape value
+- *
+- * @return a status
+- */
+- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+-};
+-} // namespace arm_compute
+-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */
+diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+index 408d150..24ff5da 100644
+--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+@@ -15,7 +15,7 @@
+ */
+
+ /*
+- * Copyright (c) 2017-2019 ARM Limited.
++ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+@@ -37,16 +37,14 @@
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+-
+ #ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+ #define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
++#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
+ #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+ #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
++#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+ #include "arm_compute/core/Types.h"
+ #include "arm_compute/runtime/IFunction.h"
+ #include "arm_compute/runtime/IMemoryManager.h"
+@@ -59,8 +57,8 @@ namespace arm_compute
+ {
+ /** Function to run the deconvolution layer.
+ *
+- * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
+- * input depending on the stride and pad info and then perfrom a 1x1
++ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
++ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user
+ * specified value where a < stride - 1 that increases the padding top and right of the input image.
+@@ -81,21 +79,22 @@ namespace arm_compute
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y is the input stride of the first and second dimension.
+ *
+- * The weights used by Transpose convolution are supposed to be the same as the ones used for
+- * Convolution. Therefore, it will be necessary to use the weights in the
+- * reverse order to perform an actual convolution. This is achieved by using the @ref
+- * CPPFlipWeightsKernel.
++ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
++ * Therefore, it will be necessary to use the weights in the
++ * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+- * -# @ref CPPUpsample
++ * -# @ref CPPUpsampleEx
+ * -# @ref NEConvolutionLayer
++ * -# @ref NEPermute
++ * -# @ref NEReverse
+ *
+ */
+ class NETransposeConvLayer : public IFunction
+ {
+ public:
+- /** Default constructor */
++ /** Constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+@@ -112,37 +111,38 @@ public:
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
++ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+- * supported: Same as @p input.
++ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
+- * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
++ * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
++ * for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+- * input.
++ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+- * decribed in @ref PadStrideInfo.
+- * @param[in] invalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
++ * described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+- * NETransposeConvLayer
++ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
++ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+- * supported: Same as @p input.
++ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types
+- * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
++ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+- * input.
++ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+- * decribed in @ref PadStrideInfo.
+- * @param[in] innvalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
++ * described in @ref PadStrideInfo.
++ * @param[in] innvalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ *
+ * @return a status
+ */
+@@ -158,17 +158,11 @@ public:
+ private:
+ MemoryGroup _memory_group;
+ NEConvolutionLayer _conv_f;
+- CPPUpsampleEx _upsample_f;
+- CPPFlipWeightsKernel _flip_weights;
+- NEPermute _permute_input;
+- NEPermute _permute_weights;
+- NEPermute _permute_output;
++ CPPUpsample _upsample_f;
++ NEReverse _flip_weights;
+ Tensor _scaled_output;
+ Tensor _weights_flipped;
+- Tensor _permuted_input;
+- Tensor _permuted_weights;
+- Tensor _permuted_output;
+- bool _is_nchw;
++ Tensor _flip_axis;
+ const ITensor *_original_weights;
+ ITensor *_input;
+ PadStrideInfo _info;
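
The configure() documented in the hunks above takes the input/weights/bias/output tensors plus a PadStrideInfo and the invalid_right / invalid_bottom trim amounts. A minimal call sketch follows; the stride of 2 and zero padding are placeholders, tensor preparation is assumed to be done by the caller, and the internal CPPUpsample / NEReverse steps run behind configure()/run().

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"

    using namespace arm_compute;

    void run_transpose_conv_example(ITensor *input, const ITensor *weights,
                                    const ITensor *bias, ITensor *output)
    {
      NETransposeConvLayer deconv;
      const PadStrideInfo info(2, 2, 0, 0); // stride_x, stride_y, pad_x, pad_y
      if (bool(NETransposeConvLayer::validate(input->info(), weights->info(),
                                              bias != nullptr ? bias->info() : nullptr,
                                              output->info(), info, 0U, 0U)))
      {
        deconv.configure(input, weights, bias, output, info, 0U, 0U);
        deconv.run();
      }
    }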
+diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+index 7b6b974..ba42a24 100644
+--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
++++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+@@ -55,16 +55,7 @@ using namespace arm_compute;
+
+ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
+ // ARMComputeEx kernels
+- {"arg_op", "arg_operation.cl"},
+- {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
+ {"binary_logical_op", "binary_logical_op.cl"},
+- {"cast", "cast.cl"},
+- {"cast_qasymm_in", "cast.cl"},
+- {"cast_qasymm_out", "cast.cl"},
+- {"comparison_op", "comparison_op.cl"},
+- {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
+- {"depth_to_space_nchw", "depth_to_space.cl"},
+- {"depth_to_space_nhwc", "depth_to_space.cl"},
+ {"embedding_lookup", "embedding_lookup.cl"},
+ {"gather_ex", "gather_ex.cl"},
+ {"gather_ex_1d", "gather_ex.cl"},
+@@ -74,10 +65,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
+ {"instance_normalization_ex", "instance_normalization_ex.cl"},
+ {"multiply_scale_factor", "multiply_scale_factor.cl"},
+ {"neg_tensor", "neg_tensor.cl"},
+- {"permute_generic", "permute_ex.cl"},
+- {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
+- {"prelu", "prelu.cl"},
+- {"prelu_qasymm8", "prelu_quantized.cl"},
+ {"quantization_symm8", "quantization_symm8.cl"},
+ {"reduce_min_max", "reduce_operation.cl"},
+ {"reduce_sum_mean", "reduce_operation.cl"},
+@@ -91,29 +78,15 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
+ {"radixsort_reorder", "topkv2_radixsort.cl"},
+ {"topkv2_quicksort", "topkv2_quicksort.cl"},
+ {"scale_factor_symm8", "scale_factor.cl"},
+- {"space_to_depth_nchw", "space_to_depth.cl"},
+- {"space_to_depth_nhwc", "space_to_depth.cl"},
+ };
+
+ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
+ #ifdef EMBEDDED_KERNELS
+ {
+- "arg_operation.cl",
+-#include "./cl_kernels/arg_operation.clembed"
+- },
+- {
+- "cast.cl",
+-#include "./cl_kernels/cast.clembed"
+- },
+- {
+ "embedding_lookup.cl",
+ #include "./cl_kernels/embedding_lookup.clembed"
+ },
+ {
+- "depth_to_space.cl",
+-#include "./cl_kernels/depth_to_space.clembed"
+- },
+- {
+ "gather_ex.cl",
+ #include "./cl_kernels/gather_ex.clembed"
+ },
+@@ -150,14 +123,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
+ #include "./cl_kernels/neg_tensor.clembed"
+ },
+ {
+- "prelu.cl",
+-#include "./cl_kernels/prelu.clembed"
+- },
+- {
+- "prelu_quantized.cl",
+-#include "./cl_kernels/prelu_quantized.clembed"
+- },
+- {
+ "quantization_symm8.cl",
+ #include "./cl_kernels/quantization_symm8.clembed"
+ },
+@@ -170,10 +135,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
+ #include "./cl_kernels/scale_factor.clembed"
+ },
+ {
+- "space_to_depth.cl",
+-#include "./cl_kernels/space_to_depth.clembed"
+- },
+- {
+ "topkv2.cl",
+ #include "./cl_kernels/topkv2.clembed"
+ },
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+deleted file mode 100644
+index 03717cf..0000000
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
++++ /dev/null
+@@ -1,137 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "helpers.h"
+-
+-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+-/** Perform arg_max/arg_min
+- *
+- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
+- * e.g. -DDATA_TYPE=short
+- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+- * e.g. -DDEPTH_OUT=16
+- * @attention Operation type(code) specifying which operation to perform should be passed as
+- * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types:
+- * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension
+- * (in bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension
+- * (in bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension
+- * (in bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element
+- * in the source image
+- * @param[in] input_stride_w Stride of the source tensor in W dimension
+- * (in bytes)
+- * @param[in] input_step_w output_stride_w * number of elements along W
+- * processed per workitem(in bytes)
+- * @param[out] output_ptr Pointer to the destination image.
+- * Supported data types: U32
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the source tensor in Z dimension
+- * (in bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_w Stride of the source tensor in W dimension
+- * (in bytes)
+- * @param[in] output_step_w output_stride_w * number of elements along W
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- * @param[in] axis Axis through which reduction occurs
+- * @param[in] dim Dimension across the axis to be reduced.
+- */
+-
+-__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis,
+- const int dim)
+-{
+- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+-
+- int indices[4] = {
+- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+- get_global_id(2) / DEPTH_OUT,
+- };
+-
+- DATA_TYPE value =
+- *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+- DATA_TYPE tval = value;
+- int idx = 0;
+- for (int i = 1; i < dim; ++i)
+- {
+- indices[axis] = i;
+-
+-#if OP_CODE == 1 // ArgMax
+- value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+- indices[2], indices[3])));
+-#elif OP_CODE == 2 // ArgMin
+- value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+- indices[2], indices[3])));
+-#else
+- return;
+-
+-#endif
+-
+- if (tval != value)
+- {
+- idx = indices[axis];
+- tval = value;
+- }
+- }
+-
+- *((__global uint *)out.ptr) = idx;
+-}
+-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
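
The deleted arg_op kernel walks the chosen axis, keeps a running max (OP_CODE == 1) or min (OP_CODE == 2), and writes out the index at which that running value last changed. Below is a scalar C++ restatement of that inner loop over one 1-D slice, with the strided addressing reduced to a base pointer and an element stride (an illustration, not the OpenCL work-item layout):

    #include <cstdint>

    // base points at element 0 of the slice along the reduced axis, stride is the
    // distance (in elements) between consecutive entries along that axis, and dim
    // is the axis length. Mirrors the OP_CODE == 1 (ArgMax) / == 2 (ArgMin) branches.
    template <typename T>
    uint32_t arg_op_reference(const T *base, int dim, int stride, bool arg_max)
    {
      T best = base[0];
      uint32_t idx = 0;
      for (int i = 1; i < dim; ++i)
      {
        const T candidate = base[i * stride];
        const T value = arg_max ? (candidate > best ? candidate : best)
                                : (candidate < best ? candidate : best);
        if (value != best) // running extreme changed, so remember this position
        {
          best = value;
          idx = static_cast<uint32_t>(i);
        }
      }
      return idx;
    }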
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+deleted file mode 100644
+index f74c1c1..0000000
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
++++ /dev/null
+@@ -1,191 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016, 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "helpers_asymm.h"
+-
+-#ifdef SATURATE
+-#define ADD(x, y) add_sat((x), (y))
+-#define SUB(x, y) sub_sat((x), (y))
+-#else /* SATURATE */
+-#define ADD(x, y) (x) + (y)
+-#define SUB(x, y) (x) - (y)
+-#endif /* SATURATE */
+-
+-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to
+- * QASYMM8
+- *
+- * The following computations will be performed:
+- *
+- * -# Add offset terms to inputs
+- -# Get scaled value of two inputs
+- * -# Add inputs
+- * -# Add offset terms to final result
+- * -# Multiply each entry of result by result_mult_int
+- * -# Shift the int32 accumulator by result_shift
+- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+- *
+- * @attention The inputs and output data types need to be passed at compile time using
+- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+- * @attention The number of bits to shift left of input tensors must be passed at compile time using
+- * -DLEFT_SHIFT
+- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
+- * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT,
+- -DIN2_OFFSET,
+- * -RIN2_MULT_INT and -DIN2_SHIFT
+- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+- * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and
+- -DRESULT_SHIFT
+- *
+- * @attention The input and output data_types need to be passed at compile time using
+- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time
+- * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
+- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
+- * @attention The inputs and output scale offset need to be passed at compile time using
+- * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
+- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
+- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+- * -DVEC_SIZE=16
+- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise
+- * wrapping policy will be used.
+- *
+- * @param[in] in1_ptr Pointer to the source tensor.
+- * Supported data types: QASYMM8
+- * @param[in] in1_stride_x Stride of the source tensor in X dimension
+- * (in bytes)
+- * @param[in] in1_step_x in1_stride_x * number of elements along X processed
+- * per workitem(in bytes)
+- * @param[in] in1_stride_y Stride of the source tensor in Y dimension
+- * (in bytes)
+- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
+- * per workitem(in bytes)
+- * @param[in] in1_stride_z Stride of the source tensor in Z dimension
+- * (in bytes)
+- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
+- * per workitem(in bytes)
+- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source
+- * tensor
+- * @param[in] in2_ptr Pointer to the source tensor. Supported data types:
+- * QASYMM8
+- * @param[in] in2_stride_x Stride of the source tensor in X dimension
+- * (in bytes)
+- * @param[in] in2_step_x in2_stride_x * number of elements along X processed
+- * per workitem(in bytes)
+- * @param[in] in2_stride_y Stride of the source tensor in Y dimension
+- * (in bytes)
+- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
+- * per workitem(in bytes)
+- * @param[in] in2_stride_z Stride of the source tensor in Z dimension
+- * (in bytes)
+- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
+- * per workitem(in bytes)
+- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source
+- * tensor
+- * @param[out] out_ptr Pointer to the destination tensor.
+- * Supported data types: QASYMM8
+- * @param[in] out_stride_x Stride of the destination tensor in X dimension
+- * (in bytes)
+- * @param[in] out_step_x out_stride_x * number of elements along X processed
+- * per workitem(in bytes)
+- * @param[in] out_stride_y Stride of the destination tensor in Y dimension
+- * (in bytes)
+- * @param[in] out_step_y out_stride_y * number of elements along Y processed
+- * per workitem(in bytes)
+- * @param[in] out_stride_z Stride of the destination tensor in Z dimension
+- * (in bytes)
+- * @param[in] out_step_z out_stride_z * number of elements along Z processed
+- * per workitem(in bytes)
+- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+- * tensor
+- */
+-__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
+- TENSOR3D_DECLARATION(out))
+-{
+- // Get pixels pointer
+- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+-
+- // Load data
+- VEC_DATA_TYPE(int, 16)
+- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+- VEC_DATA_TYPE(int, 16)
+- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+-
+- // Get scaled value of two inputs
+- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+-
+- VEC_DATA_TYPE(int, 16)
+- left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
+- VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
+- VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
+-
+- VEC_DATA_TYPE(int, 16)
+- scaled_in1_val =
+- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
+- VEC_DATA_TYPE(int, 16)
+- scaled_in2_val =
+- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
+-
+- // Add inputs and multiply with a multiplier smaller than 1
+- VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
+- VEC_DATA_TYPE(int, 16)
+- out_val =
+- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+-
+- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+-
+- // TODO: Apply min-max BOUND to support fuse with relu.
+- /*
+- #if defined(MIN_BOUND)
+- res = max(res, (uchar16)MIN_BOUND);
+- #endif // defined(MIN_BOUND)
+- #if defined(MAX_BOUND)
+- res = min(res, (uchar16)MAX_BOUND);
+- #endif // defined(MAX_BOUND)
+- */
+-
+- // Store result
+- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+-}
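
The per-element math the deleted arithmetic_add_qasymm8 kernel vectorizes is easier to follow in scalar form. Below is a minimal plain-C sketch of the same pipeline (add input offsets, left-shift, rescale each input, add, rescale the sum, add the result offset, clamp); all multiplier/shift/offset values are illustrative placeholders, not values taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Rounded high 32 bits of 2*a*b (scalar form of ASYMM_MULT; the
 * INT32_MIN * INT32_MIN saturation case is omitted for brevity). */
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    int64_t ab    = (int64_t)a * (int64_t)b;
    int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
    return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Divide by 2^exponent, rounding to nearest. */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    int32_t mask      = (1 << exponent) - 1;
    int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
}

static uint8_t qasymm8_add(uint8_t in1, uint8_t in2)
{
    /* Illustrative quantization parameters. */
    const int32_t in1_offset = -128, in2_offset = -128, result_offset = 128;
    const int32_t left_shift = 20;
    const int32_t in1_mult = 1 << 30, in2_mult = 1 << 30, result_mult = 1 << 30;
    const int32_t in1_shift = 0, in2_shift = 0, result_shift = 20;

    /* Add offset terms and apply the common left shift. */
    int32_t a = ((int32_t)in1 + in1_offset) * (1 << left_shift);
    int32_t b = ((int32_t)in2 + in2_offset) * (1 << left_shift);

    /* Rescale each input, add, then rescale the sum. */
    a = rounding_divide_by_pow2(rounding_doubling_high_mul(a, in1_mult), in1_shift);
    b = rounding_divide_by_pow2(rounding_doubling_high_mul(b, in2_mult), in2_shift);
    int32_t sum = rounding_divide_by_pow2(
        rounding_doubling_high_mul(a + b, result_mult), result_shift);

    /* Add the result offset, clamp to [0, 255] and cast to QASYMM8. */
    sum += result_offset;
    if (sum < 0)   sum = 0;
    if (sum > 255) sum = 255;
    return (uint8_t)sum;
}

int main(void)
{
    printf("%u\n", (unsigned)qasymm8_add(130, 140));
    return 0;
}
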
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+deleted file mode 100644
+index 4147a00..0000000
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
++++ /dev/null
+@@ -1,233 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "helpers.h"
+-
+-#ifndef SCALE
+-#define SCALE 1.0f
+-#endif
+-#ifndef OFFSET
+-#define OFFSET 0
+-#endif
+-#ifndef VEC_SIZE
+-#define VEC_SIZE 1
+-#endif
+-
+-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
+-/** Perform a cast operation on an input tensor.
+- *
+- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+- * -DVEC_SIZE=16
+- * @attention -DBOOL_INPUT : Whether type of input is bool.
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types: F16/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported data
+- * types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+- * bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+-{
+- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+-
+- VSTORE(VEC_SIZE)
+- (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+- 0, (__global DATA_TYPE_OUT *)output.ptr);
+- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+-#if defined(BOOL_INPUT)
+- VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE));
+- VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1);
+- res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+-#endif // defined(BOOL_INPUT)
+-
+- VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr);
+-}
+-
+-/** Perform a cast operation on an QASYMM8 input tensor.
+- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+- * @attention Offset and Scale of input should be given as a preprocessor argument using
+- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
+- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+- * -DVEC_SIZE=16
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types: F16/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported data
+- * types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+- * bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+-{
+- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+-
+- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+-
+- VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
+- VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
+-
+- VSTORE(VEC_SIZE)
+- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+- (__global DATA_TYPE_OUT *)output.ptr);
+-}
+-
+-/** Perform a cast operation on an QASYMM8 output tensor.
+- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+- * @attention Offset and Scale of output should be given as a preprocessor argument using
+- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
+- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+- * -DVEC_SIZE=16
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types: F16/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported data
+- * types: U8
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+- * bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+-{
+- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+-
+- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+-
+- VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
+- VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
+-
+- VSTORE(VEC_SIZE)
+- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+- (__global DATA_TYPE_OUT *)output.ptr);
+-}
+-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
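
The two quantized cast kernels above are the usual affine (de)quantization: cast_qasymm_in computes (q - OFFSET) * SCALE and cast_qasymm_out computes x / SCALE + OFFSET. A scalar plain-C sketch follows; OFFSET and SCALE are illustrative, and the explicit clamp/round in the requantize step is a simplification of the kernel's final CONVERT.

#include <stdint.h>
#include <stdio.h>

#define OFFSET 3
#define SCALE  0.5f

/* QASYMM8 -> float, as in cast_qasymm_in */
static float cast_qasymm_in(uint8_t q)
{
    return ((int32_t)q - OFFSET) * SCALE;
}

/* float -> QASYMM8, as in cast_qasymm_out */
static uint8_t cast_qasymm_out(float x)
{
    float q = x / SCALE + (float)OFFSET;
    if (q < 0.f)   q = 0.f;            /* keep the result in the uchar range */
    if (q > 255.f) q = 255.f;
    return (uint8_t)(q + 0.5f);        /* simple round to nearest */
}

int main(void)
{
    uint8_t q = 13;
    float   f = cast_qasymm_in(q);
    printf("dequantized %u -> %g, requantized -> %u\n",
           (unsigned)q, f, (unsigned)cast_qasymm_out(f));
    return 0;
}
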
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+deleted file mode 100644
+index 0285c95..0000000
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
++++ /dev/null
+@@ -1,185 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016, 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "helpers.h"
+-
+-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+-/** Perform depth to space rearrangement of tensor
+- *
+- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+- * e.g. -DDEPTH_OUT=16
+- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+- * using -DZ_OUT=size. e.g. -DZ_OUT=16
+- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+- * -DBLOCK_SIZE=1
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported data
+- * types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+- * bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+- * bytes)
+- * @param[in] output_step_w output_stride_w * number of elements along W
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+-{
+- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+-
+- int out_index[4] = {0};
+- int in_index[4] = {0};
+-
+- out_index[0] = get_global_id(0); // W
+- out_index[1] = get_global_id(1); // H
+- out_index[2] = get_global_id(2) % Z_OUT; // C
+- out_index[3] = get_global_id(2) / Z_OUT; // B
+-
+- in_index[0] = out_index[0] / BLOCK_SIZE;
+- in_index[1] = out_index[1] / BLOCK_SIZE;
+- in_index[2] = out_index[2] +
+- ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+- in_index[3] = out_index[3];
+-
+- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+-}
+-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+-
+-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+-/** Perform depth to space rearrangement of tensor (NHWC)
+- *
+- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+- * e.g. -DDEPTH_OUT=16
+- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+- * using -DZ_OUT=size. e.g. -DZ_OUT=16
+- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+- * -DBLOCK_SIZE=1
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported data
+- * types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+- * bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+- * bytes)
+- * @param[in] output_step_w output_stride_w * number of elements along W
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+-{
+- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+-
+- int out_index[4] = {0};
+- int in_index[4] = {0};
+-
+- out_index[0] = get_global_id(0); // C
+- out_index[1] = get_global_id(1); // W
+- out_index[2] = get_global_id(2) % Z_OUT; // H
+- out_index[3] = get_global_id(2) / Z_OUT; // B
+-
+- in_index[0] = out_index[0] +
+- ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
+- in_index[1] = out_index[1] / BLOCK_SIZE;
+- in_index[2] = out_index[2] / BLOCK_SIZE;
+- in_index[3] = out_index[3];
+-
+- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+-}
+-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
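
Both deleted kernels assign one output element per work item by mapping output coordinates back to the input. For the NCHW variant the mapping is in_c = out_c + ((out_y % BLOCK_SIZE) * BLOCK_SIZE + out_x % BLOCK_SIZE) * DEPTH_OUT with in_y = out_y / BLOCK_SIZE and in_x = out_x / BLOCK_SIZE. A plain-C reference of that mapping on a toy 4x1x1 input (shapes and data are illustrative):

#include <stdio.h>

#define B     2                 /* block size     */
#define C_IN  4                 /* input channels */
#define H_IN  1
#define W_IN  1
#define C_OUT (C_IN / (B * B))  /* = DEPTH_OUT    */
#define H_OUT (H_IN * B)
#define W_OUT (W_IN * B)

int main(void)
{
    float in[C_IN][H_IN][W_IN] = {{{0.f}}, {{1.f}}, {{2.f}}, {{3.f}}};
    float out[C_OUT][H_OUT][W_OUT];

    for (int c = 0; c < C_OUT; ++c)
        for (int y = 0; y < H_OUT; ++y)
            for (int x = 0; x < W_OUT; ++x)
                out[c][y][x] = in[c + ((y % B) * B + x % B) * C_OUT][y / B][x / B];

    for (int y = 0; y < H_OUT; ++y)
        for (int x = 0; x < W_OUT; ++x)
            printf("out[0][%d][%d] = %g\n", y, x, out[0][y][x]);  /* 0 1 / 2 3 */
    return 0;
}
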
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+index 2d0b6a2..e07a25e 100644
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
++++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+@@ -15,7 +15,7 @@
+ */
+
+ /*
+- * Copyright (c) 2016-2018 ARM Limited.
++ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+@@ -37,7 +37,6 @@
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+-
+ #ifndef ARM_COMPUTE_HELPER_H
+ #define ARM_COMPUTE_HELPER_H
+
+@@ -59,16 +58,219 @@
+ #pragma OPENCL EXTENSION cl_arm_printf : enable
+ #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
++#define GPU_ARCH_MIDGARD 0x100
++#define GPU_ARCH_BIFROST 0x200
++
++/** Concatenate two inputs.
++ *
++ * @param[in] a The first input to be concatenated
++ * @param[in] b The second input to be concatenated
++ *
++ * @return The concatenated output
++ */
++#define CONCAT(a, b) a##b
++
++/** Expand the given vector
++ *
++ * @param[in] x The vector to be expanded
++ *
++ * @return The expanded output
++ */
+ #define EXPAND(x) x
+
++/** Clamp the given value between an upper and lower bound.
++ *
++ * @param[in] x The value to be clamped
++ * @param[in] min_val The lower bound
++ * @param[in] max_val The upper bound
++ *
++ * @return The clamped value.
++ */
+ #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
++/** REVn reverses the given vector whose size is n.
++ * @name REVn
++ *
++ * @param[in] x The vector to be reversed
++ *
++ * @return The reversed vector
++ * @{
++ */
++#define REV1(x) ((x))
++#define REV2(x) ((x).s10)
++#define REV3(x) ((x).s210)
++#define REV4(x) ((x).s3210)
++#define REV8(x) ((x).s76543210)
++#define REV16(x) ((x).sFEDCBA9876543210)
++/** @} */ // end of group REVn
++
++/** Reverse the given vector.
++ * @name REVERSE
++ *
++ * @param[in] x The vector to be reversed
++ * @param[in] s The size of the vector
++ *
++ * @return The reversed vector
++ * @{
++ */
++#define REVERSE_STR(x, s) REV##s((x))
++#define REVERSE(x, s) REVERSE_STR(x, s)
++/** @} */ // end of group REVERSE
++
++/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
++ * @name ROTs_n
++ *
++ * @param[in] x The vector to be shifted
++ *
++ * @return The shifted vector
++ * @{
++ */
++#define ROT1_0(x) ((x))
++
++#define ROT2_0(x) ((x))
++#define ROT2_1(x) ((x).s10)
++
++#define ROT3_0(x) ((x))
++#define ROT3_1(x) ((x).s201)
++#define ROT3_2(x) ((x).s120)
++
++#define ROT4_0(x) ((x))
++#define ROT4_1(x) ((x).s3012)
++#define ROT4_2(x) ((x).s2301)
++#define ROT4_3(x) ((x).s1230)
++
++#define ROT8_0(x) ((x))
++#define ROT8_1(x) ((x).s70123456)
++#define ROT8_2(x) ((x).s67012345)
++#define ROT8_3(x) ((x).s56701234)
++#define ROT8_4(x) ((x).s45670123)
++#define ROT8_5(x) ((x).s34567012)
++#define ROT8_6(x) ((x).s23456701)
++#define ROT8_7(x) ((x).s12345670)
++
++#define ROT16_0(x) ((x))
++#define ROT16_1(x) ((x).sF0123456789ABCDE)
++#define ROT16_2(x) ((x).sEF0123456789ABCD)
++#define ROT16_3(x) ((x).sDEF0123456789ABC)
++#define ROT16_4(x) ((x).sCDEF0123456789AB)
++#define ROT16_5(x) ((x).sBCDEF0123456789A)
++#define ROT16_6(x) ((x).sABCDEF0123456789)
++#define ROT16_7(x) ((x).s9ABCDEF012345678)
++#define ROT16_8(x) ((x).s89ABCDEF01234567)
++#define ROT16_9(x) ((x).s789ABCDEF0123456)
++#define ROT16_10(x) ((x).s6789ABCDEF012345)
++#define ROT16_11(x) ((x).s56789ABCDEF01234)
++#define ROT16_12(x) ((x).s456789ABCDEF0123)
++#define ROT16_13(x) ((x).s3456789ABCDEF012)
++#define ROT16_14(x) ((x).s23456789ABCDEF01)
++#define ROT16_15(x) ((x).s123456789ABCDEF0)
++/** @} */ // end of group ROTs_n
++
++/** Circular-right-shift (rotate-right) the given vector by the given amount.
++ * @name ROTATE
++ *
++ * @param[in] x The vector to be shifted
++ * @param[in] s The size of the vector
++ * @param[in] n The amount to be shifted
++ *
++ * @return The shifted vector
++ * @{
++ */
++#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
++#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
++/** @} */ // end of group ROTATE
++
++/** Creates a vector of size n filled with offset values corresponding to the location of each
++ * element.
++ * @name V_OFFSn
++ *
++ * @param[in] dt The data type of the output vector
++ *
++ * @return The vector filled with offset values
++ * @{
++ */
++#define V_OFFS1(dt) (dt)(0)
++#define V_OFFS2(dt) (dt)(0, 1)
++#define V_OFFS3(dt) (dt)(0, 1, 2)
++#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
++#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
++#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
++/** @} */ // end of group V_OFFSn
++
++/** Create a vector filled with offset values corresponding to the location of each element.
++ * @name VEC_OFFS
++ *
++ * @param[in] dt The data type of the output vector
++ * @param[in] s The size of the output vector
++ *
++ * @return The vector filled with offset values
++ * @{
++ */
++#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
++#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
++/** @} */ // end of group VEC_OFFS
++
+ #define VLOAD_STR(size) vload##size
+ #define VLOAD(size) VLOAD_STR(size)
+
+ #define VSTORE_STR(size) vstore##size
+ #define VSTORE(size) VSTORE_STR(size)
+
++#define float1 float
++#define half1 half
++#define char1 char
++#define uchar1 uchar
++#define short1 short
++#define ushort1 ushort
++#define int1 int
++#define uint1 uint
++#define long1 long
++#define ulong1 ulong
++#define double1 double
++
++#define vload1(OFFSET, PTR) *(OFFSET + PTR)
++#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
++
++// Convert built-in functions with _sat modifier are not supported in floating point so we create
++// defines
++// without _sat to overcome this issue
++#define convert_float_sat convert_float
++#define convert_float1_sat convert_float
++#define convert_float2_sat convert_float2
++#define convert_float3_sat convert_float3
++#define convert_float4_sat convert_float4
++#define convert_float8_sat convert_float8
++#define convert_float16_sat convert_float16
++#define convert_half_sat convert_float
++#define convert_half1_sat convert_half
++#define convert_half2_sat convert_half2
++#define convert_half3_sat convert_half3
++#define convert_half4_sat convert_half4
++#define convert_half8_sat convert_half8
++#define convert_half16_sat convert_half16
++
++#define convert_float1 convert_float
++#define convert_half1 convert_half
++#define convert_char1 convert_char
++#define convert_uchar1 convert_uchar
++#define convert_short1 convert_short
++#define convert_ushort1 convert_ushort
++#define convert_int1 convert_int
++#define convert_uint1 convert_uint
++#define convert_long1 convert_long
++#define convert_ulong1 convert_ulong
++#define convert_double1 convert_double
++
++#define convert_char1_sat convert_char_sat
++#define convert_uchar1_sat convert_uchar_sat
++#define convert_short1_sat convert_short_sat
++#define convert_ushort1_sat convert_ushort_sat
++#define convert_int1_sat convert_int_sat
++#define convert_uint1_sat convert_uint_sat
++#define convert_long1_sat convert_long_sat
++#define convert_ulong1_sat convert_ulong_sat
++#define convert_double1_sat convert_double_sat
++
+ #define VEC_DATA_TYPE_STR(type, size) type##size
+ #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+index a83b1a8..5f1b3f9 100644
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
++++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+@@ -15,7 +15,7 @@
+ */
+
+ /*
+- * Copyright (c) 2017-2018 ARM Limited.
++ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+@@ -37,29 +37,112 @@
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+-
+ #ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+ #define ARM_COMPUTE_HELPERS_ASYMM_H
+
+ #include "helpers.h"
+
++/** Convert the given vector with round to nearest even rounding mode
++ *
++ * @param[in] x The target to be converted
++ * @param[in] type The target type
++ *
++ * @return The converted vector
++ */
++#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
++#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
++
++/** Quantize a floating-point scalar value to 8-bit asymmetric
++ *
++ * @param[in] input Input value to quantize
++ * @param[in] offset Quantization offset
++ * @param[in] scale Quantization scale
++ *
++ * @return quantized value
++ */
++inline uchar quantize_qasymm8(float input, float offset, float scale)
++{
++ float out_f32 = input / scale + offset;
++ uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
++ return res_u8;
++}
++
++/** Dequantize a scalar value from 8-bit asymmetric to floating-point
++ *
++ * @param[in] input Input value to dequantize
++ * @param[in] offset Quantization offset
++ * @param[in] scale Quantization scale
++ *
++ * @return dequantized value
++ */
++inline float dequantize_qasymm8(uchar input, float offset, float scale)
++{
++ return ((float)input - offset) * scale;
++}
++
++/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
++ *
++ * @param[in] input Input value to dequantize
++ * @param[in] offset Quantization offset
++ * @param[in] scale Quantization scale
++ *
++ * @return dequantized value
++ */
++inline float dequantize_qasymm8_signed(char input, float offset, float scale)
++{
++ return ((float)input - offset) * scale;
++}
++
++/** Quantize a vector of values from floating-point
++ *
++ * @param[in] type Output data type.
++ * @param[in] size Size of vector.
++ *
++ * @return quantized values
++ */
++#define QUANTIZE_IMPL(type, size) \
++ inline VEC_DATA_TYPE(type, size) \
++ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
++ { \
++ VEC_DATA_TYPE(float, size) \
++ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
++ VEC_DATA_TYPE(type, size) \
++ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
++ VEC_DATA_TYPE(type, size)); \
++ return res; \
++ }
++
++/** Dequantize a vector of values to floating-point
++ *
++ * @param[in] type Input data type.
++ * @param[in] size Size of vector.
++ *
++ * @return dequantized values in floating point
++ */
++#define DEQUANTIZE_IMPL(type, size) \
++ inline VEC_DATA_TYPE(float, size) \
++ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
++ { \
++ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
++ }
++
+ /** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+- inline VEC_DATA_TYPE(int, size) \
+- asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+- { \
+- VEC_DATA_TYPE(int, size) \
+- mask = (1 << exponent) - 1; \
+- const VEC_DATA_TYPE(int, size) zero = 0; \
+- const VEC_DATA_TYPE(int, size) one = 1; \
+- VEC_DATA_TYPE(int, size) \
+- threshold = (mask >> 1) + select(zero, one, x < 0); \
+- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
++#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
++ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
++ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
++ { \
++ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
++ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
++ VEC_DATA_TYPE(int, size) \
++ mask = (one << exponent) - one; \
++ VEC_DATA_TYPE(int, size) \
++ threshold = (mask >> 1) + select(zero, one, x < 0); \
++ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+ }
+
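
The rewritten macro takes a vector exponent, but the per-lane math is unchanged: divide by 2^exponent rounding to nearest, which differs from a plain arithmetic shift for negative inputs. A scalar C model of that behaviour, for illustration only:

#include <stdint.h>
#include <stdio.h>

static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    int32_t mask      = (1 << exponent) - 1;
    int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
}

int main(void)
{
    /* -9 / 4 = -2.25: nearest is -2, while -9 >> 2 floors to -3 on the
     * usual two's-complement, arithmetic-shift targets. */
    printf("rounded: %d  shifted: %d\n", rounding_divide_by_pow2(-9, 2), -9 >> 2);
    return 0;
}
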
+ /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+@@ -81,9 +164,19 @@
+ b_64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ ab_64 = a_64 * b_64; \
+- /* COMPMID-907 */ \
++ /* Revert COMPMID-907 */ \
++ VEC_DATA_TYPE(long, size) \
++ mask1 = 1 << 30; \
++ VEC_DATA_TYPE(long, size) \
++ mask2 = 1 - (1 << 30); \
++ VEC_DATA_TYPE(long, size) \
++ is_positive_or_zero = ab_64 >= 0; \
++ VEC_DATA_TYPE(long, size) \
++ nudge = select(mask2, mask1, is_positive_or_zero); \
++ VEC_DATA_TYPE(long, size) \
++ mask = 1ll << 31; \
+ VEC_DATA_TYPE(int, size) \
+- ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
++ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
+ return select(ab_x2_high32, INT_MAX, overflow); \
+ }
+
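
In scalar terms the macro computes the rounded high 32 bits of 2*a*b, i.e. the product of two Q0.31 fixed-point values, with the single overflowing input pair saturated. A plain-C sketch for illustration:

#include <stdint.h>
#include <stdio.h>

static int32_t asymm_mult(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX;                          /* saturate the overflow case */
    int64_t ab    = (int64_t)a * (int64_t)b;
    int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
    return (int32_t)((ab + nudge) / (1ll << 31));  /* round to nearest */
}

int main(void)
{
    int32_t half = 1 << 30;                        /* 0.5 in Q0.31        */
    printf("%d (expected %d)\n", asymm_mult(half, half), 1 << 29);  /* 0.25 */
    return 0;
}
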
+@@ -335,9 +428,18 @@
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+ }
+
++#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
++#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
++#define DEQUANTIZE_STR(input, offset, scale, type, size) \
++ dequantize_##type##size(input, offset, scale)
++#define DEQUANTIZE(input, offset, scale, type, size) \
++ DEQUANTIZE_STR(input, offset, scale, type, size)
++
+ #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+ asymm_rounding_divide_by_POW2_##size(x, exponent)
+ #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
++#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
++ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+ #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+ #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+@@ -360,11 +462,53 @@
+ #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
++#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
++ inline VEC_DATA_TYPE(int, size) \
++ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
++ { \
++ const int left_shift = shift > 0 ? shift : 0; \
++ const int right_shift = shift > 0 ? 0 : -shift; \
++ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
++ right_shift, size); \
++ }
++#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
++ multiply_by_quantized_multiplier##size(input, qmul, shift)
++
++QUANTIZE_IMPL(uchar, 1)
++QUANTIZE_IMPL(char, 1)
++QUANTIZE_IMPL(uint, 1)
++QUANTIZE_IMPL(int, 1)
++QUANTIZE_IMPL(uchar, 4)
++QUANTIZE_IMPL(ushort, 4)
++QUANTIZE_IMPL(short, 4)
++QUANTIZE_IMPL(uchar, 16)
++QUANTIZE_IMPL(char, 16)
++QUANTIZE_IMPL(ushort, 16)
++QUANTIZE_IMPL(short, 16)
++QUANTIZE_IMPL(uint, 16)
++QUANTIZE_IMPL(int, 16)
++
++DEQUANTIZE_IMPL(uchar, 1)
++DEQUANTIZE_IMPL(char, 1)
++DEQUANTIZE_IMPL(uint, 1)
++DEQUANTIZE_IMPL(int, 1)
++DEQUANTIZE_IMPL(uchar, 4)
++DEQUANTIZE_IMPL(ushort, 4)
++DEQUANTIZE_IMPL(short, 4)
++DEQUANTIZE_IMPL(uchar, 16)
++DEQUANTIZE_IMPL(char, 16)
++DEQUANTIZE_IMPL(ushort, 16)
++DEQUANTIZE_IMPL(short, 16)
++DEQUANTIZE_IMPL(uint, 16)
++DEQUANTIZE_IMPL(int, 16)
++
++ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
++ASYMM_MULT_IMPL(1)
+ ASYMM_MULT_IMPL(2)
+ ASYMM_MULT_IMPL(4)
+ ASYMM_MULT_IMPL(8)
+@@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
++ASYMM_SELECT_USING_MASK_IMPL(1)
+ ASYMM_SELECT_USING_MASK_IMPL(2)
+ ASYMM_SELECT_USING_MASK_IMPL(4)
+ ASYMM_SELECT_USING_MASK_IMPL(8)
+ ASYMM_SELECT_USING_MASK_IMPL(16)
+
++ASYMM_MASK_IF_ZERO_IMPL(1)
+ ASYMM_MASK_IF_ZERO_IMPL(2)
+ ASYMM_MASK_IF_ZERO_IMPL(4)
+ ASYMM_MASK_IF_ZERO_IMPL(8)
+ ASYMM_MASK_IF_ZERO_IMPL(16)
+
++ASYMM_MASK_IF_NON_ZERO_IMPL(1)
+ ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+@@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
++ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
+ ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+@@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
++ASYMM_RESCALE_IMPL(1)
+ ASYMM_RESCALE_IMPL(2)
+ ASYMM_RESCALE_IMPL(4)
+ ASYMM_RESCALE_IMPL(8)
+ ASYMM_RESCALE_IMPL(16)
+
++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
++
+ #endif // ARM_COMPUTE_HELPERS_ASYMM_H
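
The new scalar quantize_qasymm8 / dequantize_qasymm8 helpers are the standard affine mapping: q = rte(x / scale + offset) saturated to uchar, and x = (q - offset) * scale. A host-side C sketch of the round trip; offset and scale are illustrative values.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t quantize_qasymm8(float x, float offset, float scale)
{
    float q = x / scale + offset;
    long  r = lrintf(q);          /* nearest, ties to even under the default
                                   * rounding mode, like the _rte convert */
    if (r < 0)   r = 0;           /* saturate to the uchar range */
    if (r > 255) r = 255;
    return (uint8_t)r;
}

static float dequantize_qasymm8(uint8_t q, float offset, float scale)
{
    return ((float)q - offset) * scale;
}

int main(void)
{
    const float offset = 128.f, scale = 0.05f;
    float   x = 1.37f;
    uint8_t q = quantize_qasymm8(x, offset, scale);
    printf("x=%g  q=%u  x'=%g\n", x, (unsigned)q, dequantize_qasymm8(q, offset, scale));
    return 0;
}
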
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+deleted file mode 100644
+index 12c8eeb..0000000
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
++++ /dev/null
+@@ -1,120 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "helpers.h"
+-
+-#ifndef VEC_SIZE
+-#define VEC_SIZE 1
+-#endif
+-
+-#if defined(DATA_TYPE)
+-/** Returns result of prelu function implemented as below:
+- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+- *
+- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+- * -DVEC_SIZE=16
+- * @note Can only take floating point data types.
+- *
+- * @param[in] input1_ptr Pointer to the source image. Supported Data
+- * types : F16/F32
+- * @param[in] input1_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input1_step_x input1_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input1_step_y input1_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input1_step_z input1_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[in] alpha_ptr Pointer to the source image. Supported Data
+- * types : F16/F32
+- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] alpha_step_x alpha_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- *
+- * @param[out] output_ptr Pointer to the destination image. Supported
+- * data types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+- * bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+- TENSOR3D_DECLARATION(output))
+-{
+- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+-
+- VSTORE(VEC_SIZE)
+- (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
+- ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) *
+- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr)
+- : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
+- 0, (__global DATA_TYPE *)output.ptr);
+-}
+-#endif // defined(DATA_TYPE)
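
For reference, the element-wise rule the deleted kernel applies is simply f(x) = alpha * x for x < 0 and f(x) = x otherwise; a scalar C sketch with an illustrative alpha:

#include <stdio.h>

static float prelu(float x, float alpha)
{
    return x < 0.f ? alpha * x : x;
}

int main(void)
{
    const float alpha = 0.25f;
    const float in[4] = { -2.f, -0.5f, 0.f, 3.f };
    for (int i = 0; i < 4; ++i)
        printf("prelu(%g) = %g\n", in[i], prelu(in[i], alpha));
    return 0;
}
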
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+deleted file mode 100644
+index a66e107..0000000
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
++++ /dev/null
+@@ -1,138 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "helpers.h"
+-#define SUB(x, y) (x) - (y)
+-
+-#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \
+- defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
+-
+-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+-#define SELECT_TYPE VEC_INT
+-
+-/** Returns result of prelu function implemented as below:
+- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+- *
+- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g.
+- * -DDATA_TYPE_IN=uchar
+- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+- * -DVEC_SIZE=16
+- * @note Can only take uchar data types.
+- *
+- * @param[in] input1_ptr Pointer to the source image. Supported Data
+- * types : QASYMM8
+- * @param[in] input1_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input1_step_x input1_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input1_step_y input1_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input1_step_z input1_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[in] alpha_ptr Pointer to the source image. Supported Data
+- * types : QASYMM8
+- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] alpha_step_x alpha_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported
+- * data types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+- * bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+- TENSOR3D_DECLARATION(output))
+-{
+- // Get pixels pointer
+- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+-
+- VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+- VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+-
+- in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN));
+- alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA));
+-
+- const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN);
+- const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA);
+- const VEC_FLOAT outf32 =
+- select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE));
+- const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+-
+- VSTORE(VEC_SIZE)
+- (res, 0, (__global uchar *)output.ptr);
+-}
+-
+-#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) &&
+- // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
+diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+deleted file mode 100644
+index eb612f8..0000000
+--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
++++ /dev/null
+@@ -1,185 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016, 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "helpers.h"
+-
+-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+-/** Perform space to depth rearrangement of tensor
+- *
+- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+- * e.g. -DDEPTH_IN=16
+- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+- * argument using -DZ_IN=size. e.g. -DZ_IN=16
+- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+- * -DBLOCK_SIZE=1
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported data
+- * types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension
+- *                                                  (in bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension
+- *                                                  (in bytes)
+- * @param[in] output_step_w output_stride_w * number of elements along W
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+-{
+- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+-
+- int out_index[4] = {0};
+- int in_index[4] = {0};
+-
+- in_index[0] = get_global_id(0); // W
+- in_index[1] = get_global_id(1); // H
+- in_index[2] = get_global_id(2) % Z_IN; // C
+- in_index[3] = get_global_id(2) / Z_IN; // B
+-
+- out_index[0] = in_index[0] / BLOCK_SIZE;
+- out_index[1] = in_index[1] / BLOCK_SIZE;
+- out_index[2] =
+- in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+- out_index[3] = in_index[3];
+-
+- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
+- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+-}
+-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+-
+-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+-/** Perform space to depth rearrangement of tensor
+- *
+- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+- * e.g. -DDEPTH_IN=16
+- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+- * argument using -DZ_IN=size. e.g. -DZ_IN=16
+- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+- * -DBLOCK_SIZE=1
+- *
+- * @param[in] input_ptr Pointer to the source image. Supported data
+- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+- * @param[in] input_stride_x Stride of the source image in X dimension (in
+- * bytes)
+- * @param[in] input_step_x input_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_y Stride of the source image in Y dimension (in
+- * bytes)
+- * @param[in] input_step_y input_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+- * bytes)
+- * @param[in] input_step_z input_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+- * image
+- * @param[out] output_ptr Pointer to the destination image. Supported data
+- * types: same as @p input_ptr
+- * @param[in] output_stride_x Stride of the destination image in X dimension
+- * (in bytes)
+- * @param[in] output_step_x output_stride_x * number of elements along X
+- * processed per workitem(in bytes)
+- * @param[in] output_stride_y Stride of the destination image in Y dimension
+- * (in bytes)
+- * @param[in] output_step_y output_stride_y * number of elements along Y
+- * processed per workitem(in bytes)
+- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension
+- *                                                  (in bytes)
+- * @param[in] output_step_z output_stride_z * number of elements along Z
+- * processed per workitem(in bytes)
+- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension
+- *                                                  (in bytes)
+- * @param[in] output_step_w output_stride_w * number of elements along W
+- * processed per workitem(in bytes)
+- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+- * destination image
+- */
+-__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+-{
+- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+-
+- int out_index[4] = {0};
+- int in_index[4] = {0};
+-
+- in_index[0] = get_global_id(0); // C
+- in_index[1] = get_global_id(1); // W
+- in_index[2] = get_global_id(2) % Z_IN; // H
+- in_index[3] = get_global_id(2) / Z_IN; // B
+-
+- out_index[0] =
+- in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
+- out_index[1] = in_index[1] / BLOCK_SIZE;
+- out_index[2] = in_index[2] / BLOCK_SIZE;
+- out_index[3] = in_index[3];
+-
+- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
+- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+-}
+-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
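
To make the index arithmetic above easier to follow, the sketch below performs the same NHWC space-to-depth rearrangement on a contiguous host buffer. The contiguous layout, dimension names, and loop order are assumptions made for illustration; the OpenCL kernel instead walks strided 4D tensors through its work-items.

#include <cstddef>
#include <vector>

// Reference NHWC space-to-depth: (N, H, W, C) -> (N, H/b, W/b, C*b*b), following the
// same index mapping as the removed space_to_depth_nhwc kernel (DEPTH_IN == C).
template <typename T>
std::vector<T> space_to_depth_nhwc_ref(const std::vector<T> &in, std::size_t N, std::size_t H,
                                       std::size_t W, std::size_t C, std::size_t b)
{
  const std::size_t Ho = H / b, Wo = W / b, Co = C * b * b;
  std::vector<T> out(in.size());
  for (std::size_t n = 0; n < N; ++n)
    for (std::size_t h = 0; h < H; ++h)
      for (std::size_t w = 0; w < W; ++w)
        for (std::size_t c = 0; c < C; ++c)
        {
          // The output channel packs the (h % b, w % b) block offset on top of the input
          // depth, exactly as out_index[0] is computed in the kernel.
          const std::size_t co = c + ((h % b) * b + (w % b)) * C;
          const std::size_t src = ((n * H + h) * W + w) * C + c;
          const std::size_t dst = ((n * Ho + h / b) * Wo + w / b) * Co + co;
          out[dst] = in[src];
        }
  return out;
}
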
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
+deleted file mode 100644
+index 06eeb5b..0000000
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
++++ /dev/null
+@@ -1,181 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+-
+-#include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-
+-using namespace arm_compute;
+-
+-namespace
+-{
+-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+-{
+- TensorShape out_shape{input_shape};
+-
+- out_shape.set(axis, 1);
+-
+- return out_shape;
+-}
+-} // namespace
+-
+-namespace
+-{
+-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+- ArgOperation /*op*/)
+-{
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
+- DataType::QASYMM8);
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
+- output->tensor_shape().num_dimensions(),
+-                                  "Output's rank must be one less than the input's rank");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+- "Inputs are not broadcast compatible");
+-
+- const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+- "output shape's size does not match axis");
+-
+- const auto num_dimensions = input->tensor_shape().num_dimensions();
+-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than the input's rank.");
+- return Status{};
+-}
+-
+-} // namespace
+-
+-CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+-
+-void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+- ArgOperation op)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+-
+- _input = input;
+- _output = output;
+- _axis = axis;
+-
+- std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+- output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+-
+- // Construct kernel and set op_code based on type of ArgOperation as specified by object op
+- std::string kernel_name = "arg_op";
+- int op_code = 0;
+- if (op == ArgOperation::MAX)
+- {
+- op_code = 1;
+- }
+- else if (op == ArgOperation::MIN)
+- {
+- op_code = 2;
+- }
+- else
+-    throw std::runtime_error("Operation not supported yet");
+-
+- // Set kernel build options
+- std::set<std::string> build_opts;
+- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+- build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+-
+- // Create kernel
+- _kernel =
+- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*output_info, Steps());
+-
+- Coordinates coord;
+- coord.set_num_dimensions(output_info->num_dimensions());
+- output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+-
+- ICLKernel::configure_internal(win);
+-}
+-
+-Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+- const uint32_t axis, ArgOperation op)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+-
+- return Status{};
+-}
+-
+-void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+-{
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+-
+- const TensorShape &shape_in = _input->info()->tensor_shape();
+-
+- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+-
+- _kernel.setArg<cl_int>(idx++, _axis);
+- _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+-
+- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+-
+- // Setup input slice
+- Window slice_in(slice_out);
+- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+- slice_in.set(3, Window::Dimension(0, 0, 0));
+-
+-  // Save the output tensor's shape so it can be restored at the end of this method
+- const TensorShape shape_out = _output->info()->tensor_shape();
+- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+-
+- do
+- {
+- unsigned int idx = 0;
+- add_4D_tensor_argument(idx, _input, slice_in);
+- add_4D_tensor_argument(idx, _output, slice_out);
+- enqueue(queue, *this, slice_out);
+- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+-
+-  // Restore the output tensor's original shape
+- _output->info()->set_tensor_shape(shape_out);
+-}
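
The removed wrapper only builds and enqueues the arg_op OpenCL kernel; the reduction it configures is an argmin/argmax along a single axis, with that axis collapsed to size 1 as inferOutputShape() shows. A plain C++ sketch of that per-slice semantics, written for illustration only (the stride parameter is a simplification of the kernel's windowed traversal):

#include <cstddef>
#include <cstdint>

// Argmax over one axis of length 'len' whose elements are 'stride' apart, producing the
// S32 index the kernel stores for each output element (op_code 1; argmin flips the compare).
static int32_t argmax_axis_ref(const float *data, std::size_t len, std::size_t stride)
{
  std::size_t best = 0;
  for (std::size_t i = 1; i < len; ++i)
    if (data[i * stride] > data[best * stride])
      best = i;
  return static_cast<int32_t>(best);
}
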
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+index bb55568..fbc76f5 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+@@ -43,6 +43,7 @@
+ #include "arm_compute/core/CL/CLHelpers.h"
+ #include "arm_compute/core/CL/CLKernelLibraryEx.h"
+ #include "arm_compute/core/CL/ICLTensor.h"
++#include "support/StringSupport.h"
+
+ using namespace arm_compute;
+
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+deleted file mode 100644
+index 01ea655..0000000
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
++++ /dev/null
+@@ -1,132 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+-
+-#include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-
+-using namespace arm_compute;
+-
+-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
+-
+-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+-
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+- DataType::S16, DataType::S32, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+- DataType::S16, DataType::S32, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+-
+- _input = input;
+- _output = output;
+-
+- constexpr unsigned int num_elems_processed_per_iteration = 16;
+-
+- // Set kernel build options
+- CLBuildOptions build_opts;
+- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+- build_opts.add_option("-DDATA_TYPE_OUT=" +
+- get_cl_type_from_data_type(output->info()->data_type()));
+- build_opts.add_option(
+- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+-
+- // Create kernel
+- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+- {
+- UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
+- const float scale_in = qinfo.scale;
+- const int offset_in = qinfo.offset;
+- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
+-
+- _kernel = static_cast<cl::Kernel>(
+- CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options()));
+- }
+- else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
+- {
+- UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
+-    const float scale_out = qinfo.scale;
+-    const int offset_out = qinfo.offset;
+-
+-    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_out));
+-    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_out));
+-
+- _kernel = static_cast<cl::Kernel>(
+- CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options()));
+- }
+- else
+- {
+- build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT");
+- _kernel = static_cast<cl::Kernel>(
+- CLKernelLibraryEx::get().create_kernel("cast", build_opts.options()));
+- }
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+- update_window_and_padding(win, input_access, output_access);
+- output_access.set_valid_region(win, input->info()->valid_region());
+-
+- ICLKernel::configure_internal(win);
+-}
+-
+-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
+-{
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+-
+- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+- Window slice = collapsed.first_slice_window_3D();
+-
+- do
+- {
+- unsigned int idx = 0;
+- add_3D_tensor_argument(idx, _input, slice);
+- add_3D_tensor_argument(idx, _output, slice);
+- enqueue(queue, *this, slice, lws_hint());
+- } while (collapsed.slide_window_slice_3D(slice));
+-}
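
CLCastKernel::configure() above selects one of three OpenCL kernels: cast_qasymm_in when the input is quantized, cast_qasymm_out when the output is quantized, and a plain cast otherwise. A host-side sketch of the two quantized directions follows; scale and offset stand in for the -DSCALE/-DOFFSET build options, and the rounding and saturation details are assumed rather than taken from the kernel source.

#include <algorithm>
#include <cmath>
#include <cstdint>

// "cast_qasymm_in" direction: dequantize a QASYMM8 value to float.
static float cast_qasymm_in_ref(uint8_t q, float scale, int offset)
{
  return (static_cast<int>(q) - offset) * scale;
}

// "cast_qasymm_out" direction: quantize a float to QASYMM8 with saturation.
static uint8_t cast_qasymm_out_ref(float x, float scale, int offset)
{
  const int q = static_cast<int>(std::round(x / scale)) + offset;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}
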
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+deleted file mode 100644
+index 3891368..0000000
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
++++ /dev/null
+@@ -1,140 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+-
+-#include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-
+-using namespace arm_compute;
+-
+-namespace
+-{
+-// TODO Use this validation function
+-#if 0
+-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+- const int32_t block_size)
+-{
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+- DataType::S16, DataType::S32, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+- DataType::S16, DataType::S32, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+- "Block size should be greater than or equal to 1.");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+- "Output width should be equal to (Input width * block size)");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+- "Output height should be equal to (Input height * block size)");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+- "Input depth should be divisible by (block size * block size)");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+- output->dimension(2) != input->dimension(2) / (block_size * block_size),
+- "Output depth should be equal to (Input depth / (block size * block size))");
+-
+- return Status{};
+-}
+-#endif
+-} // namespace
+-
+-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+-{
+- // DO NOTHING
+-}
+-
+-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+- const int32_t block_size)
+-{
+- // TODO Add validation of data_layout
+- _input = input;
+- _output = output;
+-
+- // Set kernel build options
+- auto layout_out = output->info()->data_layout();
+- std::set<std::string> build_opts;
+- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+- auto depth = output->info()->dimension(index_depth);
+- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
+- build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
+-
+- // Create kernel
+- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+- "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*output->info(), Steps());
+-
+- Coordinates coord;
+- coord.set_num_dimensions(output->info()->num_dimensions());
+- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+-
+- ICLKernel::configure_internal(win);
+-}
+-
+-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+-{
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+-
+- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+-
+- // Setup input slice
+- Window slice_in(slice_out);
+- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+- slice_in.set(3, Window::Dimension(0, 0, 0));
+-
+- do
+- {
+- unsigned int idx = 0;
+- add_4D_tensor_argument(idx, _input, slice_in);
+- add_4D_tensor_argument(idx, _output, slice_out);
+- enqueue(queue, *this, slice_out);
+- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+-}
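
The OpenCL source of depth_to_space_* is not part of this hunk; configure() above only shows the build options handed to it (DATA_TYPE, BLOCK_SIZE, DEPTH_OUT, Z_OUT). For orientation, here is a host-side NCHW sketch of the inverse of the space_to_depth_nchw mapping deleted earlier, assuming a contiguous NCHW buffer:

#include <cstddef>
#include <vector>

// Reference NCHW depth-to-space: (N, C, H, W) -> (N, C/(b*b), H*b, W*b). This inverts the
// space_to_depth_nchw index mapping above; the layout and loop order are illustrative only.
template <typename T>
std::vector<T> depth_to_space_nchw_ref(const std::vector<T> &in, std::size_t N, std::size_t C,
                                       std::size_t H, std::size_t W, std::size_t b)
{
  const std::size_t Co = C / (b * b), Ho = H * b, Wo = W * b;
  std::vector<T> out(in.size());
  for (std::size_t n = 0; n < N; ++n)
    for (std::size_t c = 0; c < C; ++c)
      for (std::size_t h = 0; h < H; ++h)
        for (std::size_t w = 0; w < W; ++w)
        {
          const std::size_t co = c % Co;   // output channel
          const std::size_t k = c / Co;    // which (bh, bw) offset this channel encodes
          const std::size_t bh = k / b, bw = k % b;
          const std::size_t src = ((n * C + c) * H + h) * W + w;
          const std::size_t dst = ((n * Co + co) * Ho + (h * b + bh)) * Wo + (w * b + bw);
          out[dst] = in[src];
        }
  return out;
}
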
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+index 79f5ce0..67aaf2d 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+@@ -43,6 +43,7 @@
+ #include "arm_compute/core/CL/CLHelpers.h"
+ #include "arm_compute/core/CL/CLKernelLibraryEx.h"
+ #include "arm_compute/core/CL/ICLTensor.h"
++#include "support/StringSupport.h"
+
+ using namespace arm_compute;
+
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
+deleted file mode 100644
+index 235e897..0000000
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
++++ /dev/null
+@@ -1,372 +0,0 @@
+-/*
+- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
+-
+-#include "arm_compute/core/AccessWindowStatic.h"
+-#include "arm_compute/core/AccessWindowTranspose.h"
+-#include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-#include "arm_compute/core/CL/OpenCL.h"
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Utils.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/Window.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+-#include "support/ToolchainSupport.h"
+-
+-#include <cstddef>
+-#include <cstdint>
+-#include <tuple>
+-
+-using namespace arm_compute;
+-using namespace arm_compute::misc::shape_calculator;
+-
+-namespace arm_compute
+-{
+-class Coordinates;
+-} // namespace arm_compute
+-
+-namespace
+-{
+-using ElementsProcessed = Steps;
+-
+-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
+- const ITensorInfo *output, const GEMMReshapeInfo &gemm_info)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4,
+- "The number of dimensions for the matrix A must be <= 4");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3,
+- "The number of dimensions for the matrix B must be <= 3");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 &&
+- gemm_info.reinterpret_input_as_3d(),
+- "The input1 tensor cannot have more than 2 dimensions if input0 "
+- "has to be reinterpreted as 3D");
+-
+- const int m = gemm_info.m();
+- const int n = gemm_info.n();
+- const int k = gemm_info.k();
+-
+- ARM_COMPUTE_UNUSED(m);
+- ARM_COMPUTE_UNUSED(n);
+- ARM_COMPUTE_UNUSED(k);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
+- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
+- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
+- if (gemm_info.reinterpret_input_as_3d())
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) !=
+- static_cast<unsigned int>(m));
+- }
+- else
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
+- }
+-
+- if (output->total_size() != 0)
+- {
+- const TensorInfo tensor_info_output =
+- output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info));
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+- }
+-
+- return Status{};
+-}
+-
+-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1,
+- ITensorInfo *output,
+- const GEMMReshapeInfo &gemm_info,
+- ElementsProcessed &num_elements_processed)
+-{
+- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+-
+- Window win{};
+- Window win_out{};
+- bool window_changed = false;
+-
+- // In case both input and output have to be reinterpreted as 3D tensors,
+- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+- if (reinterpret_input_as_3d == reinterpret_output_as_3d)
+- {
+- reinterpret_input_as_3d = false;
+- reinterpret_output_as_3d = false;
+- }
+-
+-  // Output tensor auto initialization if not yet initialized
+- auto_init_if_empty(*output,
+- input0->clone()
+- ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info))
+- .set_data_type(DataType::S32));
+-
+- TensorInfo tmp_info(*output);
+-
+- if (reinterpret_output_as_3d)
+- {
+- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D
+- // GEMM,
+- // the window needs to be constructed on the 2D collapsed version of the tensor
+- TensorShape tmp_shape(output->tensor_shape());
+- tmp_shape.collapse(2U, 1U);
+- tmp_info.set_tensor_shape(tmp_shape);
+- }
+-
+-  // Special case for 1xN, 2xN, 3xN and 4xN input0 tensors: choose num_elems_processed_per_iteration_x
+- // Note: if the dot product instruction is available, the 8x2 tile has to be used
+- num_elems_processed_per_iteration_x = 4;
+- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+-
+-  // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor.
+-  // The only way to set the paddings properly is to set them explicitly through the
+- // AccessWindowStatic
+- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2]
+- : input0->tensor_shape()[1];
+- const int bottom_pad =
+- (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) %
+- num_elems_processed_per_iteration_y;
+-
+- // Configure window
+- win = calculate_max_window(
+- tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+- win_out = calculate_max_window(
+- *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+-
+- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0),
+- input0->dimension(1) + bottom_pad);
+- AccessWindowStatic input1_access(
+- input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+- input1->dimension(1));
+- AccessWindowStatic output_access(
+- output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+- output->dimension(1) + bottom_pad);
+-
+- window_changed =
+- update_window_and_padding(win, input0_access,
+- input1_access) || // window used by the execute_window_loop
+- update_window_and_padding(
+- win_out,
+- output_access); // window used to update the padding requirements of output tensor
+-
+- Coordinates coord;
+- coord.set_num_dimensions(output->num_dimensions());
+- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
+-
+- // Collapse along the Z direction
+- // This collapse needs to be here in order to tune the Z dimension of LWS
+- Window collapsed = win;
+- const unsigned int dimension_to_collapse =
+- std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+- collapsed = win.collapse(win, dimension_to_collapse);
+-
+- Status err = (window_changed)
+- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+- : Status{};
+- return std::make_pair(err, collapsed);
+-}
+-} // namespace
+-
+-CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx()
+- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true),
+- _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
+-{
+-}
+-
+-void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1,
+- ICLTensor *output,
+- const GEMMReshapeInfo &gemm_info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+-
+- ARM_COMPUTE_ERROR_THROW_ON(
+- validate_arguments(input0->info(), input1->info(), output->info(), gemm_info));
+-
+- _input0 = input0;
+- _input1 = input1;
+- _output = output;
+- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+-
+- // In case both input and output have to be reinterpreted as 3D tensors,
+- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+- if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+- {
+- _reinterpret_input_as_3d = false;
+- _reinterpret_output_as_3d = false;
+- }
+-
+- // Check if we need to slide the matrix B
+- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d
+- ? _input0->info()->num_dimensions() - 1
+- : _input0->info()->num_dimensions();
+- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+-
+- ElementsProcessed num_elements_processed{};
+-
+- // Configure kernel window
+- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(),
+- gemm_info, num_elements_processed);
+- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+- ICLKernel::configure_internal(win_config.second);
+-
+- // Create build options
+- std::string kernel_name(" ");
+- CLBuildOptions build_opts;
+- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+- "-DHEIGHT_GEMM3D=" +
+- support::cpp11::to_string(output->info()->dimension(1)));
+- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+- "-DDEPTH_GEMM3D=" +
+- support::cpp11::to_string(output->info()->dimension(2)));
+- build_opts.add_option_if(!_slide_matrix_b,
+- "-DMATRIX_B_DEPTH=" +
+- support::cpp11::to_string(input1->info()->dimension(2)));
+- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" +
+- support::cpp11::to_string(num_elements_processed.x()));
+- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" +
+- support::cpp11::to_string(num_elements_processed.y()));
+-
+- kernel_name = "gemmlowp_mm_midgard_ex";
+-
+- // Create kernel
+- _kernel = static_cast<cl::Kernel>(
+- CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+-
+- // Set config_id for enabling LWS tuning
+- _config_id = kernel_name;
+- _config_id += "_";
+- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+- _config_id += "_";
+- _config_id += support::cpp11::to_string(output->info()->dimension(1));
+- _config_id += "_";
+- _config_id += support::cpp11::to_string(output->info()->dimension(0));
+-}
+-
+-Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0,
+- const ITensorInfo *input1,
+- const ITensorInfo *output,
+- const GEMMReshapeInfo &gemm_info)
+-{
+- ElementsProcessed num_elements_processed{};
+- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info));
+- ARM_COMPUTE_RETURN_ON_ERROR(
+- validate_and_configure_window(input0->clone().get(), input1->clone().get(),
+- output->clone().get(), gemm_info, num_elements_processed)
+- .first);
+-
+- return Status{};
+-}
+-
+-void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue)
+-{
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+-
+- if (_input1->info()->num_dimensions() < 3)
+- {
+- // The stride_z for matrix B must be zero if we do not slice
+- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+- }
+-
+- Window slice = window.first_slice_window_3D();
+- Window slice_matrix_b = slice;
+-
+- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+-
+- if (_reinterpret_input_as_3d)
+- {
+- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+- const unsigned int total_cross_plane_pad =
+- _input0->info()->padding().top + _input0->info()->padding().bottom;
+- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+- }
+-
+- if (_reinterpret_output_as_3d)
+- {
+- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+- const unsigned int idx0 =
+- 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+- const unsigned int total_cross_plane_pad =
+- _output->info()->padding().top + _output->info()->padding().bottom;
+- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+- }
+-
+- do
+- {
+- Window slice_b = slice;
+- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A
+- // more than 2
+- // This scenario can happen when the matrix multiplication is used to perform a convolution
+- // operation
+- if (!_slide_matrix_b)
+- {
+- slice_b = slice_matrix_b;
+- }
+-
+- unsigned int idx = 0;
+- add_2D_tensor_argument(idx, _input0, slice);
+- add_2D_tensor_argument(idx, _input1, slice_b);
+- add_2D_tensor_argument(idx, _output, slice);
+- _kernel.setArg<cl_uint>(idx++,
+- static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+- _kernel.setArg<cl_uint>(idx++,
+- static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+- _kernel.setArg<cl_uint>(idx++,
+- static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+- enqueue(queue, *this, slice, lws_hint());
+- } while (window.slide_window_slice_3D(slice));
+-}
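
validate_arguments() above pins down what the deleted kernel computed: C(m x n) = A(m x k) * B(k x n) with signed 8-bit inputs and 32-bit accumulation, before any tiling, reinterpretation, or padding tricks. A naive C++ reference of that contract, intended as a mental model rather than a substitute for the tiled OpenCL implementation:

#include <cstddef>
#include <cstdint>
#include <vector>

// Naive S8 x S8 -> S32 matrix multiply matching the shape/type contract enforced by
// validate_arguments(): A is m x k, B is k x n (row-major), the output is m x n in S32.
static std::vector<int32_t> gemmlowp_mm_ref(const std::vector<int8_t> &A,
                                            const std::vector<int8_t> &B, int m, int n, int k)
{
  std::vector<int32_t> C(static_cast<std::size_t>(m) * n, 0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
    {
      int32_t acc = 0;
      for (int p = 0; p < k; ++p)
        acc += static_cast<int32_t>(A[i * k + p]) * static_cast<int32_t>(B[p * n + j]);
      C[static_cast<std::size_t>(i) * n + j] = acc;
    }
  return C;
}
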
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+index 3a25987..3bfe3e4 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+@@ -45,6 +45,7 @@
+ #include "arm_compute/core/CL/ICLTensor.h"
+ #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+ #include "arm_compute/core/UtilsEx.h"
++#include "support/StringSupport.h"
+
+ using namespace arm_compute;
+
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+index 7fbdcda..930e7c9 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+@@ -43,6 +43,7 @@
+ #include "arm_compute/core/CL/CLHelpers.h"
+ #include "arm_compute/core/CL/CLKernelLibraryEx.h"
+ #include "arm_compute/core/CL/ICLTensor.h"
++#include "support/StringSupport.h"
+
+ using namespace arm_compute;
+
+@@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
+ _hits = hits;
+
+ // Make _lookup_indices tensor
+- _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
++ _lookup_indices = support::cpp14::make_unique<CLTensor>();
+ _lookup_indices->allocator()->init(
+ TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+ _lookup_indices->allocator()->allocate();
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+index b45f6bb..61c14d2 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+@@ -48,7 +48,7 @@
+ #include "arm_compute/core/TensorInfo.h"
+ #include "arm_compute/core/Utils.h"
+ #include "arm_compute/core/Window.h"
+-
++#include "support/StringSupport.h"
+ #include "support/ToolchainSupport.h"
+
+ namespace arm_compute
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+index d305896..6b27c99 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+@@ -49,6 +49,7 @@
+ #include "arm_compute/core/Utils.h"
+ #include "arm_compute/core/Validate.h"
+ #include "arm_compute/core/Window.h"
++#include "support/StringSupport.h"
+
+ using namespace arm_compute;
+
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+index 74f7b41..643c8b1 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+@@ -43,6 +43,7 @@
+ #include "arm_compute/core/CL/CLHelpers.h"
+ #include "arm_compute/core/CL/CLKernelLibraryEx.h"
+ #include "arm_compute/core/CL/ICLTensor.h"
++#include "support/StringSupport.h"
+
+ using namespace arm_compute;
+
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+deleted file mode 100644
+index 8910a7b..0000000
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
++++ /dev/null
+@@ -1,210 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+-
+-#include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-
+-using namespace arm_compute;
+-
+-namespace
+-{
+-constexpr unsigned int num_elems_processed_per_iteration = 16;
+-
+-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+-{
+- const TensorShape &out_shape =
+- TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+- DataType::QASYMM8);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
+- DataType::QASYMM8);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+- "Inputs are not broadcast compatible");
+- // Validate in case of configured output
+- if (output->total_size() > 0)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
+- DataType::QASYMM8);
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+- "Wrong shape for output");
+- }
+- return Status{};
+-}
+-} // namespace
+-
+-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+-
+-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
+-{
+- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
+- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+- ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
+-
+- _input = input;
+- _alpha = alpha;
+- _output = output;
+-
+- // Create kernel
+- std::string kernel_name = "prelu";
+- std::set<std::string> build_opts;
+- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+- build_opts.emplace(
+- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+-
+- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+- {
+- build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string(
+- input->info()->quantization_info().uniform().offset));
+- build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string(
+- alpha->info()->quantization_info().uniform().offset));
+- build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string(
+- output->info()->quantization_info().uniform().offset));
+- build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string(
+- input->info()->quantization_info().uniform().scale));
+- build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string(
+- alpha->info()->quantization_info().uniform().scale));
+- build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string(
+- output->info()->quantization_info().uniform().scale));
+- kernel_name += "_qasymm8";
+- }
+- _kernel =
+- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+-
+- const std::pair<TensorShape, ValidRegion> broadcast_pair =
+- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+-
+- const TensorShape &out_shape = broadcast_pair.first;
+- const ValidRegion &valid_region = broadcast_pair.second;
+-
+- // Auto initialize output if not initialized
+- {
+- set_shape_if_empty(*output->info(), out_shape);
+-
+- if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
+- {
+- set_format_if_unknown(*output->info(), Format::F16);
+- }
+- else if (input->info()->data_type() == DataType::F32 ||
+- alpha->info()->data_type() == DataType::F32)
+- {
+- set_format_if_unknown(*output->info(), Format::F32);
+- }
+- }
+-
+- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+- Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
+- Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
+-
+- AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
+- AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
+- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+-
+- update_window_and_padding(win_input1, input1_access) ||
+- update_window_and_padding(win_input2, input2_access) ||
+- update_window_and_padding(win, output_access);
+-
+- output_access.set_valid_region(win, valid_region);
+-
+- ICLKernel::configure_internal(win);
+-}
+-
+-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
+-{
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+-
+- const TensorShape &in_shape1 = _input->info()->tensor_shape();
+- const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
+- const TensorShape &out_shape = _output->info()->tensor_shape();
+-
+- bool can_collapse = true;
+- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+- {
+- can_collapse =
+- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+- {
+- can_collapse = (in_shape1[d] == in_shape2[d]);
+- }
+- }
+-
+- bool has_collapsed = false;
+- Window collapsed =
+- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+- : window;
+-
+- const TensorShape &in_shape1_collapsed =
+- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+- const TensorShape &in_shape2_collapsed =
+- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+-
+- Window slice = collapsed.first_slice_window_3D();
+- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+-
+- do
+- {
+- unsigned int idx = 0;
+- add_3D_tensor_argument(idx, _input, slice_input1);
+- add_3D_tensor_argument(idx, _alpha, slice_input2);
+- add_3D_tensor_argument(idx, _output, slice);
+-
+- enqueue(queue, *this, slice);
+-
+- collapsed.slide_window_slice_3D(slice_input1);
+- collapsed.slide_window_slice_3D(slice_input2);
+- } while (collapsed.slide_window_slice_3D(slice));
+-}
+-
+-BorderSize CLPReLUKernel::border_size() const
+-{
+- const unsigned int replicateSize =
+- _output->info()->dimension(0) -
+- std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
+- const unsigned int border =
+- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+- return BorderSize(0, border, 0, 0);
+-}
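
The PReLU kernel accepts broadcastable input/alpha shapes and relies on TensorShape::broadcast_shape() yielding an empty shape for incompatible operands, which is what the total_size() == 0 check above detects. The sketch below captures that broadcast rule as the kernel appears to assume it; the vector-of-extents representation and the empty-result convention are simplifications of the library's TensorShape API.

#include <algorithm>
#include <cstddef>
#include <vector>

// Per-dimension broadcast rule: extents must match or one of them must be 1; the result
// takes the larger extent. An empty vector signals incompatible shapes.
static std::vector<std::size_t> broadcast_shape_ref(const std::vector<std::size_t> &a,
                                                    const std::vector<std::size_t> &b)
{
  const std::size_t rank = std::max(a.size(), b.size());
  std::vector<std::size_t> out(rank, 1);
  for (std::size_t d = 0; d < rank; ++d)
  {
    const std::size_t da = d < a.size() ? a[d] : 1;
    const std::size_t db = d < b.size() ? b[d] : 1;
    if (da != db && da != 1 && db != 1)
      return {}; // not broadcast compatible
    out[d] = std::max(da, db);
  }
  return out;
}
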
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+index 2d551f6..1a7a18c 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+@@ -49,6 +49,7 @@
+ #include "arm_compute/core/Utils.h"
+ #include "arm_compute/core/Validate.h"
+ #include "arm_compute/core/Window.h"
++#include "support/StringSupport.h"
+
+ namespace arm_compute
+ {
+@@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac
+
+ // Output must always be initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ return Status{};
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+index a983183..06c2579 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+@@ -43,6 +43,7 @@
+ #include "arm_compute/core/CL/CLHelpers.h"
+ #include "arm_compute/core/CL/CLKernelLibraryEx.h"
+ #include "arm_compute/core/CL/ICLTensor.h"
++#include "support/StringSupport.h"
+
+ using namespace arm_compute;
+ namespace
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+index ff1904a..8d8853c 100644
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+@@ -48,6 +48,7 @@
+ #include "arm_compute/core/Validate.h"
+ #include "arm_compute/core/Window.h"
+ #include "arm_compute/core/utils/misc/ShapeCalculator.h"
++#include "support/StringSupport.h"
+
+ #include <climits>
+
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+deleted file mode 100644
+index 64fc038..0000000
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
++++ /dev/null
+@@ -1,148 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+-
+-#include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-
+-using namespace arm_compute;
+-
+-namespace
+-{
+-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+- const int32_t block_size)
+-{
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+- DataType::S16, DataType::S32, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+- DataType::S16, DataType::S32, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+- "Block size should be greater than or equal to 1.");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+- "Input batch should be equal to Output batch");
+-
+- auto layout_out = input->data_layout();
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+-
+- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+- auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
+- auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+- input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
+-      "Output depth should be equal to (input depth * block size * block size)");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
+- (input->dimension(index_height) % block_size),
+- "Input height and width should be divisible by block size");
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+- (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
+- (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
+- "Output height and width should be equal to "
+- "input_height/blocksize and input_width/blocksize respectively");
+-
+- return Status{};
+-}
+-
+-} // namespace
+-
+-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+-
+-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+- const int32_t block_size)
+-{
+-
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+-
+- _input = input;
+- _output = output;
+-
+- // Set kernel build options
+- auto layout_out = input->info()->data_layout();
+- std::set<std::string> build_opts;
+- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+- auto depth = input->info()->dimension(index_depth);
+- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
+- build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
+-
+- // Create kernel
+- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+- "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*input->info(), Steps());
+-
+- Coordinates coord;
+- coord.set_num_dimensions(output->info()->num_dimensions());
+- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+-
+- ICLKernel::configure_internal(win);
+-}
+-
+-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+-{
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+-
+- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+-
+- // Setup output slice
+- Window slice_out(slice_in);
+- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+- slice_out.set(3, Window::Dimension(0, 0, 0));
+-
+- do
+- {
+- unsigned int idx = 0;
+- add_4D_tensor_argument(idx, _input, slice_in);
+- add_4D_tensor_argument(idx, _output, slice_out);
+- enqueue(queue, *this, slice_in);
+- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+-}
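
The CLSpaceToDepthKernel deleted above rearranges each block_size x block_size spatial patch into the channel dimension, subject to the shape checks in validate_arguments (height and width divisible by block_size, output depth = input depth * block_size^2). A scalar NHWC sketch of that rearrangement, assuming batch size 1 and the common "block row, block column, then channel" output-channel ordering (the actual ordering lives in the .cl kernel, which this diff does not show):

#include <cstddef>
#include <vector>

// Space-to-depth for a flattened NHWC float tensor, batch = 1.
// Each block_size x block_size patch becomes block_size^2 channel groups.
std::vector<float> space_to_depth_nhwc(const std::vector<float> &in, int H, int W,
                                       int C, int block_size)
{
  const int Ho = H / block_size;
  const int Wo = W / block_size;
  const int Co = C * block_size * block_size;
  std::vector<float> out(static_cast<std::size_t>(Ho) * Wo * Co);
  for (int y = 0; y < H; ++y)
    for (int x = 0; x < W; ++x)
      for (int c = 0; c < C; ++c)
      {
        const int oy = y / block_size;
        const int ox = x / block_size;
        // Within-block offset selects the channel group; ordering is an assumption.
        const int oc = ((y % block_size) * block_size + (x % block_size)) * C + c;
        out[(static_cast<std::size_t>(oy) * Wo + ox) * Co + oc] =
            in[(static_cast<std::size_t>(y) * W + x) * C + c];
      }
  return out;
}

For NCHW the same mapping applies with the channel index moved to the outermost position.
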
+diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
+deleted file mode 100644
+index 61999cb..0000000
+--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
++++ /dev/null
+@@ -1,188 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+-
+-#include "arm_compute/core/CL/CLHelpers.h"
+-#include "arm_compute/core/CL/CLKernelLibrary.h"
+-#include "arm_compute/core/CL/CLValidate.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/Window.h"
+-
+-using namespace arm_compute;
+-
+-CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel()
+- : _input(nullptr), _output(nullptr), _inner_border(), _info()
+-{
+-}
+-
+-Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input,
+- const ITensorInfo *output,
+- const BorderSize &inner_border,
+- const PadStrideInfo &info)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+-
+- const DataLayout data_layout = input->data_layout();
+-
+- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
+- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
+- for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+- }
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1,
+-                                  "inner_border_right must be smaller than stride_x");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1,
+-                                  "inner_border_top must be smaller than stride_y");
+-
+- return Status{};
+-}
+-
+-void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
+- const BorderSize &inner_border,
+- const PadStrideInfo &info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+-
+- _input = input;
+- _output = output;
+- _inner_border = inner_border;
+- _info = info;
+-
+- // Perform validation step
+- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate(
+- input->info(), output->info(), inner_border, info));
+-
+- // Create kernel
+- CLBuildOptions build_opts;
+- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+- _kernel = static_cast<cl::Kernel>(
+- CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
+-
+- constexpr unsigned int num_elems_processed_per_iteration = 1;
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+-
+- ICLKernel::configure_internal(win);
+-}
+-
+-void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
+-{
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+-
+- const DataLayout data_layout = _input->info()->data_layout();
+-
+- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+-
+- const int out_start_x = _info.pad_left();
+- const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right -
+- _info.pad_right() + _info.stride().first - 1;
+- const int out_step_x = _info.stride().first;
+-
+- const int out_start_y = _inner_border.top + _info.pad_top();
+- const int out_end_y =
+- _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
+- const int out_step_y = _info.stride().second;
+-
+- switch (data_layout)
+- {
+- case DataLayout::NCHW:
+- {
+- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+-
+- Window slice_out = collapsed.first_slice_window_3D();
+- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+-
+- Window slice_in = collapsed.first_slice_window_3D();
+-
+- do
+- {
+- unsigned int idx = 0;
+- add_3D_tensor_argument(idx, _input, slice_in);
+- add_3D_tensor_argument(idx, _output, slice_out);
+- enqueue(queue, *this, slice_out);
+- } while (collapsed.slide_window_slice_3D(slice_in) &&
+- collapsed.slide_window_slice_3D(slice_out));
+- break;
+- }
+- case DataLayout::NHWC:
+- {
+- // NOTE: not collapsing in NHWC
+- Window slice_out = window.first_slice_window_3D();
+- slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
+- slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
+-
+- Window slice_in = window.first_slice_window_3D();
+-
+- do
+- {
+- unsigned int idx = 0;
+- add_3D_tensor_argument(idx, _input, slice_in);
+- add_3D_tensor_argument(idx, _output, slice_out);
+- enqueue(queue, *this, slice_out);
+- } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+- break;
+- }
+- default:
+- ARM_COMPUTE_ERROR("Unsupported data layout");
+- }
+-}
+diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
+deleted file mode 100644
+index 648afb3..0000000
+--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
++++ /dev/null
+@@ -1,118 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
+-
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/ITensor.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+-
+-#include <cstddef>
+-#include <cstdint>
+-
+-namespace arm_compute
+-{
+-CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {}
+-
+-bool CPPUpsampleKernelEx::is_parallelisable() const { return false; }
+-
+-void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output,
+- const PadStrideInfo &info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+-
+- _input = input;
+- _output = output;
+- _info = info;
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*input->info(), Steps());
+-
+- // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped
+- Coordinates coord;
+- coord.set_num_dimensions(output->info()->num_dimensions());
+- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+-
+- ICPPKernel::configure(win);
+-}
+-
+-void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info)
+-{
+- ARM_COMPUTE_UNUSED(info);
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+-
+- // Initialize _scaled_output buffer
+- const int width_scaled = _output->info()->dimension(0);
+- const int height_scaled = _output->info()->dimension(1);
+- const int stride_x = _info.stride().first;
+- const int stride_y = _info.stride().second;
+- const int start_x = _info.pad_left();
+- const int start_y = _info.pad_top();
+- const int end_y = height_scaled - _info.pad_bottom();
+- const int end_x = width_scaled - _info.pad_top();
+- const size_t element_size = _input->info()->element_size();
+-
+- // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset
+- const uint8_t fill_value =
+- _output->info()->data_type() == DataType::QASYMM8
+- ? utility::clamp<uint8_t>(_output->info()->quantization_info().uniform().offset)
+- : 0;
+-  // Filling a value different from 0 works only for the QASYMM8 data type since we are filling
+-  // 1-byte values in a buffer of uint8_t
+- std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value);
+-
+- // Create window
+- Window window_out(window);
+- window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x));
+- window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y));
+-
+- // Create iterators
+- Iterator in(_input, window);
+- Iterator out(_output, window_out);
+-
+- execute_window_loop(
+- window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
+-}
+-} // namespace arm_compute
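
Both upsample kernels deleted above (the CL variant and this CPP variant) perform the same zero-insertion step of a transposed convolution: the output is pre-filled with a constant (the QASYMM8 zero-point for quantized tensors, 0 otherwise) and the input elements are scattered stride_x/stride_y apart starting at (pad_left, pad_top). A single-channel 2D sketch, with the function name and the float element type chosen purely for illustration:

#include <cstddef>
#include <vector>

// Scatter 'in' (in_w x in_h) into an output pre-filled with fill_value.
// Assumes out_w >= pad_left + (in_w - 1) * stride_x + 1 and the analogous
// condition for the height, as guaranteed by the deleted validate() step.
std::vector<float> upsample_scatter(const std::vector<float> &in, int in_w, int in_h,
                                    int stride_x, int stride_y, int pad_left,
                                    int pad_top, int out_w, int out_h,
                                    float fill_value = 0.0f)
{
  std::vector<float> out(static_cast<std::size_t>(out_w) * out_h, fill_value);
  for (int y = 0; y < in_h; ++y)
    for (int x = 0; x < in_w; ++x)
    {
      const int ox = pad_left + x * stride_x;
      const int oy = pad_top + y * stride_y;
      out[static_cast<std::size_t>(oy) * out_w + ox] =
          in[static_cast<std::size_t>(y) * in_w + x];
    }
  return out;
}
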
+diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
+deleted file mode 100644
+index fbb9dbc..0000000
+--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
++++ /dev/null
+@@ -1,671 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/NEON/kernels/NECastKernel.h"
+-
+-#include "arm_compute/core/AccessWindowStatic.h"
+-#include "arm_compute/core/CPP/Validate.h"
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/NEON/NEAsymm.h"
+-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+-#include "arm_compute/core/Utils.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/Window.h"
+-
+-#include <arm_neon.h>
+-
+-namespace arm_compute
+-{
+-namespace
+-{
+-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+- SubDataType input_subtype)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8,
+- DataType::QASYMM8, DataType::U32,
+- DataType::S32, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL &&
+- input->data_type() != DataType::U8);
+-
+- if (output->tensor_shape().total_size() > 0)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+- DataType::QASYMM8, DataType::U32,
+- DataType::S32, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+- }
+-
+- return Status{};
+-}
+-
+-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+-{
+- // Configure kernel window
+- Window win = calculate_max_window(*input, Steps());
+-
+- // Output tensor auto initialization if not yet initialized
+- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
+-
+- // NECastKernel doesn't need padding so update_window_and_padding() can be skipped
+- Coordinates coord;
+- coord.set_num_dimensions(output->num_dimensions());
+- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+-
+- return std::make_tuple(Status{}, win);
+-}
+-
+-typedef struct bool8x16
+-{
+- uint8x16_t val;
+-} bool8x16_t;
+-
+-static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; }
+-
+-template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; }
+-template <> inline uint8x16_t vcast(const bool8x16_t &v)
+-{
+- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+- return vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+-}
+-
+-template <> inline uint32x4x4_t vcast(const bool8x16_t &v)
+-{
+- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+-
+- const uint32x4x4_t ret = {{
+- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))),
+- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))),
+- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))),
+- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline int32x4x4_t vcast(const bool8x16_t &v)
+-{
+- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+-
+- const int32x4x4_t ret = {{
+- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
+- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
+- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
+- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline float32x4x4_t vcast(const bool8x16_t &v)
+-{
+- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+-
+- const float32x4x4_t ret = {{
+- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
+- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
+- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
+- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline uint32x4x4_t vcast(const uint8x16_t &v)
+-{
+- const uint32x4x4_t ret = {{
+- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))),
+- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))),
+- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))),
+- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline int32x4x4_t vcast(const uint8x16_t &v)
+-{
+- const int32x4x4_t ret = {{
+- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
+- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
+- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
+- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline float32x4x4_t vcast(const uint8x16_t &v)
+-{
+- const float32x4x4_t ret = {{
+- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
+- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
+- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
+- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline uint8x16_t vcast(const int32x4x4_t &v)
+-{
+- // Saturate cast
+- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))),
+- vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3]))));
+-}
+-
+-template <> inline uint32x4x4_t vcast(const int32x4x4_t &v)
+-{
+- // Saturate cast
+- const uint32x4x4_t ret = {{
+- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))),
+- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))),
+- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))),
+- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))),
+- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))),
+- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))),
+- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))),
+- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline float32x4x4_t vcast(const int32x4x4_t &v)
+-{
+- const float32x4x4_t ret = {{
+- vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]),
+- vcvtq_f32_s32(v.val[3]),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline uint8x16_t vcast(const uint32x4x4_t &v)
+-{
+- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))),
+- vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3]))));
+-}
+-
+-template <> inline int32x4x4_t vcast(const uint32x4x4_t &v)
+-{
+- const int32x4x4_t ret = {{
+- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))),
+- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))),
+- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))),
+- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))),
+- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))),
+- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))),
+- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))),
+- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline float32x4x4_t vcast(const uint32x4x4_t &v)
+-{
+- const float32x4x4_t ret = {{
+- vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]),
+- vcvtq_f32_u32(v.val[3]),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline uint8x16_t vcast(const float32x4x4_t &v)
+-{
+- // Saturate cast
+- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])),
+- vqmovun_s32(vcvtq_s32_f32(v.val[1])))),
+- vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])),
+- vqmovun_s32(vcvtq_s32_f32(v.val[3])))));
+-}
+-
+-template <> inline uint32x4x4_t vcast(const float32x4x4_t &v)
+-{
+- const uint32x4x4_t ret = {{
+- vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]),
+- vcvtq_u32_f32(v.val[3]),
+- }};
+-
+- return ret;
+-}
+-
+-template <> inline int32x4x4_t vcast(const float32x4x4_t &v)
+-{
+- const int32x4x4_t ret = {{
+- vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]),
+- vcvtq_s32_f32(v.val[3]),
+- }};
+-
+- return ret;
+-}
+-
+-template <typename T> struct cast_vector;
+-template <> struct cast_vector<bool>
+-{
+- using type = bool8x16_t;
+-};
+-template <> struct cast_vector<uint8_t>
+-{
+- using type = uint8x16_t;
+-};
+-template <> struct cast_vector<uint32_t>
+-{
+- using type = uint32x4x4_t;
+-};
+-template <> struct cast_vector<int32_t>
+-{
+- using type = int32x4x4_t;
+-};
+-template <> struct cast_vector<float>
+-{
+- using type = float32x4x4_t;
+-};
+-
+-template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v)
+-{
+- wrapper::vstore(ptr, v.val[0]);
+- wrapper::vstore(ptr + 4, v.val[1]);
+- wrapper::vstore(ptr + 8, v.val[2]);
+- wrapper::vstore(ptr + 12, v.val[3]);
+-}
+-
+-template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v)
+-{
+- wrapper::vstore(ptr, v);
+-}
+-
+-inline bool8x16_t vloadq(const bool *ptr)
+-{
+- bool8x16_t ret;
+- ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr));
+- return ret;
+-}
+-
+-template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr)
+-{
+- return wrapper::vloadq(ptr);
+-}
+-
+-template <> inline typename cast_vector<bool>::type load_input(const bool *ptr)
+-{
+- return vloadq(ptr);
+-}
+-
+-template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr)
+-{
+- return vld4q_u32(ptr);
+-}
+-
+-template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr)
+-{
+- return vld4q_s32(ptr);
+-}
+-
+-template <> inline typename cast_vector<float>::type load_input(const float *ptr)
+-{
+- return vld4q_f32(ptr);
+-}
+-
+-template <typename T> inline T get_value(const T *ptr) { return *ptr; }
+-
+-template <> inline bool get_value(const bool *ptr)
+-{
+- bool ret = (*ptr != 0);
+- return ret;
+-}
+-
+-template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window)
+-{
+- const int window_step_x = 16;
+- const auto window_start_x = static_cast<int>(window.x().start());
+- const auto window_end_x = static_cast<int>(window.x().end());
+-
+- // Collapse window and reset first dimension to handle tail calculations manually
+- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+-
+- // Create iterators
+- Iterator in(input, win_collapsed);
+- Iterator out(output, win_collapsed);
+-
+-#ifdef __aarch64__
+- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+-#else //__aarch64__
+- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+-#endif //__aarch64__
+-
+- execute_window_loop(
+- win_collapsed,
+- [&](const Coordinates &) {
+- const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr());
+-
+- int x = window_start_x;
+- for (; x <= (window_end_x - window_step_x); x += window_step_x)
+- {
+- using from_vector = typename cast_vector<FromT>::type;
+- const from_vector vin = load_input(in_ptr + x);
+-
+- switch (output->info()->data_type())
+- {
+- case DataType::U8:
+- {
+- using to_vector = typename cast_vector<uint8_t>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::QASYMM8:
+- {
+- using to_vector = typename cast_vector<float>::type;
+- const UniformQuantizationInfo &qinfo_out =
+- output->info()->quantization_info().uniform();
+- const auto vf = vcast<to_vector, from_vector>(vin);
+- const auto vout = vquantize(vf, qinfo_out);
+- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::U32:
+- {
+- using to_vector = typename cast_vector<uint32_t>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::S32:
+- {
+- using to_vector = typename cast_vector<int32_t>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::F32:
+- {
+- using to_vector = typename cast_vector<float>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
+- break;
+- }
+- default:
+- ARM_COMPUTE_ERROR("Unsupported data type.");
+- }
+- }
+-
+- // Compute left-over elements
+- for (; x < window_end_x; ++x)
+- {
+- FromT val = get_value(in_ptr + x);
+- switch (output->info()->data_type())
+- {
+- case DataType::U8:
+- {
+- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
+- break;
+- }
+- case DataType::QASYMM8:
+- {
+- const QuantizationInfo &qinfo_out = output->info()->quantization_info();
+- const auto qval =
+- quantize_qasymm8(static_cast<float>(val), qinfo_out, rounding_policy);
+- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval;
+- break;
+- }
+- case DataType::U32:
+- {
+- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
+- break;
+- }
+- case DataType::S32:
+- {
+- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
+- break;
+- }
+- case DataType::F32:
+- {
+- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
+- break;
+- }
+- default:
+- ARM_COMPUTE_ERROR("Unsupported data type.");
+- }
+- }
+- },
+- in, out);
+-}
+-
+-void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window)
+-{
+- const int window_step_x = 16;
+- const auto window_start_x = static_cast<int>(window.x().start());
+- const auto window_end_x = static_cast<int>(window.x().end());
+-
+- // Collapse window and reset first dimension to handle tail calculations manually
+- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+-
+- // Create iterators
+- Iterator in(input, win_collapsed);
+- Iterator out(output, win_collapsed);
+-
+-#ifdef __aarch64__
+- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+-#else //__aarch64__
+- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+-#endif //__aarch64__
+- const auto &qinfo_in = input->info()->quantization_info().uniform();
+- const auto &qinfo_out = output->info()->quantization_info().uniform();
+-
+- execute_window_loop(
+- win_collapsed,
+- [&](const Coordinates &) {
+- const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr());
+-
+- int x = window_start_x;
+- for (; x <= (window_end_x - window_step_x); x += window_step_x)
+- {
+- using from_vector = typename cast_vector<float>::type;
+- const auto vf = wrapper::vloadq(in_ptr + x);
+- const auto vin = vdequantize(vf, qinfo_in);
+- switch (output->info()->data_type())
+- {
+- case DataType::U8:
+- {
+- using to_vector = typename cast_vector<uint8_t>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::QASYMM8:
+- {
+- using to_vector = typename cast_vector<float>::type;
+- const auto vf = vcast<to_vector, from_vector>(vin);
+- const auto vout = vquantize(vf, qinfo_out);
+- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::U32:
+- {
+- using to_vector = typename cast_vector<uint32_t>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::S32:
+- {
+- using to_vector = typename cast_vector<int32_t>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
+- break;
+- }
+- case DataType::F32:
+- {
+- using to_vector = typename cast_vector<float>::type;
+- const to_vector vout = vcast<to_vector, from_vector>(vin);
+- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
+- break;
+- }
+- default:
+- ARM_COMPUTE_ERROR("Unsupported data type.");
+- }
+- }
+-
+- // Compute left-over elements
+- for (; x < window_end_x; ++x)
+- {
+- qasymm8_t qval_in = *(in_ptr + x);
+- const auto val = dequantize_qasymm8(qval_in, qinfo_in);
+-
+- switch (output->info()->data_type())
+- {
+- case DataType::U8:
+- {
+- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
+- break;
+- }
+- case DataType::QASYMM8:
+- {
+- const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy);
+- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out;
+- break;
+- }
+- case DataType::U32:
+- {
+- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
+- break;
+- }
+- case DataType::S32:
+- {
+- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
+- break;
+- }
+- case DataType::F32:
+- {
+- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
+- break;
+- }
+- default:
+- ARM_COMPUTE_ERROR("Unsupported data type.");
+- }
+- }
+- },
+- in, out);
+-}
+-} // namespace
+-
+-NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE)
+-{
+-}
+-
+-void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype));
+-
+- _input = input;
+- _output = output;
+- _input_subtype = input_subtype;
+-
+- // Configure kernel window
+- auto win_config = validate_and_configure_window(input->info(), output->info());
+-
+- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+-
+- INEKernel::configure(std::get<1>(win_config));
+-}
+-
+-Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+- SubDataType input_subtype)
+-{
+- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype));
+- ARM_COMPUTE_RETURN_ON_ERROR(
+- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+- return Status{};
+-}
+-
+-void NECastKernel::run(const Window &window, const ThreadInfo &info)
+-{
+- ARM_COMPUTE_UNUSED(info);
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+-
+- switch (_input->info()->data_type())
+- {
+- case DataType::U8:
+- if (_input_subtype == SubDataType::BOOL)
+- {
+- run_cast<bool>(_input, _output, window);
+- }
+- else
+- {
+- run_cast<uint8_t>(_input, _output, window);
+- }
+- break;
+- case DataType::QASYMM8:
+- run_cast_qasymm8(_input, _output, window);
+- break;
+- case DataType::U32:
+- run_cast<uint32_t>(_input, _output, window);
+- break;
+- case DataType::S32:
+- run_cast<int32_t>(_input, _output, window);
+- break;
+- case DataType::F32:
+- run_cast<float>(_input, _output, window);
+- break;
+- default:
+- ARM_COMPUTE_ERROR("Unsupported data type.");
+- }
+-}
+-} // namespace arm_compute
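
One detail worth noting from the deleted NECastKernel is its BOOL sub-type handling: booleans arrive as raw uint8 bytes, and any non-zero byte is normalized to exactly 1 before being widened to the destination type (the NEON path above does this with vcgtq_u8 followed by vshrq_n_u8(mask, 7)). A scalar sketch of the same normalization, with an illustrative function name:

#include <cstddef>
#include <cstdint>
#include <vector>

// Normalize bool-as-uint8 input to 0/1, then cast to the destination type.
template <typename DstT>
std::vector<DstT> cast_from_bool_bytes(const std::vector<uint8_t> &in)
{
  std::vector<DstT> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i)
  {
    out[i] = static_cast<DstT>(in[i] != 0 ? 1 : 0);
  }
  return out;
}

For example, cast_from_bool_bytes<float>(bytes) corresponds to a BOOL-to-F32 cast.
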
+diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
+deleted file mode 100644
+index 95e269d..0000000
+--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
++++ /dev/null
+@@ -1,181 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
+-
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/ITensor.h"
+-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+-#include <arm_neon.h>
+-#include <cstdint>
+-
+-using namespace arm_compute::misc::shape_calculator;
+-
+-namespace arm_compute
+-{
+-namespace
+-{
+-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2);
+-
+- const DataLayout data_layout = input->data_layout();
+- const int idx_channel =
+- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
+- 0);
+- // Validate output if initialized
+- if (output->total_size() != 0)
+- {
+- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+- const int idx_height =
+- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+- (block_shape * input->tensor_shape()[idx_width]));
+- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+- (block_shape * input->tensor_shape()[idx_height]));
+- ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+- }
+-
+- return Status{};
+-}
+-} // namespace
+-
+-NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx()
+- : _input(nullptr), _output(nullptr), _block_shape()
+-{
+-}
+-
+-void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output,
+- int32_t block_shape)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+- TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape);
+-  // Output auto initialization if not yet initialized
+- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+-
+- // Perform validation step
+- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
+-
+- _input = input;
+- _output = output;
+- _block_shape = block_shape;
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*input->info(), Steps());
+- ICPPKernel::configure(win);
+-}
+-
+-Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+- int32_t block_shape)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
+- return Status{};
+-}
+-
+-void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+-{
+- ARM_COMPUTE_UNUSED(info);
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+-
+- const int idx_channel =
+- get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+- const int depth_size = _input->info()->dimension(idx_channel);
+- const int r = (depth_size / (_block_shape * _block_shape));
+- const int element_size = _input->info()->element_size();
+-
+- Window slice_out = window.first_slice_window_3D();
+-
+- // The slice_out slice does not move
+- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+-
+- // Main loop for NCHW and NHWC
+- if (_input->info()->data_layout() == DataLayout::NCHW)
+- {
+- Window slice_in = window.first_slice_window_2D();
+- do
+- {
+- Iterator in(_input, slice_in);
+- execute_window_loop(slice_in,
+- [&](const Coordinates &id) {
+- const int x = id.x();
+- const int y = id.y();
+-
+- const int z = id.z() % r;
+- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
+- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
+- Coordinates output_coords{out_x, out_y, z, id[3]};
+- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+- },
+- in);
+- } while (window.slide_window_slice_2D(slice_in));
+- }
+- else
+- {
+- Window slice_in = window.first_slice_window_3D();
+- do
+- {
+- Iterator in(_input, slice_in);
+- execute_window_loop(slice_in,
+- [&](const Coordinates &id) {
+- const int x = id.y();
+- const int y = id.z();
+-
+- const int z = id.x() % r;
+- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
+- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
+- Coordinates output_coords{z, out_x, out_y, id[3]};
+- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+- },
+- in);
+- } while (window.slide_window_slice_3D(slice_in));
+- }
+-}
+-} // namespace arm_compute
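
The deleted NEDepthToSpaceLayerKernelEx relies on the index arithmetic visible in run() above: with r = depth / block_shape^2, channel z contributes to output channel z % r, while z / r selects the position inside the block. A scalar NCHW sketch that mirrors exactly that mapping (the flattened buffers and the function name are illustrative):

#include <cstddef>
#include <vector>

// Depth-to-space for a flattened NCHW float tensor, batch = 1,
// using the same out_x / out_y / out_z formulas as the deleted kernel.
std::vector<float> depth_to_space_nchw(const std::vector<float> &in, int C, int H,
                                       int W, int block)
{
  const int r = C / (block * block);
  const int Ho = H * block;
  const int Wo = W * block;
  std::vector<float> out(static_cast<std::size_t>(r) * Ho * Wo);
  for (int z = 0; z < C; ++z)
    for (int y = 0; y < H; ++y)
      for (int x = 0; x < W; ++x)
      {
        const int out_x = x * block + (z / r) % block;
        const int out_y = y * block + (z / r) / block;
        const int out_z = z % r;
        out[(static_cast<std::size_t>(out_z) * Ho + out_y) * Wo + out_x] =
            in[(static_cast<std::size_t>(z) * H + y) * W + x];
      }
  return out;
}
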
+diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
+deleted file mode 100644
+index 200fc4f..0000000
+--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
++++ /dev/null
+@@ -1,221 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+-
+-#include "arm_compute/core/CPP/Validate.h"
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/IAccessWindow.h"
+-#include "arm_compute/core/ITensor.h"
+-#include "arm_compute/core/NEON/NEAsymm.h"
+-#include "arm_compute/core/NEON/NEFixedPoint.h"
+-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Validate.h"
+-
+-#include <algorithm>
+-#include <arm_neon.h>
+-#include <cstdint>
+-#include <map>
+-#include <string>
+-
+-namespace arm_compute
+-{
+-class Coordinates;
+-
+-namespace
+-{
+-template <ElementWiseUnaryEx op, typename ScalarType>
+-inline ScalarType elementwise_op_scalar(const ScalarType &a)
+-{
+- switch (op)
+- {
+- case ElementWiseUnaryEx::NEG:
+- return -a;
+- default:
+- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+- }
+-}
+-
+-template <ElementWiseUnaryEx op, typename VectorType>
+-inline VectorType elementwise_op(const VectorType &a)
+-{
+- switch (op)
+- {
+- case ElementWiseUnaryEx::NEG:
+- return wrapper::vneg(a);
+- default:
+- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+- }
+-}
+-
+-template <ElementWiseUnaryEx op, typename ScalarType>
+-void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
+-{
+- const int window_step_x = 16 / sizeof(ScalarType);
+- const auto window_start_x = static_cast<int>(window.x().start());
+- const auto window_end_x = static_cast<int>(window.x().end());
+-
+- Window win = window;
+- win.set(Window::DimX, Window::Dimension(0, 1, 1));
+-
+- Iterator input(in, win);
+- Iterator output(out, win);
+-
+- execute_window_loop(win,
+- [&](const Coordinates &) {
+- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+-
+- int x = window_start_x;
+- for (; x <= window_end_x - window_step_x; x += window_step_x)
+- {
+- wrapper::vstore(output_ptr + x,
+- elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
+- }
+- for (; x < window_end_x; ++x)
+- {
+- *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
+- }
+- },
+- input, output);
+-}
+-
+-template <ElementWiseUnaryEx op>
+-std::function<void(const ITensor *input, ITensor *output, const Window &window)>
+-configure_func(const ITensor *input, ITensor *output)
+-{
+- std::string function_to_call("op_");
+- function_to_call += string_from_data_type(input->info()->data_type()) + "_";
+- function_to_call += string_from_data_type(output->info()->data_type());
+-
+- static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *>
+- map_function = {
+- {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>},
+- };
+-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+- map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
+-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+-
+- auto it = map_function.find(function_to_call);
+-
+- if (it != map_function.end())
+- {
+- auto func = it->second;
+- return [func](const ITensor *input, ITensor *output, const Window &window) {
+- func(input, output, window);
+- };
+- }
+- return nullptr;
+-}
+-} // namespace
+-
+-NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx()
+- : _function(nullptr), _input(nullptr), _output(nullptr)
+-{
+-}
+-
+-void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input,
+- ITensor *output)
+-{
+- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+-
+- // Configure kernel window
+- const std::pair<TensorShape, ValidRegion> broadcast_pair =
+- ITensorInfo::broadcast_shape_and_valid_region(*input->info());
+- const TensorShape &out_shape = broadcast_pair.first;
+- const ValidRegion &valid_region = broadcast_pair.second;
+-
+- // Auto initialize output if not initialized
+- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+-
+- Window win = calculate_max_window(valid_region);
+-
+- _input = input;
+- _output = output;
+-
+- INEKernel::configure(win);
+-
+- switch (op)
+- {
+- case ElementWiseUnaryEx::NEG:
+- _function = configure_func<ElementWiseUnaryEx::NEG>(input, output);
+- break;
+- default:
+- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+- }
+-}
+-
+-Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input,
+- const ITensorInfo &output)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32,
+- DataType::S32);
+-
+- // Validate in case of configured output
+- if (output.total_size() > 0)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+- }
+-
+- return Status{};
+-}
+-
+-Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input,
+- const ITensorInfo *output)
+-{
+- ARM_COMPUTE_UNUSED(op);
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
+- return Status{};
+-}
+-
+-void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info)
+-{
+- ARM_COMPUTE_UNUSED(info);
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+- ARM_COMPUTE_ERROR_ON(_function == nullptr);
+- _function(_input, _output, window);
+-}
+-} // namespace arm_compute
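Note on the dispatch pattern in the file removed above: configure_func builds a key such as "op_F32_F32" from the input/output data types and looks up a specialised implementation in a map. A minimal standalone sketch of that idea in plain C++ follows; the function name and the NEG lambda are illustrative only, not ACL API.

#include <functional>
#include <map>
#include <string>

using UnaryFn = std::function<float(float)>;

// Pick a type-specialised function by a string key built from the tensor
// data types, mirroring configure_func above; returns an empty function
// when the type combination is unsupported.
UnaryFn select_unary_function(const std::string &in_type, const std::string &out_type)
{
  static const std::map<std::string, UnaryFn> table = {
      {"op_F32_F32", [](float x) { return -x; }}, // NEG specialisation
  };
  const auto it = table.find("op_" + in_type + "_" + out_type);
  return it != table.end() ? it->second : UnaryFn{};
}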
+diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
+deleted file mode 100644
+index 641641b..0000000
+--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
++++ /dev/null
+@@ -1,291 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+-
+-#include "arm_compute/core/ITensor.h"
+-#include "arm_compute/core/NEON/NEAsymm.h"
+-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Window.h"
+-
+-#include <arm_neon.h>
+-
+-using namespace arm_compute;
+-namespace
+-{
+-
+-/** Conditional element-wise operations */
+-enum class ConditionalOperation
+-{
+- PRELU, /**< (x * y) for x < 0, x for x >= 0 */
+-};
+-
+-template <ConditionalOperation op, typename ScalarType>
+-inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
+-{
+- auto res = ScalarType(0);
+-
+- switch (op)
+- {
+- case ConditionalOperation::PRELU:
+- res = a < 0 ? a * b : a;
+- break;
+- default:
+- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+- }
+- return res;
+-}
+-
+-template <ConditionalOperation op>
+-inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
+- QuantizationInfo qinfo)
+-{
+- return quantize_qasymm8(elementwise_conditional_op_scalar<op>(a, b), qinfo,
+- RoundingPolicy::TO_NEAREST_UP);
+-}
+-
+-template <ConditionalOperation op, typename VectorType>
+-inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
+-{
+- VectorType res = {0, 0, 0, 0};
+- VectorType const_0 = {0, 0, 0, 0};
+-
+- switch (op)
+- {
+- case ConditionalOperation::PRELU:
+- res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
+- ;
+- break;
+- default:
+- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+- }
+- return res;
+-}
+-
+-template <ConditionalOperation op>
+-inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
+-{
+- float32x4x4_t out = {{
+- elementwise_conditional_op<op>(a.val[0], b.val[0]),
+- elementwise_conditional_op<op>(a.val[1], b.val[1]),
+- elementwise_conditional_op<op>(a.val[2], b.val[2]),
+- elementwise_conditional_op<op>(a.val[3], b.val[3]),
+- }};
+- return out;
+-}
+-
+-template <ConditionalOperation op, typename ScalarType, typename VectorType>
+-inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
+- const ScalarType &broadcast_value,
+- const bool reorder)
+-{
+- VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+- return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
+- reorder ? a : broadcast_vector);
+-}
+-
+-template <ConditionalOperation op, typename ScalarType, typename VectorType>
+-inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x,
+- const ScalarType *input1_ptr,
+- const ScalarType *input2_ptr, ScalarType *output_ptr)
+-{
+- int x = window_start_x;
+- for (; x <= (window_end_x - window_step_x); x += window_step_x)
+- {
+- const auto a = wrapper::vloadq(input1_ptr + x);
+- const auto b = wrapper::vloadq(input2_ptr + x);
+- wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b));
+- }
+- return x;
+-}
+-
+-template <ConditionalOperation op>
+-inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x,
+- int window_step_x, const uint8_t *input1_ptr,
+- const uint8_t *input2_ptr, uint8_t *output_ptr,
+- int32x4_t voffset1, int32x4_t voffset2,
+- float32x4_t vscale1, float32x4_t vscale2,
+- float32x4_t voffseto, float32x4_t invvscaleo)
+-{
+- int x = window_start_x;
+- for (; x <= (window_end_x - window_step_x); x += window_step_x)
+- {
+- // Get inputs and compute output
+- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+- const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf);
+- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+- }
+- return x;
+-}
+-
+-template <ConditionalOperation op, typename ScalarType, typename VectorType>
+-inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x,
+- int window_step_x,
+- const ScalarType *non_broadcast_input_ptr,
+- const ScalarType &broadcast_value,
+- ScalarType *output_ptr, const bool reorder)
+-{
+- int x = window_start_x;
+- for (; x <= (window_end_x - window_step_x); x += window_step_x)
+- {
+- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+- wrapper::vstore(output_ptr + x,
+- elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder));
+- }
+- return x;
+-}
+-
+-template <ConditionalOperation op>
+-inline int elementwise_conditional_op_quantized_broadcast_loop(
+- int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr,
+- float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast,
+- float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+-{
+- int x = window_start_x;
+- for (; x <= (window_end_x - window_step_x); x += window_step_x)
+- {
+- const float32x4x4_t af =
+- load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+- const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af,
+- reorder ? af : broadcast_vector);
+- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+- }
+- return x;
+-}
+-
+-template <ConditionalOperation op, typename ScalarType, typename VectorType>
+-void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out,
+- const Window &window)
+-{
+- elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>,
+- &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>,
+- &elementwise_conditional_op_loop<op, ScalarType, VectorType>);
+-}
+-
+-template <ConditionalOperation op>
+-void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out,
+- const Window &window)
+-{
+- elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>,
+- &elementwise_conditional_op_quantized_broadcast_loop<op>,
+- &elementwise_conditional_op_quantized_loop<op>);
+-}
+-} // namespace
+-
+-NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+-
+-void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
+- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info()));
+-
+- // Configure kernel window
+- const std::pair<TensorShape, ValidRegion> broadcast_pair =
+- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+- const TensorShape &out_shape = broadcast_pair.first;
+- const ValidRegion &valid_region = broadcast_pair.second;
+-
+- // Auto initialize output if not initialized
+- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+-
+- Window win = calculate_max_window(valid_region);
+-
+- _input = input;
+- _alpha = alpha;
+- _output = output;
+- INEKernel::configure(win);
+-}
+-
+-void NEPReLUKernel::run(const Window &window, const ThreadInfo &info)
+-{
+- ARM_COMPUTE_UNUSED(info);
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+-
+- if (_input->info()->data_type() == DataType::F32)
+- {
+- elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha,
+- _output, window);
+- }
+- else if (_input->info()->data_type() == DataType::QASYMM8)
+- {
+- elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output,
+- window);
+- }
+- else
+- {
+- ARM_COMPUTE_ERROR("Wrong Type");
+- }
+-}
+-
+-Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
+- const ITensorInfo &output)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output);
+-
+- const TensorShape out_shape =
+- TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape());
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+- "Inputs are not broadcast compatible");
+-
+- // Checks performed when output is configured
+- if (output.total_size() > 0)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+- detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+- "Wrong shape for output");
+- }
+-
+- return Status{};
+-}
+-
+-Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha,
+- const ITensorInfo *output)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output);
+- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output));
+-
+- return Status{};
+-}
+diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+index 6ba0f1f..5841f1d 100644
+--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
++++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
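The hunk above only relabels the expected output data type (S8 becomes QASYMM8_SIGNED); the arithmetic remains symmetric quantization to signed 8-bit. A scalar reference of that scheme is sketched below, assuming a single per-tensor scale and a hypothetical helper name; it is not the NEON kernel itself.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Symmetric signed 8-bit quantization: the zero point is fixed at 0 and the
// scale is derived from the largest absolute value in the input.
std::vector<int8_t> quantize_symmetric(const std::vector<float> &in, float &scale_factor)
{
  float max_abs = 0.f;
  for (float v : in)
    max_abs = std::max(max_abs, std::fabs(v));
  scale_factor = (max_abs > 0.f) ? max_abs / 127.f : 1.f;

  std::vector<int8_t> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i)
  {
    const long q = std::lround(in[i] / scale_factor);
    out[i] = static_cast<int8_t>(std::min<long>(127, std::max<long>(-127, q)));
  }
  return out;
}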
+diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
+deleted file mode 100644
+index 44feb20..0000000
+--- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
++++ /dev/null
+@@ -1,181 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+-
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/ITensor.h"
+-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+-#include <arm_neon.h>
+-#include <cstdint>
+-
+-using namespace arm_compute::misc::shape_calculator;
+-
+-namespace arm_compute
+-{
+-namespace
+-{
+-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
+-
+- // Validate output if initialized
+- if (output->total_size() != 0)
+- {
+- const DataLayout data_layout = input->data_layout();
+- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+- const int idx_height =
+- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+- const int idx_channel =
+- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+- const int idx_batch =
+- get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
+- output->tensor_shape()[idx_batch]);
+- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
+- 0);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
+- output->tensor_shape().total_size());
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+- }
+-
+- return Status{};
+-}
+-} // namespace
+-
+-NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
+- : _input(nullptr), _output(nullptr), _block_shape()
+-{
+-}
+-
+-void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
+- int32_t block_shape)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+-
+- TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
+- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+-
+- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
+-
+- _input = input;
+- _block_shape = block_shape;
+- _output = output;
+-
+- // Configure kernel window
+- Window win = calculate_max_window(*output->info(), Steps());
+- INEKernel::configure(win);
+-}
+-
+-Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+- int32_t block_shape)
+-{
+- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
+- return Status{};
+-}
+-
+-void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+-{
+- ARM_COMPUTE_UNUSED(info);
+- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+-
+- const DataLayout data_layout = _input->info()->data_layout();
+- const int channel_idx =
+- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+- const int element_size = _input->info()->element_size();
+-
+- const size_t channel_size = _input->info()->dimension(channel_idx);
+-
+- Window slice_out = window.first_slice_window_3D();
+-
+- int batch_id = 0;
+-
+- // Main loop for NCHW and NHWC
+- if (_output->info()->data_layout() == DataLayout::NCHW)
+- {
+- do
+- {
+- Iterator out(_output, slice_out);
+- execute_window_loop(slice_out,
+- [&](const Coordinates &id) {
+- const size_t channel_id = id.z();
+- const size_t in_x =
+- id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+- const size_t in_y =
+- id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+- const int z = channel_id % channel_size;
+- Coordinates input_coords{in_x, in_y, z, batch_id};
+- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+- },
+- out);
+- ++batch_id;
+- } while (window.slide_window_slice_3D(slice_out));
+- }
+- else
+- {
+- do
+- {
+- Iterator out(_output, slice_out);
+- execute_window_loop(slice_out,
+- [&](const Coordinates &id) {
+- const size_t channel_id = id.x();
+- const size_t in_x =
+- id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+- const size_t in_y =
+- id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+- const int z = channel_id % channel_size;
+- Coordinates input_coords{z, in_x, in_y, batch_id};
+- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+- },
+- out);
+- ++batch_id;
+- } while (window.slide_window_slice_3D(slice_out));
+- }
+-}
+-} // namespace arm_compute
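For reference, the NCHW loop in the deleted kernel above maps each output element back to an input element using block-major channel packing. A single-batch scalar sketch of exactly that index arithmetic follows; the buffer layout and function name are assumptions for illustration only.

#include <cstddef>
#include <vector>

// out has shape [in_c * block * block, in_h / block, in_w / block] (NCHW, batch omitted).
void space_to_depth_nchw(const std::vector<float> &in, std::vector<float> &out,
                         std::size_t in_c, std::size_t in_h, std::size_t in_w,
                         std::size_t block)
{
  const std::size_t out_c = in_c * block * block;
  const std::size_t out_h = in_h / block;
  const std::size_t out_w = in_w / block;

  for (std::size_t c = 0; c < out_c; ++c)
    for (std::size_t y = 0; y < out_h; ++y)
      for (std::size_t x = 0; x < out_w; ++x)
      {
        const std::size_t in_x = x * block + (c / in_c) % block; // same as the kernel's in_x
        const std::size_t in_y = y * block + (c / in_c) / block; // same as the kernel's in_y
        const std::size_t in_ch = c % in_c;                      // channel within the input
        out[(c * out_h + y) * out_w + x] = in[(in_ch * in_h + in_y) * in_w + in_x];
      }
}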
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
+deleted file mode 100644
+index 2d379cf..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
++++ /dev/null
+@@ -1,144 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
+-
+-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+-#include "arm_compute/runtime/CL/CLScheduler.h"
+-
+-namespace arm_compute
+-{
+-
+-CLArgOperation::CLArgOperation()
+-{
+- // DO NOTHING
+-}
+-
+-void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+- ArgOperation op)
+-{
+- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+- _input = input;
+- _output = output;
+- _axis = axis;
+- _arg_op = op;
+- // NOTE The argminmax_axis must have no duplication.
+- _num_of_kernels = axis.size();
+- const size_t num_of_interm_tensors = _num_of_kernels - 1;
+-
+- _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+- _argop_kernels =
+- arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
+-
+- TensorShape shape{input->info()->tensor_shape()};
+- for (size_t i = 0; i < num_of_interm_tensors; i++)
+- {
+- shape.set(_axis[i], 1);
+- _interm_tensors[i].allocator()->init(
+- TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
+- .set_data_layout(input->info()->data_layout()));
+- _interm_tensors[i].allocator()->allocate();
+- }
+-
+- // Set a vector that is ordered ICLTensors sequentially.
+- std::vector<ICLTensor *> tensors;
+- tensors.emplace_back(input);
+- for (size_t i = 0; i < num_of_interm_tensors; i++)
+- {
+- tensors.emplace_back(_interm_tensors.get() + i);
+- }
+- tensors.emplace_back(output);
+-
+- // Apply ArgMinMax on all kernels
+- for (size_t i = 0; i < _num_of_kernels; i++)
+- {
+- _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
+- }
+-}
+-
+-Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+- const ITensorInfo *output, ArgOperation op)
+-{
+- const size_t num_of_kernels = axis.size();
+- const size_t num_of_interm_tensors = num_of_kernels - 1;
+-
+- // Create temporary tensor infos
+- auto interm_tensors =
+- arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+-
+- // Create intermediate tensor info
+- TensorShape shape{input->tensor_shape()};
+-
+- for (size_t i = 0; i < num_of_interm_tensors; i++)
+- {
+- shape.set(axis[i], 1);
+- interm_tensors[i].set_data_type(input->data_type());
+- interm_tensors[i].set_tensor_shape(shape);
+- interm_tensors[i].set_num_channels(input->num_channels());
+- }
+-
+- // Set a vector that is ordered ITensorInfo sequentially.
+- std::vector<const ITensorInfo *> tensors;
+- tensors.emplace_back(input);
+- for (size_t i = 0; i < num_of_interm_tensors; i++)
+- {
+- tensors.emplace_back(interm_tensors.get() + i);
+- }
+- tensors.emplace_back(output);
+-
+- // Validate argminmax only on all kernels
+- for (size_t i = 0; i < num_of_kernels; i++)
+- {
+- ARM_COMPUTE_RETURN_ON_ERROR(
+- CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
+- }
+-
+- return Status{};
+-}
+-
+-void CLArgOperation::run()
+-{
+- for (size_t i = 0; i < _num_of_kernels; ++i)
+- {
+- CLScheduler::get().enqueue(_argop_kernels[i]);
+- }
+-}
+-
+-} // namespace arm_compute
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+index 92ee69a..e5122ab 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+@@ -48,7 +48,7 @@ using namespace arm_compute;
+ void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
++ auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
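Each of these one-kernel functions follows the same shape: configure() builds the kernel with make_unique and hands ownership to the base class, which enqueues it on run(). A stripped-down standalone sketch of that wrapping pattern is given below; the interfaces are simplified stand-ins, not the ACL classes.

#include <memory>
#include <utility>

struct IKernelLike
{
  virtual ~IKernelLike() = default;
  virtual void run() = 0;
};

struct SimpleFunctionLike
{
  void run() { _kernel->run(); }

protected:
  std::unique_ptr<IKernelLike> _kernel; // owned kernel, executed on run()
};

struct OneKernelFunctionLike : SimpleFunctionLike
{
  template <typename KernelT, typename... Args> void configure(Args &&... args)
  {
    auto k = std::make_unique<KernelT>(); // C++14 make_unique, as in the hunk above
    k->configure(std::forward<Args>(args)...);
    _kernel = std::move(k);
  }
};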
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+deleted file mode 100644
+index b3118f3..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
++++ /dev/null
+@@ -1,52 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLCast.h"
+-
+-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+-
+-using namespace arm_compute;
+-
+-void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
+- k->configure(input, output, input_subtype);
+- _kernel = std::move(k);
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
+deleted file mode 100644
+index db66250..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
++++ /dev/null
+@@ -1,52 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
+-
+-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+-
+-using namespace arm_compute;
+-
+-void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
+- k->configure(input, output, block_size);
+- _kernel = std::move(k);
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
+new file mode 100644
+index 0000000..3dede05
+--- /dev/null
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
+@@ -0,0 +1,267 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++/*
++ * Copyright (c) 2019-2020 ARM Limited.
++ *
++ * SPDX-License-Identifier: MIT
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in all
++ * copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
++
++#include "arm_compute/core/Helpers.h"
++#include "arm_compute/core/UtilsEx.h"
++#include "arm_compute/core/Validate.h"
++#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
++#include "arm_compute/runtime/CL/CLScheduler.h"
++
++#include <memory>
++#include <tuple>
++
++namespace arm_compute
++{
++using namespace arm_compute::misc::shape_calculator;
++
++CLDirectTransposeConvLayer::CLDirectTransposeConvLayer(
++ std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
++ : _memory_group(std::move(memory_manager)),
++ _scale_f(),
++ _conv_f(),
++ _flip_weights(),
++ _scaled_output(),
++ _original_weights(nullptr),
++ _weights_flipped(),
++ _flip_axis(),
++ _is_prepared(false)
++{
++}
++
++Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
++ const ITensorInfo *bias, ITensorInfo *output,
++ const PadStrideInfo &info, unsigned int invalid_right,
++ unsigned int invalid_bottom,
++ const WeightsInfo &weights_info)
++{
++ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
++ input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
++ const DataLayout data_layout = input->data_layout();
++
++ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
++ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
++ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
++
++ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
++ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
++
++ auto out_dims = transposeconv_output_dimensions(
++ input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
++ weights->dimension(idx_h), info, invalid_right, invalid_bottom);
++
++ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
++
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
++
++ if (bias != nullptr)
++ {
++ if (is_data_type_quantized_asymmetric(input->data_type()))
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
++ }
++ else
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
++ }
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
++ }
++
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
++ "Output's width is invalid.");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
++ "Output's height is invalid.");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
++ "Output's depth is invalid.");
++
++ unsigned int pad_left = 0;
++ unsigned int pad_right = 0;
++ unsigned int pad_top = 0;
++ unsigned int pad_bottom = 0;
++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
++ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
++ pad_bottom);
++ TensorInfo scale_out_info(input->clone()
++ ->set_is_resizable(true)
++ .reset_padding()
++ .set_tensor_shape(scale_out_shape)
++ .set_data_layout(data_layout));
++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
++
++ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
++ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
++ conv_info, weights_info));
++
++ return Status{};
++}
++
++void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights,
++ const ICLTensor *bias, ICLTensor *output,
++ const PadStrideInfo &info, unsigned int invalid_right,
++ unsigned int invalid_bottom,
++ const WeightsInfo &weights_info)
++{
++ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info,
++ invalid_right, invalid_bottom, weights_info);
++}
++
++void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context,
++ ICLTensor *input, ICLTensor *weights,
++ const ICLTensor *bias, ICLTensor *output,
++ const PadStrideInfo &info, unsigned int invalid_right,
++ unsigned int invalid_bottom,
++ const WeightsInfo &weights_info)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
++
++ unsigned int pad_left = 0;
++ unsigned int pad_right = 0;
++ unsigned int pad_top = 0;
++ unsigned int pad_bottom = 0;
++ const unsigned int stride_x = info.stride().first;
++ const unsigned int stride_y = info.stride().second;
++
++ const DataLayout data_layout = input->info()->data_layout();
++
++ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
++ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
++
++ _original_weights = weights;
++ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
++ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
++ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
++
++ auto out_dims = transposeconv_output_dimensions(
++ input->info()->dimension(idx_w), input->info()->dimension(idx_h),
++ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
++ invalid_bottom);
++
++ const TensorShape output_shape =
++ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
++
++ // Output auto initialization if not yet initialized
++ auto_init_if_empty(
++ *output->info(),
++ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
++
++ // Perform validation step
++ ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
++ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
++ info, invalid_right, invalid_bottom));
++
++ _is_prepared = weights_info.retain_internal_weights();
++
++ _memory_group.manage(&_scaled_output);
++
++ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
++ // to match output shape
++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
++ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
++ pad_right, pad_top, pad_bottom);
++
++ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
++ input->info()->quantization_info());
++ scale_out_info.set_data_layout(data_layout);
++ _scaled_output.allocator()->init(scale_out_info);
++
++ // configure scale function
++ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
++ DimensionRoundingType::FLOOR);
++ _scale_f.configure(input, &_scaled_output, upsample_info);
++
++ // Setup the function to convolve the upscaled output
++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
++ _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info,
++ weights_info);
++ _scaled_output.allocator()->allocate();
++
++ // Setup flip axis data
++ _flip_axis.allocator()->allocate();
++ _flip_axis.map(true);
++ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
++ if (weights->info()->data_layout() == DataLayout::NHWC)
++ {
++ axis_data[0] = 1;
++ axis_data[1] = 2;
++ }
++ else
++ {
++ axis_data[0] = 0;
++ axis_data[1] = 1;
++ }
++ _flip_axis.unmap();
++}
++
++void CLDirectTransposeConvLayer::run()
++{
++ prepare();
++
++ MemoryGroupResourceScope scope_mg(_memory_group);
++
++ _scale_f.run();
++ _conv_f.run();
++}
++
++void CLDirectTransposeConvLayer::prepare()
++{
++ if (!_is_prepared)
++ {
++ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
++
++ // Run weights flipping and mark original weights tensor as unused
++ _weights_flipped.allocator()->allocate();
++ _flip_weights.run();
++ _original_weights->mark_as_unused();
++
++ // Prepare convolution
++ _conv_f.prepare();
++
++ // Free flipped weights
++ if (!_weights_flipped.is_used())
++ {
++ _weights_flipped.allocator()->free();
++ }
++
++ _is_prepared = true;
++ }
++}
++} // namespace arm_compute
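The new function above defers the expensive weight flip to prepare(), guarded by _is_prepared, so the flip runs once and the original weights can then be released. A minimal standalone sketch of that lifecycle follows; the class name and messages are hypothetical.

#include <iostream>

class LazyPreparedFunction
{
public:
  void run()
  {
    prepare(); // idempotent: the one-time work happens only on the first call
    std::cout << "run: upsample input, then convolve with flipped weights\n";
  }

  void prepare()
  {
    if (_is_prepared)
      return;
    std::cout << "prepare: flip weights once, mark originals unused\n";
    _is_prepared = true;
  }

private:
  bool _is_prepared = false;
};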
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+index 3d9a28a..ae9d8af 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+@@ -47,7 +47,7 @@ using namespace arm_compute;
+ void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
++ auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+index f098832..0198946 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+@@ -45,7 +45,7 @@
+ #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+ #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+ #include "arm_compute/runtime/CL/CLScheduler.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ #include <algorithm>
+
+@@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
+ ARM_COMPUTE_UNUSED(weights);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+- CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
++ CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
+
+ return Status{};
+ }
+@@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
+
+ void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
++ auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+ }
+@@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen
+
+ // Quantize input
+ _quantized_input.allocator()->init(
+- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
++ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
++ DataType::QASYMM8_SIGNED));
+ _memory_group.manage(&_quantized_input);
+ _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
+
+@@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+@@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
+ ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
+
+ // Validate quantization symm8 kernel
+- const ITensorInfo &quantized_input = TensorInfo(
+- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
++ const ITensorInfo &quantized_input =
++ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
++ DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
+
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+index 63e291b..2ff4b96 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+@@ -46,7 +46,7 @@
+ #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+ #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+ #include "arm_compute/runtime/CL/CLScheduler.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ #include <algorithm>
+
+@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
+
+ void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
++ auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+index 9aebc47..157b4d9 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+@@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+- else
++ else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS)
+ {
+- assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+-
+ bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
+ input->info()->data_type() == DataType::F16) &&
+- weights->info()->data_type() == DataType::S8;
++ (weights->info()->data_type() == DataType::S8 ||
++ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
+
+ if (is_hybrid)
+ {
+ auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
++ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
++ const auto orgin_weights_data_type = weights_info->data_type();
++ weights_info->set_data_type(DataType::QASYMM8_SIGNED);
+ fc->configure(input_to_use, _weights, _biases, _output);
++ weights_info->set_data_type(orgin_weights_data_type);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+@@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ }
++ else
++ {
++ throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
++ }
++
+ }();
+
+ if (_needs_reshape)
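The hybrid branch above temporarily relabels the weights tensor as QASYMM8_SIGNED for configure() and restores the original type afterwards. A generic standalone sketch of that save/override/restore step follows; the types and helper are assumptions for illustration, not ACL API.

#include <utility>

enum class DataTypeLike
{
  S8,
  QASYMM8_SIGNED
};

struct TensorInfoLike
{
  DataTypeLike data_type;
};

// Run `configure` while `info` temporarily reports `temporary`, then restore.
template <typename ConfigureFn>
void with_data_type(TensorInfoLike &info, DataTypeLike temporary, ConfigureFn &&configure)
{
  const DataTypeLike original = info.data_type;
  info.data_type = temporary;             // hybrid kernels expect QASYMM8_SIGNED weights
  std::forward<ConfigureFn>(configure)(); // configuration sees the overridden type
  info.data_type = original;              // later passes see the original type again
}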
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
+deleted file mode 100644
+index ca5499d..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
++++ /dev/null
+@@ -1,180 +0,0 @@
+-/*
+- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
+-
+-#include "arm_compute/core/CL/ICLTensor.h"
+-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+-#include "arm_compute/runtime/CL/CLScheduler.h"
+-#include "arm_compute/runtime/MemoryGroup.h"
+-
+-namespace arm_compute
+-{
+-using namespace arm_compute::misc::shape_calculator;
+-using namespace arm_compute::cl_gemm;
+-
+-namespace
+-{
+-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+-{
+- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
+-}
+-} // namespace
+-
+-CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx(
+- std::shared_ptr<IMemoryManager> memory_manager)
+- : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(),
+- _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0),
+- _reshape_b_only_on_first_run(false), _is_prepared(false)
+-{
+-}
+-
+-void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b,
+- const ICLTensor *c, ICLTensor *output,
+- const GEMMInfo &gemm_info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+- ARM_COMPUTE_UNUSED(c);
+- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate(
+- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+-
+- _is_prepared = false;
+- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+- _a_offset = a->info()->quantization_info().uniform().offset;
+- _b_offset = b->info()->quantization_info().uniform().offset;
+-
+- // Get the GPU target
+- const GPUTarget gpu_target = CLScheduler::get().target();
+-
+- // Set the target for the kernels
+- _mm_midgard_kernel.set_target(gpu_target);
+-
+- // GEMMRHSMatrixInfo rhs_info;
+- // GEMMLHSMatrixInfo lhs_info;
+-
+- // Arguments used by GEMMReshapeInfo
+- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m,
+- // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+- // in order to know how the matrices have been reshaped
+- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+- const unsigned int m = reinterpret_input_as_3d
+- ? (a->info()->dimension(1) * a->info()->dimension(2))
+- : a->info()->dimension(1);
+- const unsigned int n = b->info()->dimension(0);
+- const unsigned int k = a->info()->dimension(0);
+- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+-
+- const ICLTensor *matrix_b = b;
+- // Configure matrix multiply kernel
+- _mm_midgard_kernel.configure(
+- a, matrix_b, output,
+- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+-}
+-
+-Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+- const ITensorInfo *c, const ITensorInfo *output,
+- const GEMMInfo &gemm_info)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+- ARM_COMPUTE_UNUSED(c);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+- "Matrix A already reshaped is not supported");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+- "Matrix B already reshaped is not supported");
+-
+- const ITensorInfo *matrix_a_info = a;
+-
+- // Get the GPU target
+- const GPUTarget gpu_target = CLScheduler::get().target();
+-
+- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+- const unsigned int m =
+- reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+- const unsigned int n = b->dimension(0);
+- const unsigned int k = a->dimension(0);
+- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+-
+- bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target);
+-
+- const GEMMReshapeInfo reshape_info =
+- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+-
+- TensorInfo weights_info(*b);
+- const ITensorInfo *matrix_b_info = &weights_info;
+- if (reshape_matrix_b)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(false,
+- "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b");
+- }
+-
+- // Validate matrix multiply
+- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate(
+- matrix_a_info, matrix_b_info, output, reshape_info));
+-
+- return Status{};
+-}
+-
+-void CLGEMMLowpMatrixMultiplyCoreEx::run()
+-{
+- prepare();
+-
+- MemoryGroupResourceScope scope_mg(_memory_group);
+-
+- // Run matrix multiply
+- CLScheduler::get().enqueue(_mm_midgard_kernel, false);
+-}
+-
+-void CLGEMMLowpMatrixMultiplyCoreEx::prepare()
+-{
+- if (!_is_prepared)
+- {
+- _is_prepared = true;
+- }
+-}
+-} // namespace arm_compute
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+index f594d7a..e0b833b 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+@@ -48,7 +48,7 @@ using namespace arm_compute;
+ void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
+ int axis)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>();
++ auto k = support::cpp14::make_unique<CLGatherExKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+index 27ed8e8..65b89a3 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+@@ -47,7 +47,7 @@ using namespace arm_compute;
+ void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
++ auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+index 80393e8..5a7e408 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
+ void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ ICLTensor *gamma, ICLTensor *beta, float epsilon)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
++ auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ k->configure(input, output, gamma, beta, epsilon);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
+deleted file mode 100644
+index fbb15ab..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
++++ /dev/null
+@@ -1,63 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLPReLU.h"
+-
+-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-
+-using namespace arm_compute;
+-
+-void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
+- k->configure(input, alpha, output);
+- _kernel = std::move(k);
+-
+- if (output->info()->dimension(0) > 1)
+- {
+- ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
+-
+- if (broadcasted_info->info()->dimension(0) == 1)
+- {
+- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+- }
+- }
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
+deleted file mode 100644
+index 6049b7e..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
++++ /dev/null
+@@ -1,163 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
+-
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Utils.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+-#include "arm_compute/runtime/CL/CLScheduler.h"
+-#include "support/ToolchainSupport.h"
+-
+-#include <utility>
+-
+-using namespace arm_compute;
+-using namespace arm_compute::misc::shape_calculator;
+-
+-CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+- _gemm_output(), _add_output(), _is_prepared(false)
+-{
+-}
+-
+-Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+- const ITensorInfo *hidden_state, const ITensorInfo *output,
+- const ActivationLayerInfo &info)
+-{
+- const int idx_width = 0;
+- const int idx_height = 1;
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+- output);
+- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+- recurrent_weights->dimension(idx_width));
+- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+- recurrent_weights->dimension(1));
+- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+- hidden_state->tensor_shape());
+-
+- auto shape_info =
+- TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+- input->data_type());
+-
+- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+- ARM_COMPUTE_RETURN_ON_ERROR(
+- CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
+- ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+-
+- return Status{};
+-}
+-
+-void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+- const ICLTensor *recurrent_weights, const ICLTensor *bias,
+- ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
+- recurrent_weights->info(), bias->info(),
+- hidden_state->info(), output->info(), info));
+-
+- const int idx_height = 1;
+- TensorShape shape =
+- compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+-
+- _is_prepared = false;
+-
+- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+-
+- // Manage intermediate buffers and configure
+- _memory_group.manage(&_fully_connected_out);
+- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+-
+- _memory_group.manage(&_gemm_output);
+- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+-
+- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+- _memory_group.manage(&_add_output);
+-
+- _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
+- &_add_output, ConvertPolicy::SATURATE);
+-
+- _fully_connected_out.allocator()->allocate();
+- _gemm_output.allocator()->allocate();
+-
+- _activation_kernel.configure(&_add_output, hidden_state, info);
+- _add_output.allocator()->allocate();
+-
+- _copy_kernel.configure(hidden_state, output);
+-}
+-
+-void CLRNNLayerEx::run()
+-{
+- prepare();
+-
+- _memory_group.acquire();
+-
+- _fully_connected_kernel.run();
+- _gemm_state_f.run();
+- CLScheduler::get().enqueue(_add_kernel);
+- CLScheduler::get().enqueue(_activation_kernel);
+-
+- // copy hidden out to output
+- CLScheduler::get().enqueue(_copy_kernel);
+-
+- _memory_group.release();
+-}
+-
+-void CLRNNLayerEx::prepare()
+-{
+- if (!_is_prepared)
+- {
+- _fully_connected_kernel.prepare();
+- _gemm_state_f.prepare();
+-
+- _is_prepared = true;
+- }
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+index 8ce2d74..a41e6db 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+@@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+ // Create temporary tensor infos
+- auto interm_tensors =
+- arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
++ auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+@@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+- _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+- _reduce_kernels =
+- arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
++ _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
++ _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+ // Set a vector that is ordered ICLTensors sequentially.
+ std::vector<ICLTensor *> tensors;
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
+deleted file mode 100644
+index 7d7b226..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
++++ /dev/null
+@@ -1,52 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2016-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+-
+-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+-
+-using namespace arm_compute;
+-
+-void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+- k->configure(input, output, block_size);
+- _kernel = std::move(k);
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+index e61746e..3215d01 100644
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+@@ -15,7 +15,7 @@
+ */
+
+ /*
+- * Copyright (c) 2017-2018 ARM Limited.
++ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+@@ -37,218 +37,124 @@
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+-
+ #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+-#include "arm_compute/core/Helpers.h"
+ #include "arm_compute/core/Utils.h"
+-#include "arm_compute/core/UtilsEx.h"
+ #include "arm_compute/core/Validate.h"
+ #include "arm_compute/core/utils/misc/ShapeCalculator.h"
++#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+ #include "arm_compute/runtime/CL/CLScheduler.h"
+-#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
++#include <cmath>
+ #include <memory>
+ #include <tuple>
+
+ using namespace arm_compute;
+ using namespace arm_compute::misc::shape_calculator;
+
+-CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+- : _memory_group(std::move(memory_manager)),
+- _scale_f(),
+- _conv_f(),
+- _flip_weights(),
+- _scaled_output(),
+- _original_weights(nullptr),
+- _weights_flipped(),
+- _is_prepared(false)
++CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager)
++ : _memory_manager(std::move(memory_manager)), _function()
++{
++}
++
++void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
++ ICLTensor *output, const PadStrideInfo &deconv_info,
++ unsigned int invalid_right, unsigned int invalid_bottom,
++ const WeightsInfo &weights_info)
+ {
++ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info,
++ invalid_right, invalid_bottom, weights_info);
++}
++
++void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
++ ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
++ const PadStrideInfo &deconv_info, unsigned int invalid_right,
++ unsigned int invalid_bottom, const WeightsInfo &weights_info)
++{
++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
++
++ switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr,
++ output->info(), deconv_info, invalid_right,
++ invalid_bottom, weights_info))
++ {
++ case DeconvolutionMethod::DIRECT:
++ {
++ auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
++ f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
++ invalid_bottom, weights_info);
++ _function = std::move(f);
++ break;
++ }
++ case DeconvolutionMethod::GEMM:
++ {
++ auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
++ f->configure(compile_context, input, weights, bias, output, deconv_info);
++ _function = std::move(f);
++ break;
++ }
++ default:
++ ARM_COMPUTE_ERROR("Not supported.");
++ break;
++ }
+ }
+
+ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+- const PadStrideInfo &info, unsigned int invalid_right,
++ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+- DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+-
+- const DataLayout data_layout = input->data_layout();
+-
+- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+-
+- const unsigned int kernel_x = weights->dimension(idx_w);
+- const unsigned int kernel_y = weights->dimension(idx_h);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
+- "invalid_right must be smaller than kernel_x");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
+- "inner_border_top must be smaller than kernel_y");
+-
+- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added.
+- auto out_dims = transposeconv_output_dimensions(
+- input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+- weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+-
+- const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+-
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+-
+- if (bias != nullptr)
++ switch (CLTransposeConvLayer::get_deconvolution_method(
++ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
+ {
+- if (is_data_type_quantized_asymmetric(input->data_type()))
++ case DeconvolutionMethod::DIRECT:
+ {
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
++ // Validate direct convolution layer
++ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate(
++ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
++ break;
+ }
+- else
++ case DeconvolutionMethod::GEMM:
+ {
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
++ // Validate gemm-based convolution layer
++ ARM_COMPUTE_RETURN_ON_ERROR(
++ CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
++ break;
+ }
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
++ default:
++ ARM_COMPUTE_ERROR("Not supported.");
++ break;
+ }
+
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+- "Output's width is invalid.");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+- "Output's height is invalid.");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+- "Output's depth is invalid.");
+-
+- unsigned int pad_left = 0;
+- unsigned int pad_right = 0;
+- unsigned int pad_top = 0;
+- unsigned int pad_bottom = 0;
+- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+- *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+- pad_bottom);
+- TensorInfo scale_out_info(input->clone()
+- ->set_is_resizable(true)
+- .reset_padding()
+- .set_tensor_shape(scale_out_shape)
+- .set_data_layout(data_layout));
+- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+-
+- ARM_COMPUTE_RETURN_ON_ERROR(
+- CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
+- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+- conv_info, weights_info));
+-
+ return Status{};
+ }
+
+-void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+- ICLTensor *output, const PadStrideInfo &info,
+- unsigned int invalid_right, unsigned int invalid_bottom,
+- const WeightsInfo &weights_info)
++DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method(
++ const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
++ ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
++ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+ {
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+-
+- const unsigned int stride_x = info.stride().first;
+- const unsigned int stride_y = info.stride().second;
++ ARM_COMPUTE_UNUSED(output, bias, weights_info);
+
+- const DataLayout data_layout = input->info()->data_layout();
++ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+- _original_weights = weights;
+- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+- _flip_weights.configure(weights, &_weights_flipped);
+-
+- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were
+- // added.
+- auto out_dims = transposeconv_output_dimensions(
+- input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+- weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+- invalid_bottom);
+-
+- const TensorShape output_shape =
+- compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+-
+- // Output auto initialization if not yet initialized
+- auto_init_if_empty(
+- *output->info(),
+- input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+-
+- // Perform validation step
+- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
+- input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+- info, invalid_right, invalid_bottom));
+-
+- _is_prepared = weights_info.retain_internal_weights();
+-
+- _memory_group.manage(&_scaled_output);
+-
+- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+- // to match output shape
+- unsigned int pad_left = 0;
+- unsigned int pad_right = 0;
+- unsigned int pad_top = 0;
+- unsigned int pad_bottom = 0;
+- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+- pad_right, pad_top, pad_bottom);
+-
+- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+- input->info()->quantization_info());
+- scale_out_info.set_data_layout(data_layout);
+- _scaled_output.allocator()->init(scale_out_info);
+-
+- // configure scale function
+- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+- DimensionRoundingType::FLOOR);
+- _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
+-
+- // setup the function to convolve the upscaled output
+- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+- _scaled_output.allocator()->allocate();
++ if (weights->dimension(idx_w) != deconv_info.stride().first ||
++ weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 ||
++ invalid_bottom != 0)
++ {
++ return DeconvolutionMethod::DIRECT;
++ }
++
++ return DeconvolutionMethod::GEMM;
+ }
+
+ void CLTransposeConvLayer::run()
+ {
+ prepare();
+-
+- _memory_group.acquire();
+-
+- _scale_f.run();
+- _conv_f.run();
+-
+- _memory_group.release();
++ _function->run();
+ }
+
+-void CLTransposeConvLayer::prepare()
+-{
+- if (!_is_prepared)
+- {
+- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+-
+- // Run weights flipping and mark original weights tensor as unused
+- _weights_flipped.allocator()->allocate();
+- _weights_flipped.map(true);
+- _original_weights->map(CLScheduler::get().queue(), true);
+- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
+- _weights_flipped.unmap();
+- _original_weights->unmap(CLScheduler::get().queue());
+- _original_weights->mark_as_unused();
+-
+- // Prepare convolution
+- _conv_f.prepare();
+-
+- if (!_weights_flipped.is_used())
+- {
+- _weights_flipped.allocator()->free();
+- }
+-
+- _is_prepared = true;
+- }
+-}
++void CLTransposeConvLayer::prepare() { _function->prepare(); }
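The rewritten CLTransposeConvLayer above no longer upsamples and convolves by hand; configure() now picks a backend through get_deconvolution_method() and stores it in _function, so run() and prepare() simply forward to the chosen implementation. The selection rule visible in the hunk can be restated compactly as follows (a sketch using the same ARM Compute types and the rule exactly as shown; it is not part of the patch):

    // Sketch of the dispatch rule shown in the hunk above: fall back to the DIRECT
    // transpose convolution whenever the kernel size differs from the stride or an
    // invalid border is requested; otherwise use the GEMM-based deconvolution.
    DeconvolutionMethod choose_method(const ITensorInfo *weights, const PadStrideInfo &deconv_info,
                                      unsigned int invalid_right, unsigned int invalid_bottom,
                                      size_t idx_w, size_t idx_h)
    {
      const bool kernel_equals_stride = weights->dimension(idx_w) == deconv_info.stride().first &&
                                        weights->dimension(idx_h) == deconv_info.stride().second;
      if (!kernel_equals_stride || invalid_right != 0 || invalid_bottom != 0)
      {
        return DeconvolutionMethod::DIRECT;
      }
      return DeconvolutionMethod::GEMM;
    }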
+diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
+deleted file mode 100644
+index 07feb5a..0000000
+--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
++++ /dev/null
+@@ -1,92 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2018 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+-
+-#include "arm_compute/core/CL/OpenCL.h"
+-#include "arm_compute/core/Utils.h"
+-#include "arm_compute/runtime/CL/CLScheduler.h"
+-#include "arm_compute/core/CL/ICLTensor.h"
+-
+-#include <cmath>
+-#include <memory>
+-#include <tuple>
+-
+-using namespace arm_compute;
+-
+-CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
+- : _upsample(),
+- _output(nullptr)
+-{
+-}
+-
+-Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
+- const BorderSize &inner_border,
+- const PadStrideInfo &info)
+-{
+- return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
+-}
+-
+-void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
+- const BorderSize &inner_border,
+- const PadStrideInfo &info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+-
+- _output = output;
+- _upsample.configure(input, _output, inner_border, info);
+-}
+-
+-void CLTransposeConvLayerUpsample::run()
+-{
+- _output->map(CLScheduler::get().queue(), true);
+- if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
+- {
+- const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset;
+- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+- }
+- else
+- {
+- memset(_output->buffer(), 0, _output->info()->total_size());
+- }
+- _output->unmap(CLScheduler::get().queue());
+-
+- CLScheduler::get().enqueue(_upsample, false);
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
+index 114e1a7..768c15b 100644
+--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
++++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
+@@ -41,14 +41,14 @@
+ #include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h"
+
+ #include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ using namespace arm_compute;
+
+ void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+ const ITensor *off_value, ITensor *output, const int axis)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<CPPOneHotKernelEx>();
++ auto k = support::cpp14::make_unique<CPPOneHotKernelEx>();
+ k->configure(indices, depth, on_value, off_value, output, axis);
+ _kernel = std::move(k);
+ }
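The hunk above shows the cleanup applied repeatedly throughout this patch: the memory helper now comes from "support/MemorySupport.h" instead of "support/ToolchainSupport.h", and the explicit arm_compute:: qualifier on make_unique is dropped because these translation units already have `using namespace arm_compute;` in scope. A minimal sketch of the resulting configure pattern (FooFunction and FooKernel are placeholder names, not types from this patch):

    // Sketch only: the simplified kernel-creation pattern used across this patch.
    // Assumes `using namespace arm_compute;` is in effect, so support::cpp14::make_unique
    // resolves to arm_compute::support::cpp14::make_unique from support/MemorySupport.h.
    #include "support/MemorySupport.h"

    void FooFunction::configure(const ITensor *input, ITensor *output)
    {
      auto k = support::cpp14::make_unique<FooKernel>(); // was arm_compute::support::cpp14::make_unique
      k->configure(input, output);
      _kernel = std::move(k); // the base simple-function class keeps ownership of the kernel
    }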
+diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
+deleted file mode 100644
+index 6c90ef3..0000000
+--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
++++ /dev/null
+@@ -1,53 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+-
+-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
+-#include "support/ToolchainSupport.h"
+-
+-using namespace arm_compute;
+-
+-void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>();
+- k->configure(input, output, info);
+- _kernel = std::move(k);
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
+index ff81ff8..2752eb6 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
+@@ -42,7 +42,7 @@
+
+ #include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
+ #include "arm_compute/runtime/IRuntimeContext.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ namespace arm_compute
+ {
+@@ -53,7 +53,7 @@ NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT
+ void NEActivationLayerEx::configure(ITensor *input, ITensor *output,
+ ActivationLayerInfo activation_info)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernelEx>();
++ auto k = support::cpp14::make_unique<NEActivationLayerKernelEx>();
+ k->configure(input, output, activation_info);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+index e42c453..2fc94b2 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+@@ -42,7 +42,7 @@
+ #include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
+
+ #include "arm_compute/core/ITensor.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ #include <utility>
+
+@@ -53,7 +53,7 @@ template <BinaryLogicalOperation COP>
+ void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
+ ITensor *output)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
++ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(COP, input1, input2, output);
+ _kernel = std::move(k);
+ }
+@@ -69,7 +69,7 @@ Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
+ void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
+ BinaryLogicalOperation op)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
++ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(op, input1, input2, output);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
+deleted file mode 100644
+index dc5c620..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
++++ /dev/null
+@@ -1,60 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NECast.h"
+-
+-#include "arm_compute/core/NEON/kernels/NECastKernel.h"
+-#include "support/ToolchainSupport.h"
+-
+-namespace arm_compute
+-{
+-void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<NECastKernel>();
+- k->configure(input, output, input_subtype);
+- _kernel = std::move(k);
+-}
+-
+-Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output,
+- SubDataType input_subtype)
+-{
+- return NECastKernel::validate(input, output, input_subtype);
+-}
+-} // namespace arm_compute
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
+deleted file mode 100644
+index 5ec0b86..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
++++ /dev/null
+@@ -1,63 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
+-
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-
+-namespace arm_compute
+-{
+-void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>();
+- k->configure(input, output, block_shape);
+- _kernel = std::move(k);
+-}
+-
+-Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+- int32_t block_shape)
+-{
+- return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape);
+-}
+-} // namespace arm_compute
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+index 53fb150..e0ab3e0 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+@@ -41,13 +41,13 @@
+ #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
+
+ #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ using namespace arm_compute;
+
+ void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>();
++ auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+index f457732..a123439 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+@@ -58,7 +58,7 @@ namespace
+ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+- NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
++ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
+
+ return Status{};
+ }
+@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
+
+ void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
++ auto k = support::cpp14::make_unique<NETransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+ }
+@@ -158,7 +158,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
+
+ // Quantize input
+ _quantized_input.allocator()->init(
+- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
++ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
++ DataType::QASYMM8_SIGNED));
+ _scale_factor.allocator()->init(
+ TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
+ _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
+@@ -186,7 +187,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+@@ -224,8 +225,9 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+
+ // Validate quantization kernel
+- const ITensorInfo &quantized_input = TensorInfo(
+- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
++ const ITensorInfo &quantized_input =
++ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
++ DataType::QASYMM8_SIGNED));
+ const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
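In the hybrid fully-connected path above, the internally quantized input and the accepted weight type move from plain S8 to QASYMM8_SIGNED, and the matrix multiply is now validated against the stock NEGEMMLowpMatrixMultiplyCore rather than the removed Ex variant. The quantized-input TensorInfo is derived exactly as in the hunk; spelled out as a small helper (a sketch for illustration, not code from the patch):

    // Sketch: building the TensorInfo for the internally quantized input, mirroring the
    // expression in the hunk above (resizable, no padding, QASYMM8_SIGNED element type).
    TensorInfo make_quantized_input_info(const ITensorInfo *input)
    {
      return TensorInfo(input->clone()
                            ->set_is_resizable(true)
                            .reset_padding()
                            .set_data_type(DataType::QASYMM8_SIGNED));
    }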
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+index fcac3c7..dc6c784 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+@@ -56,12 +56,17 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input
+ assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+ bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+- weights->info()->data_type() == DataType::S8;
++ (weights->info()->data_type() == DataType::S8 ||
++ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
+
+ if (is_hybrid)
+ {
+ auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
++ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
++    const auto origin_weights_data_type = weights_info->data_type();
++ weights_info->set_data_type(DataType::QASYMM8_SIGNED);
+ fc->configure(input_to_use, _weights, _biases, _output);
++    weights_info->set_data_type(origin_weights_data_type);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
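Because NEFullyConnectedHybridLayer now validates its weights as QASYMM8_SIGNED (see the previous hunk), a caller that still holds S8 weights relabels the weight tensor's info for the duration of configure() and restores it immediately afterwards. The save/override/restore sequence from the hunk, shown in isolation (a fragment for illustration only; weights_info is the mutable ITensorInfo of the weight tensor):

    // Sketch of the temporary data-type override used above.
    const auto original_weights_data_type = weights_info->data_type();   // remember the real type (e.g. S8)
    weights_info->set_data_type(DataType::QASYMM8_SIGNED);               // satisfy the hybrid FC's type check
    fc->configure(input_to_use, _weights, _biases, _output);             // configure while relabelled
    weights_info->set_data_type(original_weights_data_type);             // restore the original type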
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
+deleted file mode 100644
+index 1290cfd..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
++++ /dev/null
+@@ -1,513 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+-
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/ITensor.h"
+-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+-#include "arm_compute/runtime/NEON/NEScheduler.h"
+-#include "arm_compute/runtime/TensorAllocator.h"
+-#include "support/ToolchainSupport.h"
+-
+-using namespace arm_compute;
+-using namespace arm_compute::misc::shape_calculator;
+-
+-NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
+- std::shared_ptr<IMemoryManager> memory_manager)
+- : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
+- _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+- _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+- _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+- _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
+- _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+- _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
+- _fuse_output_stage(false), _flip_signedness(false)
+-{
+-}
+-
+-void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
+- ITensor *output, const GEMMInfo &gemm_info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+- ARM_COMPUTE_UNUSED(c);
+- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
+- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+-
+- const ITensor *matrix_a = a;
+- const ITensor *matrix_b = b;
+- GEMMInfo info = gemm_info;
+-
+- // Clear state
+- _mtx_a_reshape_kernel = nullptr;
+- _mtx_b_reshape_kernel = nullptr;
+-
+- // Set internal variables
+- _a_offset = a->info()->quantization_info().uniform().offset;
+- _b_offset = b->info()->quantization_info().uniform().offset;
+- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+- _is_prepared = false;
+- _fused_assembly_path = false;
+- _original_b = b;
+-
+- const ITensor *a_to_use = a;
+-
+- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+- if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+- {
+- _fuse_output_stage = true;
+- _memory_group.manage(&_mm_result_s32);
+- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+- _mm_result_s32.allocator()->init(info_mm_result_s32);
+- }
+-
+-#ifdef __aarch64__
+- switch (a->info()->data_type())
+- {
+- case DataType::QASYMM8:
+- case DataType::QASYMM8_SIGNED:
+- case DataType::U8:
+- case DataType::S8:
+- {
+- if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
+- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+- {
+- _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+- _fused_assembly_path = _asm_glue.is_configured();
+- }
+- else
+- {
+- _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
+- gemm_info);
+- }
+- _assembly_path = _asm_glue.is_configured();
+- break;
+- }
+- default:
+- {
+- ARM_COMPUTE_ERROR("Datatype not supported");
+- break;
+- }
+- }
+-#endif /* __aarch64__ */
+- if (!(_assembly_path || _run_vector_matrix_multiplication))
+- {
+- matrix_a = &_tmp_a;
+- matrix_b = &_tmp_b;
+-
+- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+- // 4.0f) ]
+- TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
+- a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
+- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
+- // 16.0f) ]
+- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
+- b->info()->quantization_info());
+- _tmp_a.allocator()->init(a_info);
+- _tmp_b.allocator()->init(b_info);
+- _memory_group.manage(&_tmp_a);
+- if (!_reshape_b_only_on_first_run)
+- {
+- _memory_group.manage(&_tmp_b);
+- }
+-
+- // Configure interleave kernel
+- {
+- auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+- k->configure(a_to_use, &_tmp_a);
+- _mtx_a_reshape_kernel = std::move(k);
+- }
+-
+- // Configure transpose kernel
+- {
+- auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+- k->configure(b, &_tmp_b);
+- _mtx_b_reshape_kernel = std::move(k);
+- }
+- }
+-
+- if (!_fused_assembly_path)
+- {
+- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+- if (_a_offset != 0)
+- {
+- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
+-
+- _vector_sum_col.allocator()->init(info_vector_sum_col);
+- if (!_reshape_b_only_on_first_run)
+- {
+- _memory_group.manage(&_vector_sum_col);
+- }
+-
+- // Configure Matrix B reduction kernel
+- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
+- }
+-
+- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+- if (_b_offset != 0)
+- {
+- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
+-
+- _vector_sum_row.allocator()->init(info_vector_sum_row);
+- _memory_group.manage(&_vector_sum_row);
+-
+- // Configure matrix A reduction kernel
+- _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
+- false);
+- }
+-
+- if (_fuse_output_stage)
+- {
+- // Configure matrix multiply kernel
+- if (!_assembly_path)
+- {
+- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+- k->configure(matrix_a, matrix_b, &_mm_result_s32);
+- _mm_kernel = std::move(k);
+- }
+-
+- _offset_contribution_output_stage_kernel.configure(
+- &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+- _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
+- _b_offset, info.gemmlowp_output_stage());
+- }
+- else
+- {
+- // Configure matrix multiply kernel
+- if (!_assembly_path)
+- {
+- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+- k->configure(matrix_a, matrix_b, output);
+- _mm_kernel = std::move(k);
+- }
+- // Configure offset contribution kernel
+- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+- _b_offset == 0 ? nullptr : &_vector_sum_row,
+- a_to_use->info()->dimension(0), _a_offset, _b_offset);
+- }
+- }
+-
+- // Allocate tensors
+- if (!_assembly_path && !_run_vector_matrix_multiplication)
+- {
+- _tmp_a.allocator()->allocate();
+- if (!_reshape_b_only_on_first_run)
+- {
+- _tmp_b.allocator()->allocate();
+- }
+- }
+-
+- if (!_fused_assembly_path)
+- {
+- if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+- {
+- _vector_sum_col.allocator()->allocate();
+- }
+-
+- if (_b_offset != 0)
+- {
+- _vector_sum_row.allocator()->allocate();
+- }
+- }
+-
+- if (_fuse_output_stage)
+- {
+- _mm_result_s32.allocator()->allocate();
+- }
+-}
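The reduction and offset-contribution kernels configured above exist because of the standard gemmlowp expansion of a zero-point-corrected product. With a_off and b_off the quantization offsets of A and B and K the inner (accumulation) dimension, the identity they rely on is, in essence:

    \sum_k (A_{ik} - a_{\text{off}})(B_{kj} - b_{\text{off}})
      = \sum_k A_{ik} B_{kj}
        - a_{\text{off}} \sum_k B_{kj}
        - b_{\text{off}} \sum_k A_{ik}
        + K \, a_{\text{off}} \, b_{\text{off}}

The column sums of B (_vector_sum_col) are therefore only needed when _a_offset != 0, and the row sums of A (_vector_sum_row) only when _b_offset != 0, which is exactly how the kernels are gated here.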
+-
+-Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+- const ITensorInfo *c, const ITensorInfo *output,
+- const GEMMInfo &gemm_info)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+- c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+- "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+- "The product AB is defined only if the number of columns in A is "
+- "equal to the number of rows in B");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+- "Matrix A already reshaped is not supported");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+- "Matrix B already reshaped is not supported");
+-
+- GEMMInfo info = gemm_info;
+- const ITensorInfo *matrix_a_info = a;
+- const ITensorInfo *matrix_b_info = b;
+-
+- const ITensorInfo *a_to_use = a;
+-
+- TensorInfo tmp_a_info{};
+- TensorInfo tmp_b_info{};
+- TensorInfo mm_result_s32_info{};
+-
+- int32_t a_offset = a->quantization_info().uniform().offset;
+- int32_t b_offset = b->quantization_info().uniform().offset;
+-
+- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+- if (fuse_output_stage)
+- {
+- auto_init_if_empty(
+- mm_result_s32_info,
+- a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+- }
+-
+- // Check if we need to run the optimized assembly kernel
+- bool run_optimised = false;
+- bool run_optimised_requantized = false;
+- if (a_to_use->data_type() == DataType::QASYMM8 &&
+- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+- {
+- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
+- run_optimised_requantized = run_optimised;
+- }
+- else
+- {
+- run_optimised = bool(NEGEMMAssemblyDispatch::validate(
+- a_to_use, b, c, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
+- }
+-
+- if (run_optimised)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+- if (info.depth_output_gemm3d() != 0)
+- {
+- if (info.reinterpret_input_as_3d())
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+- }
+- else
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+- }
+- }
+- else
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+- }
+- }
+- else
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+- "NEGEMM cannot reinterpret the input tensor as 3D");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+- "NEGEMM cannot reinterpret the output tensor as 3D");
+-
+- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+- if (!run_vector_matrix_multiplication)
+- {
+- matrix_a_info = &tmp_a_info;
+- matrix_b_info = &tmp_b_info;
+-
+- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+- // 4.0f) ]
+- TensorShape shape_tmp_a = a->tensor_shape();
+- shape_tmp_a.set(0, a->dimension(0) * 4);
+- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+-
+- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
+- // / 16.0f) ]
+- TensorShape shape_tmp_b = b->tensor_shape();
+- shape_tmp_b.set(0, b->dimension(1) * 16);
+- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+-
+- // Validate interleave kernel
+- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+-
+- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+- }
+- }
+-
+- if (!run_optimised_requantized)
+- {
+- TensorInfo info_vector_sum_col{};
+- TensorInfo info_vector_sum_row{};
+-
+- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+- if (a_offset != 0)
+- {
+- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+-
+- // Configure Matrix B reduction kernel
+- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
+- b, &info_vector_sum_col, a->dimension(0), false));
+- }
+-
+- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+- if (b_offset != 0)
+- {
+- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+-
+- // Configure matrix A reduction kernel
+- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
+- a_to_use, &info_vector_sum_row, a->dimension(0), false));
+- }
+-
+- if (fuse_output_stage)
+- {
+- if (!run_optimised)
+- {
+- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
+- matrix_a_info, matrix_b_info, &mm_result_s32_info));
+- }
+-
+- // Validate offset contribution kernel
+- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
+- &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+- b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
+- info.gemmlowp_output_stage()));
+- }
+- else
+- {
+- if (!run_optimised)
+- {
+- ARM_COMPUTE_RETURN_ON_ERROR(
+- NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+- }
+- // Validate offset contribution kernel
+- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
+- output, a_offset == 0 ? nullptr : &info_vector_sum_col,
+- b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
+- }
+- }
+- return Status{};
+-}
+-
+-void NEGEMMLowpMatrixMultiplyCoreEx::run()
+-{
+- prepare();
+-
+- MemoryGroupResourceScope scope_mg(_memory_group);
+-
+- // Reshape inputs
+- if (_mtx_a_reshape_kernel)
+- {
+- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+- }
+- if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+- {
+- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+- }
+-
+- // Run GEMM
+- if (_asm_glue.is_configured())
+- {
+- _asm_glue.run();
+- }
+- else
+- {
+- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+- }
+-
+- if (!_fused_assembly_path)
+- {
+- // Run matrix A reduction kernel only if _b_offset is not equal to 0
+- if (_b_offset != 0)
+- {
+- NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+- }
+-
+- // Run matrix B reduction kernel only if _a_offset is not equal to 0
+- if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+- {
+- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+- }
+-
+- if (_fuse_output_stage)
+- {
+- // Run offset contribution kernel
+- NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+- }
+- else
+- {
+- // Run offset contribution kernel
+- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+- }
+- }
+-}
+-
+-void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
+-{
+- if (!_is_prepared)
+- {
+- // Run assembly reshape
+- if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
+- {
+- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+-
+- _asm_glue.prepare();
+- _original_b->mark_as_unused();
+- }
+- // Run non-assembly reshape
+- else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+- {
+- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+-
+- // Run reshape kernel and mark original weights tensor as unused
+- _tmp_b.allocator()->allocate();
+- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+- _original_b->mark_as_unused();
+- }
+-
+- // Run matrix B reduction kernel only if _a_offset is not equal to 0
+- if (_a_offset != 0 && _reshape_b_only_on_first_run)
+- {
+- _vector_sum_col.allocator()->allocate();
+- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+- }
+-
+- _is_prepared = true;
+- }
+-}
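prepare() above follows the usual one-shot weight handling: when _reshape_b_only_on_first_run is set, the reshape of B and the matrix B reduction run once and the original weights tensor is released. A minimal usage sketch, assuming the conventional ACL call sequence and hypothetical tensor names:

    // gemm is an NEGEMMLowpMatrixMultiplyCoreEx; gemm_info has
    // reshape_b_only_on_first_run() == true (B holds constant weights).
    gemm.configure(&a, &b, nullptr, &out, gemm_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    // ... fill a and b ...
    gemm.run();  // first run() triggers prepare(): B is reshaped/reduced once
    gemm.run();  // later runs reuse _tmp_b / _vector_sum_col and skip the reshape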
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+index c8bb88a..433c35d 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+@@ -41,7 +41,7 @@
+ #include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+ #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ #include <utility>
+
+@@ -49,7 +49,7 @@ namespace arm_compute
+ {
+ void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
++ auto k = support::cpp14::make_unique<NEGatherKernelEx>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+ }
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+index 078019f..52d58ac 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+@@ -41,14 +41,14 @@
+ #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
+
+ #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+-#include "support/ToolchainSupport.h"
++#include "support/MemorySupport.h"
+
+ using namespace arm_compute;
+
+ void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
+ ITensor *output, ITensor *hits)
+ {
+- auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
++ auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+ }
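The NEGatherEx and NEHashtableLookup hunks only track a Compute Library header move: support::cpp14::make_unique now comes from support/MemorySupport.h instead of support/ToolchainSupport.h. The helper is the familiar C++11 substitute for std::make_unique; a simplified sketch of the idea, not the library's exact definition:

    #include <memory>
    #include <utility>

    namespace support
    {
    namespace cpp14
    {
    // Single-object overload only; the array overloads are omitted in this sketch.
    template <typename T, typename... Args> std::unique_ptr<T> make_unique(Args &&... args)
    {
      return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
    }
    } // namespace cpp14
    } // namespace support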
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
+deleted file mode 100644
+index dac3b84..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
++++ /dev/null
+@@ -1,55 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
+-
+-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+-#include "support/ToolchainSupport.h"
+-
+-#include <utility>
+-
+-using namespace arm_compute;
+-
+-void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>();
+- k->configure(input, alpha, output);
+- _kernel = std::move(k);
+-}
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
+deleted file mode 100644
+index 0e9a5e9..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
++++ /dev/null
+@@ -1,161 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
+-
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+-#include "arm_compute/runtime/NEON/NEScheduler.h"
+-
+-namespace arm_compute
+-{
+-NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+- _gemm_output(), _add_output(), _is_prepared(false)
+-{
+-}
+-
+-Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+- const ITensorInfo *hidden_state, const ITensorInfo *output,
+- const ActivationLayerInfo &info)
+-{
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+- output);
+-
+- const int idx_width = 0;
+- const int idx_height = 1;
+- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+- recurrent_weights->dimension(idx_width));
+- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+- recurrent_weights->dimension(idx_height));
+- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+- hidden_state->tensor_shape());
+-
+- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(
+- recurrent_weights, hidden_state->dimension(idx_height)),
+- 1, input->data_type());
+-
+- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
+- &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
+-
+- return Status{};
+-}
+-
+-void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
+- const ITensor *recurrent_weights, const ITensor *bias,
+- ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
+- recurrent_weights->info(), bias->info(),
+- hidden_state->info(), output->info(), info));
+-
+- const int idx_height = 1;
+- TensorShape shape = misc::shape_calculator::compute_rnn_shape(
+- recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+-
+- _is_prepared = false;
+-
+- // Manage intermediate buffers and configure
+- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+-
+- // Manage intermediate buffers and configure
+- _memory_group.manage(&_fully_connected_out);
+- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+-
+- _memory_group.manage(&_gemm_output);
+- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+-
+- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+- _memory_group.manage(&_add_output);
+-
+- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
+- ConvertPolicy::SATURATE);
+-
+- _fully_connected_out.allocator()->allocate();
+- _gemm_output.allocator()->allocate();
+-
+- _activation_kernel.configure(&_add_output, hidden_state, info);
+- _add_output.allocator()->allocate();
+-
+- _copy_kernel.configure(hidden_state, output);
+-}
+-
+-void NERNNLayerEx::run()
+-{
+- prepare();
+-
+- MemoryGroupResourceScope scope_mg(_memory_group);
+-
+- _fully_connected_kernel.run();
+-
+- _gemm_state_f.run();
+-
+- NEScheduler::get().schedule(&_add_kernel, Window::DimY);
+- NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
+-
+- // copy hidden out to output
+- NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+-}
+-
+-void NERNNLayerEx::prepare()
+-{
+- if (!_is_prepared)
+- {
+- _fully_connected_kernel.prepare();
+- _gemm_state_f.prepare();
+-
+- _is_prepared = true;
+- }
+-}
+-} // namespace arm_compute
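The kernel chain in the deleted NERNNLayerEx (fully connected on the input, GEMM on the previous hidden state, saturating add, activation, copy) amounts to the plain RNN update. Writing W for weights, R for recurrent_weights, b for bias and sigma for the configured activation, the step it computed was essentially:

    h_t = \sigma\left( W x_t + R\, h_{t-1} + b \right), \qquad \text{output}_t = h_t

with the hidden_state tensor updated in place before being copied to output.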
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
+deleted file mode 100644
+index 116bba3..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
++++ /dev/null
+@@ -1,180 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2018-2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
+-
+-#include "arm_compute/core/Helpers.h"
+-#include "arm_compute/runtime/NEON/NEScheduler.h"
+-
+-using namespace arm_compute;
+-
+-NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager)
+- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+- _reduction_ops(), _keep_dims()
+-{
+-}
+-
+-Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+- bool keep_dims, const ITensorInfo *output)
+-{
+- ARM_COMPUTE_UNUSED(keep_dims);
+- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+-
+- TensorShape out_shape = input->tensor_shape();
+- const unsigned int reduction_ops = reduction_axis.num_dimensions();
+- const int input_dims = input->num_dimensions();
+- Coordinates axis_local = reduction_axis;
+-
+- // Convert negative axis
+- for (unsigned int i = 0; i < reduction_ops; ++i)
+- {
+- axis_local[i] = wrap_around(axis_local[i], input_dims);
+- }
+-
+- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+- for (unsigned int i = 0; i < reduction_ops; ++i)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+- input->num_dimensions() - 1);
+- if (output->total_size() > 0 && keep_dims)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+- }
+- if (keep_dims)
+- {
+- out_shape.set(axis_local[i], 1);
+- }
+- else
+- {
+- out_shape.remove_dimension(axis_local[i] - i);
+- }
+- }
+- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+-
+- return Status{};
+-}
+-
+-void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+- ITensor *output)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+-
+- _reduction_ops = reduction_axis.num_dimensions();
+- _reduction_kernels =
+- arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
+- _reduced_outs =
+- arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+- _keep_dims = keep_dims;
+-
+- Coordinates axis_local = reduction_axis;
+- const int input_dims = input->info()->num_dimensions();
+- const unsigned int reduction_ops = reduction_axis.num_dimensions();
+-
+- // Convert negative axis
+- for (unsigned int i = 0; i < reduction_ops; ++i)
+- {
+- axis_local[i] = wrap_around(axis_local[i], input_dims);
+- }
+-
+- // Perform reduction for every axis
+- for (unsigned int i = 0; i < _reduction_ops; ++i)
+- {
+- TensorShape out_shape = i == 0 ? input->info()->tensor_shape()
+- : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+- out_shape.set(axis_local[i], 1);
+- auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+-
+- if (i == _reduction_ops - 1 && keep_dims)
+- {
+- _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
+- }
+- else
+- {
+- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+- input->info()->data_type(),
+- input->info()->quantization_info())
+- .set_data_layout(output->info()->data_layout()));
+- _memory_group.manage(_reduced_outs.get() + i);
+- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i],
+- ReductionOperation::MEAN_SUM);
+- }
+- }
+-
+- // Allocate intermediate tensors
+- for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+- {
+- _reduced_outs[i].allocator()->allocate();
+- }
+-
+- // Configure reshape layer if we want to drop the dimensions
+- if (!keep_dims)
+- {
+- TensorShape out_shape = input->info()->tensor_shape();
+-
+- // We have to sort the reduction axis vectors in order for remove_dimension
+- // to work properly
+- std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+- for (unsigned int i = 0; i < _reduction_ops; ++i)
+- {
+- out_shape.remove_dimension(axis_local[i] - i);
+- }
+- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+- }
+-}
+-
+-void NEReduceMeanEx::run()
+-{
+- _memory_group.acquire();
+-
+- for (unsigned int i = 0; i < _reduction_ops; ++i)
+- {
+- _reduction_kernels[i].run();
+- }
+-
+- if (!_keep_dims)
+- {
+- _reshape.run();
+- }
+- _memory_group.release();
+-}
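The axis_local[i] - i in the deleted NEReduceMeanEx compensates for dimensions that have already been removed: because the reduction axes are sorted ascending, after i removals every remaining axis has shifted down by i. A small worked example on a rank-4 shape with reduction axes {1, 3}:

    // shape indices 0..3, sorted reduction axes {1, 3}
    // i = 0: remove_dimension(1 - 0)  -> drops axis 1, axes 2..3 shift to 1..2
    // i = 1: remove_dimension(3 - 1)  -> drops the original axis 3, now at index 2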
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
+deleted file mode 100644
+index 198bb76..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
++++ /dev/null
+@@ -1,114 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
+-
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-#include "arm_compute/runtime/NEON/NEScheduler.h"
+-
+-namespace arm_compute
+-{
+-NESpaceToBatchLayerEx::NESpaceToBatchLayerEx()
+- : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+-{
+-}
+-
+-void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape,
+- const ITensor *paddings, ITensor *output)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+-
+- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+- {
+- _has_padding = true;
+- _memset_kernel.configure(
+- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+- }
+- _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+-}
+-
+-void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x,
+- const int block_shape_y, const Size2D &padding_left,
+- const Size2D &padding_right, ITensor *output)
+-{
+- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+-
+- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+- {
+- _has_padding = true;
+- _memset_kernel.configure(
+- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+- }
+- _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right,
+- output);
+-}
+-
+-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape,
+- const ITensorInfo *paddings, const ITensorInfo *output)
+-{
+- ARM_COMPUTE_RETURN_ON_ERROR(
+- NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+-
+- return Status{};
+-}
+-
+-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x,
+- const int block_shape_y, const Size2D &padding_left,
+- const Size2D &padding_right, const ITensorInfo *output)
+-{
+- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(
+- input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+-
+- return Status{};
+-}
+-
+-void NESpaceToBatchLayerEx::run()
+-{
+- // Zero out output only if we have paddings
+- if (_has_padding)
+- {
+- NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+- }
+- NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+-}
+-} // namespace arm_compute
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
+deleted file mode 100644
+index 97697e3..0000000
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
++++ /dev/null
+@@ -1,64 +0,0 @@
+-/*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2019 ARM Limited.
+- *
+- * SPDX-License-Identifier: MIT
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a copy
+- * of this software and associated documentation files (the "Software"), to
+- * deal in the Software without restriction, including without limitation the
+- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+- * sell copies of the Software, and to permit persons to whom the Software is
+- * furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in all
+- * copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-
+-#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
+-
+-#include "arm_compute/core/Error.h"
+-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+-#include "arm_compute/core/TensorInfo.h"
+-#include "arm_compute/core/Types.h"
+-#include "arm_compute/core/Validate.h"
+-
+-namespace arm_compute
+-{
+-void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+-{
+- auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>();
+- k->configure(input, output, block_shape);
+- _kernel = std::move(k);
+-}
+-
+-Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+- int32_t block_shape)
+-{
+- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape));
+- return Status{};
+-}
+-} // namespace arm_compute
+diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+index df06892..09f1780 100644
+--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+@@ -1,21 +1,5 @@
+ /*
+- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-/*
+- * Copyright (c) 2017-2019 ARM Limited.
++ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+@@ -37,14 +21,11 @@
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+-
+ #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
+
+ #include "arm_compute/core/Helpers.h"
+-#include "arm_compute/core/Utils.h"
+ #include "arm_compute/core/UtilsEx.h"
+ #include "arm_compute/core/Validate.h"
+-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+ #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+ #include "arm_compute/runtime/NEON/NEScheduler.h"
+
+@@ -52,20 +33,15 @@ using namespace arm_compute::misc::shape_calculator;
+
+ namespace arm_compute
+ {
++
+ NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _conv_f(),
+ _upsample_f(),
+ _flip_weights(),
+- _permute_input(),
+- _permute_weights(),
+- _permute_output(),
+ _scaled_output(),
+ _weights_flipped(),
+- _permuted_input(),
+- _permuted_weights(),
+- _permuted_output(),
+- _is_nchw(false),
++ _flip_axis(),
+ _original_weights(nullptr),
+ _input(nullptr),
+ _info(),
+@@ -80,7 +56,7 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
+- DataType::QASYMM8);
++ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
+ const unsigned int width_idx =
+@@ -95,13 +71,16 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
+ weights->dimension(height_idx), info, invalid_right, invalid_bottom);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+- if (is_data_type_quantized_asymmetric(input->data_type()) && bias)
++ if (bias != nullptr)
+ {
+- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+- }
+- else if (bias)
+- {
+- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
++ if (is_data_type_quantized_asymmetric(input->data_type()))
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
++ }
++ else
++ {
++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
++ }
+ }
+
+ if (output->tensor_shape().total_size() > 0)
+@@ -110,12 +89,12 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(),
+- "Output's dim 0 is invalid.");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(),
+- "Output's dim 1 is invalid.");
+- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(),
+- "Output's dim 2 is invalid.");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
++ "Output's width is invalid.");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
++ "Output's height is invalid.");
++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
++ "Output's depth is invalid.");
+ }
+
+ unsigned int pad_left = 0;
+@@ -127,7 +106,6 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
+ pad_bottom);
+ TensorInfo scale_out_info(
+ input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+- scale_out_info.set_data_layout(input->data_layout());
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ const unsigned int batches_idx =
+@@ -149,19 +127,13 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
+ ITensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom)
+ {
++ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
++ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
++ input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
++ info, invalid_right, invalid_bottom));
+
+ const DataLayout data_layout = input->info()->data_layout();
+-
+- _input = input;
+- _original_weights = weights;
+- _info = info;
+- _is_prepared = false;
+- _is_nchw = data_layout == DataLayout::NCHW;
+-
+- const unsigned int stride_x = info.stride().first;
+- const unsigned int stride_y = info.stride().second;
+-
+ const unsigned int width_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+@@ -173,101 +145,54 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
++
++ _input = input;
++ _original_weights = weights;
++ _info = info;
++ _is_prepared = false;
++
++ unsigned int pad_left = 0;
++ unsigned int pad_right = 0;
++ unsigned int pad_top = 0;
++ unsigned int pad_bottom = 0;
++ const unsigned int stride_x = info.stride().first;
++ const unsigned int stride_y = info.stride().second;
++
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+- // Perform validation step
+- ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
+- input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+- info, invalid_right, invalid_bottom));
+-
++ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _memory_group.manage(&_scaled_output);
+
+- if (!_is_nchw)
+- {
+- _memory_group.manage(&_permuted_input);
+- _memory_group.manage(&_permuted_weights);
+- _memory_group.manage(&_permuted_output);
+-
+- // Configure the function to transform the input tensor from NHWC -> NCHW
+- _permuted_input.info()->set_quantization_info(input->info()->quantization_info());
+- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+-
+- // Configure the function to transform the weights tensor from NHWC -> NCHW
+- _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info());
+- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+-
+- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
+- // order to match output shape
+-
+- unsigned int pad_left = 0;
+- unsigned int pad_right = 0;
+- unsigned int pad_top = 0;
+- unsigned int pad_bottom = 0;
+- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+- *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right,
+- invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
+-
+- TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(),
+- _permuted_input.info()->quantization_info());
+- scale_out_info.set_data_layout(DataLayout::NCHW);
+- _scaled_output.allocator()->init(scale_out_info);
+-
+- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+- DimensionRoundingType::CEIL);
+- _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
+-
+- _weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
+- _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info());
+- _flip_weights.configure(&_permuted_weights, &_weights_flipped);
+-
+- // setup the function to convolve the upscaled output
+- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+-
+- const auto out_shape = output->info()->tensor_shape();
+- TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]};
+- TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(),
+- output->info()->quantization_info());
+- _permuted_output.allocator()->init(permuted_out_info);
+- _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info);
+-
+- // Configure the function to transform the convoluted output to NHWC
+- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+-
+- _permuted_input.allocator()->allocate();
+- _permuted_weights.allocator()->allocate();
+- _permuted_output.allocator()->allocate();
+- }
+- else
+- {
+- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
+- // order to match output shape
+- unsigned int pad_left = 0;
+- unsigned int pad_right = 0;
+- unsigned int pad_top = 0;
+- unsigned int pad_bottom = 0;
+- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+- pad_right, pad_top, pad_bottom);
+-
+- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+- input->info()->quantization_info());
+- _scaled_output.allocator()->init(scale_out_info);
+- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+- DimensionRoundingType::FLOOR);
+- _upsample_f.configure(input, &_scaled_output, upsample_info);
+-
+- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+- _flip_weights.configure(weights, &_weights_flipped);
+-
+- // setup the function to convolve the upscaled output
+- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+- }
++ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
++ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
++
++ // setup the function to convolve the upscaled output
++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
++
++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
++ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
++ pad_right, pad_top, pad_bottom);
++
++ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
++ DimensionRoundingType::FLOOR);
++
++ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
++ input->info()->quantization_info());
++ scale_out_info.set_data_layout(data_layout);
++ _scaled_output.allocator()->init(scale_out_info);
++
++ _upsample_f.configure(input, &_scaled_output, upsample_info);
++
++ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
++
++ // Setup flip axis data
++ _flip_axis.allocator()->allocate();
++ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
++ axis_data[0] = static_cast<uint32_t>(width_idx);
++ axis_data[1] = static_cast<uint32_t>(height_idx);
++
+ _scaled_output.allocator()->allocate();
+ }
+
+@@ -275,22 +200,10 @@ void NETransposeConvLayer::run()
+ {
+ prepare();
+
+- // MemoryGroupResourceScope scope_mg(_memory_group);
+-
+- // Permute input
+- if (!_is_nchw)
+- {
+- _permute_input.run();
+- }
++ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _upsample_f.run();
+ _conv_f.run();
+-
+- // Permute output
+- if (!_is_nchw)
+- {
+- _permute_output.run();
+- }
+ }
+
+ void NETransposeConvLayer::prepare()
+@@ -301,22 +214,12 @@ void NETransposeConvLayer::prepare()
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+- // Permute weights
+- if (!_is_nchw)
+- {
+- _permute_weights.run();
+- }
+- NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
++ _flip_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+- if (!_weights_flipped.is_used())
+- {
+- _weights_flipped.allocator()->free();
+- }
+-
+ _is_prepared = true;
+ }
+ }
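After this rewrite NETransposeConvLayer takes the same path for NCHW and NHWC: the transpose convolution is realized as zero-insertion upsampling of the input followed by a stride-1 convolution with the weights flipped along width and height (the two entries written into _flip_axis). Schematically:

    \text{TransposeConv}_{s}(x, W) \;\approx\; \text{Conv}_{1}\big(\text{upsample}_{s}(x),\ \text{flip}_{H,W}(W)\big)

with the padding computed by compute_transposeconv_upsampled_shape so that the stride-1 convolution lands on the requested output shape.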
+diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
+index 09f6725..609dd45 100644
+--- a/compute/cker/CMakeLists.txt
++++ b/compute/cker/CMakeLists.txt
+@@ -8,6 +8,9 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp)
+ target_link_libraries(nnfw_lib_cker INTERFACE ruy)
+ target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation)
+ target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV)
++if(EXPERIMENTAL_RUY_FEATURE)
++ target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE)
++endif(EXPERIMENTAL_RUY_FEATURE)
+ if(PROFILE_RUY)
+ target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler)
+ endif(PROFILE_RUY)
+diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
+index 41b1916..1bde640 100644
+--- a/compute/cker/include/cker/Types.h
++++ b/compute/cker/include/cker/Types.h
+@@ -259,6 +259,12 @@ struct FullyConnectedParams
+ // FullyConnectedWeightsFormat weights_format;
+ };
+
++struct L2NormParams
++{
++ // uint8 inference params.
++ int32_t input_zero_point;
++};
++
+ struct GatherParams
+ {
+ int32_t axis;
+@@ -338,6 +344,11 @@ struct SpaceToBatchParams
+ int32_t output_offset;
+ };
+
++struct SpaceToDepthParams
++{
++ int32_t block_size;
++};
++
+ enum class Order
+ {
+ kColMajor,
+diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
+index b69d55c..2abb998 100644
+--- a/compute/cker/include/cker/Utils.h
++++ b/compute/cker/include/cker/Utils.h
+@@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input)
+ return leading_zeros;
+ }
+
++inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
++ int32_t *output_inv_sqrt, int *output_shift)
++{
++ assert(input >= 0);
++ if (input <= 1)
++ {
++ // Handle the input value 1 separately to avoid overflow in that case
++ // in the general computation below (b/143972021). Also handle 0 as if it
++ // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
++ // but rare/unrealistic input value. We can expect both to occur in some
++ // incompletely trained models, but probably not in fully trained models.
++ *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
++ *output_shift = 0;
++ return;
++ }
++ assert(input > 1);
++ *output_shift = 11;
++ while (input >= (1 << 29))
++ {
++ input /= 4;
++ ++*output_shift;
++ }
++ const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
++ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
++ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
++ *output_shift -= left_shift_bit_pairs;
++ input <<= 2 * left_shift_bit_pairs;
++ assert(input >= (1 << 27));
++ assert(input < (1 << 29));
++ using gemmlowp::FixedPoint;
++ using gemmlowp::Rescale;
++ using gemmlowp::SaturatingRoundingMultiplyByPOT;
++ // Using 3 integer bits gives us enough room for the internal arithmetic in
++ // this Newton-Raphson iteration.
++ using F3 = FixedPoint<int32_t, 3>;
++ using F0 = FixedPoint<int32_t, 0>;
++ const F3 fixedpoint_input = F3::FromRaw(input >> 1);
++ const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
++ const F3 fixedpoint_half_three =
++ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
++ // Newton-Raphson iteration
++ // Naive unoptimized starting guess: x = 1
++ F3 x = F3::One();
++ // Naive unoptimized number of iterations: 5
++ for (int i = 0; i < 5; i++)
++ {
++ const F3 x3 = Rescale<3>(x * x * x);
++ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
++ }
++ const F0 fixedpoint_half_sqrt_2 =
++ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
++ x = x * fixedpoint_half_sqrt_2;
++ *output_inv_sqrt = x.raw();
++ if (*output_shift < 0)
++ {
++ *output_inv_sqrt <<= -*output_shift;
++ *output_shift = 0;
++ }
++ // Convert right shift (right is positive) to left shift.
++ *output_shift *= reverse_shift;
++}
++
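The fixed-point loop above is the usual Newton-Raphson iteration for the inverse square root: solving x^{-2} - a = 0 gives the update

    x_{n+1} = x_n\left(\tfrac{3}{2} - \tfrac{a}{2}\,x_n^{2}\right)

which the code evaluates as fixedpoint_half_three * x - fixedpoint_half_input * x^3, starting from x_0 = 1 and running five iterations. The final multiplication by sqrt(2)/2 compensates for the input having been halved (input >> 1) when it was loaded into the F3 fixed-point representation.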
+ // Comment from tensorflow lite:
+ //
+ // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
+index 9bcf3fd..9b72811 100644
+--- a/compute/cker/include/cker/operation/FullyConnected.h
++++ b/compute/cker/include/cker/operation/FullyConnected.h
+@@ -78,8 +78,11 @@ inline void FullyConnected(const FullyConnectedParams &params, const Shape &inpu
+ MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
+ output_data, /*result_stride=*/1);
+
+- // Apply activation function
+- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
++ if (params.activation != FusedActivationFunctionType::kNone)
++ {
++ // Apply activation function
++ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
++ }
+ }
+
+ inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+@@ -195,7 +198,11 @@ inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape
+ #endif
+
+ // Apply activation function to floats.
+- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
++ if (params.activation != FusedActivationFunctionType::kNone)
++ {
++ // Apply activation function
++ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
++ }
+ return;
+ }
+
+diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h
+new file mode 100644
+index 0000000..a0075c3
+--- /dev/null
++++ b/compute/cker/include/cker/operation/L2Normalize.h
+@@ -0,0 +1,94 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __NNFW_CKER_L2NORMALIZE_H__
++#define __NNFW_CKER_L2NORMALIZE_H__
++
++#include "cker/Shape.h"
++#include "cker/Utils.h"
++#include "cker/Types.h"
++
++namespace nnfw
++{
++namespace cker
++{
++
++void L2NormalizeFloat32(const Shape &input_shape, const float *input_data,
++ const Shape &output_shape, float *output_data)
++{
++ float epsilon = 1e-6;
++ const int trailing_dim = input_shape.DimensionsCount() - 1;
++ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
++ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
++ for (int i = 0; i < outer_size; ++i)
++ {
++ float squared_l2_norm = 0;
++ for (int c = 0; c < depth; ++c)
++ {
++ const float val = input_data[c];
++ squared_l2_norm += val * val;
++ }
++ float l2_norm = std::sqrt(squared_l2_norm);
++ l2_norm = std::max(l2_norm, epsilon);
++ for (int c = 0; c < depth; ++c)
++ {
++ *output_data = *input_data / l2_norm;
++ ++output_data;
++ ++input_data;
++ }
++ }
++}
++
++void L2NormalizeQuant8(L2NormParams &params, const Shape &input_shape, const uint8_t *input_data,
++ const Shape &output_shape, uint8_t *output_data)
++{
++ const int trailing_dim = input_shape.DimensionsCount() - 1;
++ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
++ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
++ const int32_t input_zero_point = params.input_zero_point;
++
++ for (int i = 0; i < outer_size; ++i)
++ {
++ int32_t square_l2_norm = 0;
++ for (int c = 0; c < depth; c++)
++ {
++ // Note that input_data advances by depth in the second pass below.
++ int32_t diff = input_data[c] - input_zero_point;
++ square_l2_norm += diff * diff;
++ }
++ int32_t inv_l2norm_multiplier;
++ int inv_l2norm_shift;
++ GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift);
++ for (int c = 0; c < depth; c++)
++ {
++ int32_t diff = *input_data - input_zero_point;
++ int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
++ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
++ int32_t unclamped_output_val = 128 + rescaled_diff;
++ int32_t output_val = std::min(static_cast<int32_t>(255),
++ std::max(static_cast<int32_t>(0), unclamped_output_val));
++ *output_data = static_cast<uint8_t>(output_val);
++ ++input_data;
++ ++output_data;
++ }
++ }
++}
++
++} // namespace cker
++} // namespace nnfw
++
++#endif // __NNFW_CKER_L2NORMALIZE_H__
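As a side note on the float kernel above: per row it divides every element by max(||row||_2, epsilon). A minimal standalone sketch of that same formula, using a plain std::vector instead of the cker Shape/pointer plumbing (the names below are illustrative and not part of the library):

```cpp
// Minimal sketch (illustrative, not library code): the per-row float formula
// out[c] = in[c] / max(sqrt(sum_c in[c]^2), epsilon) used by L2NormalizeFloat32.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  const float epsilon = 1e-6f;
  std::vector<float> row{3.0f, 4.0f, 0.0f};

  float squared_l2_norm = 0.0f;
  for (float v : row)
    squared_l2_norm += v * v;
  const float l2_norm = std::max(std::sqrt(squared_l2_norm), epsilon);

  for (float &v : row)
    v /= l2_norm;

  for (float v : row)
    std::printf("%.3f ", v); // prints 0.600 0.800 0.000
  std::printf("\n");
  return 0;
}
```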
+diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
+index 7477858..3d3e59e 100644
+--- a/compute/cker/include/cker/operation/Logistic.h
++++ b/compute/cker/include/cker/operation/Logistic.h
+@@ -32,18 +32,9 @@ namespace cker
+ inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+ {
+-#ifdef __aarch64__
+ auto input_map = MapAsVector(input_data, input_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
+ output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
+-#else
+- // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2)
+- const int size = MatchingFlatSize(input_shape, output_shape);
+- for (int i = 0; i < size; i++)
+- {
+- output_data[i] = 1.f / (1.f + std::exp(-input_data[i]));
+- }
+-#endif
+ }
+
+ } // namespace cker
+diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
+index af432f3..4a2732d 100644
+--- a/compute/cker/include/cker/operation/Pad.h
++++ b/compute/cker/include/cker/operation/Pad.h
+@@ -26,9 +26,10 @@ namespace nnfw
+ {
+ namespace cker
+ {
++template <typename T>
+ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
+- const float *input_data, const Shape &output_shape, float *output_data,
+- const float *constant_value_data)
++ const T *input_data, const Shape &output_shape, T *output_data,
++ const T *constant_value_data)
+ {
+ // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
+ // TODO: come up with more subtle solution that uses subtensors like arm compute
+@@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
+ /** List of padding information */
+ using PaddingList = std::vector<PaddingInfo>;
+
+- auto constant_value = constant_value_data ? *constant_value_data : 0;
++ const T constant_value = constant_value_data ? *constant_value_data : 0;
+ assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
+
+ PaddingList padding_list(pad_rank);
+@@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
+ {
+ const int32_t in_row_len = input_shape.Dims(0);
+ std::fill_n(output_data, padding_list[0].first, constant_value);
+- std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float));
++ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T));
+ std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
+ constant_value);
+ break;
+@@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
+ out_offset += padding_list[1].first;
+
+ // copy a row of input data
+- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
++ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+ out_offset += in_row_len;
+
+@@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
+ out_offset += padding_list[2].first;
+
+ // copy a row of input data
+- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
++ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+ out_offset += in_row_len;
+
+@@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
+ out_c_offset += padding_list[3].first;
+
+ // copy a row of input data
+- memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float));
++ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+ out_c_offset += in_row_len;
+
+diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h
+new file mode 100644
+index 0000000..5c82d11
+--- /dev/null
++++ b/compute/cker/include/cker/operation/Quantize.h
+@@ -0,0 +1,47 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __NNFW_CKER_QUANTIZE_H__
++#define __NNFW_CKER_QUANTIZE_H__
++
++#include "cker/Shape.h"
++#include "cker/Types.h"
++#include "cker/Utils.h"
++#include <stdexcept>
++#include <iostream>
++namespace nnfw
++{
++namespace cker
++{
++template <typename InputT, typename OutputT>
++inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape,
++ OutputT *output_data, const float output_scale, const int32_t output_offset)
++{
++ const int flat_size = MatchingFlatSize(input_shape, output_shape);
++ int min_val = std::numeric_limits<OutputT>::min();
++ int max_val = std::numeric_limits<OutputT>::max();
++
++ for (int i = 0; i < flat_size; i++)
++ {
++ int32_t unclamped = static_cast<int32_t>(round(input_data[i] / output_scale)) + output_offset;
++ int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
++ output_data[i] = clamped;
++ }
++}
++} // namespace cker
++} // namespace nnfw
++
++#endif // __NNFW_CKER_QUANTIZE_H__
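For reference, the kernel above applies the usual affine quantization per element: q = clamp(round(x / scale) + zero_point, min, max) over the output type's range. A minimal standalone sketch with a worked example (the scale, zero point, and input values are chosen arbitrarily for illustration):

```cpp
// Minimal sketch (illustrative, not library code): per-element affine quantization
// q = clamp(round(x / scale) + zero_point) into the uint8 range, as in the kernel above.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  const float output_scale = 0.5f;
  const int32_t output_offset = 10; // zero point (assumed value, for illustration)
  const int32_t min_val = std::numeric_limits<uint8_t>::min();
  const int32_t max_val = std::numeric_limits<uint8_t>::max();

  const std::vector<float> input{-10.0f, 0.0f, 1.0f, 200.0f};
  std::vector<uint8_t> output(input.size());

  for (size_t i = 0; i < input.size(); ++i)
  {
    const int32_t unclamped =
        static_cast<int32_t>(std::round(input[i] / output_scale)) + output_offset;
    output[i] = static_cast<uint8_t>(std::min(std::max(unclamped, min_val), max_val));
  }

  for (auto v : output)
    std::printf("%d ", static_cast<int>(v)); // prints 0 10 12 255
  std::printf("\n");
  return 0;
}
```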
+diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h
+new file mode 100644
+index 0000000..ef67931
+--- /dev/null
++++ b/compute/cker/include/cker/operation/SpaceToDepth.h
+@@ -0,0 +1,71 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__
++#define __NNFW_CKER_SPACE_TO_DEPTH_H__
++
++#include "cker/Shape.h"
++#include "cker/Types.h"
++
++namespace nnfw
++{
++namespace cker
++{
++
++template <typename T>
++inline void SpaceToDepth(const SpaceToDepthParams &params, const Shape &unextended_input_shape,
++ const T *input_data, const Shape &unextended_output_shape, T *output_data)
++{
++ assert(unextended_input_shape.DimensionsCount() <= 4);
++ assert(unextended_output_shape.DimensionsCount() <= 4);
++ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
++ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
++
++ const int output_depth = output_shape.Dims(3);
++ const int output_width = output_shape.Dims(2);
++ const int output_height = output_shape.Dims(1);
++
++ const int input_depth = input_shape.Dims(3);
++ const int batch_size = input_shape.Dims(0);
++
++  // Number of contiguous values that we can copy in one iteration.
++ const int stride = params.block_size * input_depth;
++
++ for (int batch = 0; batch < batch_size; ++batch)
++ {
++ for (int out_h = 0; out_h < output_height; ++out_h)
++ {
++ T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
++ for (int offset_h = 0; offset_h < params.block_size; ++offset_h)
++ {
++ T *dst = output_ptr;
++ for (int out_w = 0; out_w < output_width; ++out_w)
++ {
++ memcpy(dst, input_data, stride * sizeof(T));
++ input_data += stride;
++ dst += output_depth;
++ }
++ output_ptr += stride;
++ }
++ }
++ }
++}
++
++} // namespace cker
++} // namespace nnfw
++
++#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__
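The kernel above moves block_size x block_size spatial patches into the channel dimension, copying stride = block_size * input_depth contiguous values at a time. A minimal standalone sketch of the same rearrangement on a tiny NHWC tensor, written with plain index math instead of the strided memcpy (the sizes and names are illustrative):

```cpp
// Minimal sketch (illustrative, not library code): SpaceToDepth on a 1x2x2x1
// NHWC tensor with block_size 2, producing a 1x1x1x4 output.
#include <cstdio>
#include <vector>

int main()
{
  const int block_size = 2;
  const int in_h = 2, in_w = 2, in_d = 1;
  const int out_h = in_h / block_size, out_w = in_w / block_size;
  const int out_d = in_d * block_size * block_size;

  const std::vector<int> input{1, 2, 3, 4}; // batch of 1, laid out as NHWC
  std::vector<int> output(out_h * out_w * out_d);

  for (int oh = 0; oh < out_h; ++oh)
    for (int ow = 0; ow < out_w; ++ow)
      for (int bh = 0; bh < block_size; ++bh)
        for (int bw = 0; bw < block_size; ++bw)
          for (int d = 0; d < in_d; ++d)
          {
            const int ih = oh * block_size + bh;
            const int iw = ow * block_size + bw;
            const int od = (bh * block_size + bw) * in_d + d;
            output[(oh * out_w + ow) * out_d + od] = input[(ih * in_w + iw) * in_d + d];
          }

  for (int v : output)
    std::printf("%d ", v); // prints 1 2 3 4 (spatial patch folded into channels)
  std::printf("\n");
  return 0;
}
```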
+diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h
+index 432b181..080f66f 100644
+--- a/compute/cker/include/cker/ruy/RuySupport.h
++++ b/compute/cker/include/cker/ruy/RuySupport.h
+@@ -24,7 +24,7 @@
+
+ namespace
+ {
+-const int kDefaultNumThreadpoolThreads = 4;
++const int kDefaultNumThreadpoolThreads = 1;
+ }
+
+ namespace nnfw
+diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md
+index 2bfd14c..657f0f7 100644
+--- a/docs/howto/how-to-build-runtime.md
++++ b/docs/howto/how-to-build-runtime.md
+@@ -13,7 +13,7 @@ In the Ubuntu, you can easily install it with the following command.
+
+ ```
+ $ sudo apt-get install cmake libboost-all-dev
+-```
++```
+
+ If your linux system does not have the basic development configuration, you will need to install more packages. A list of all packages needed to configure the development environment can be found in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file.
+
+@@ -44,7 +44,7 @@ python3-venv \
+ scons \
+ software-properties-common \
+ unzip \
+-wget
++wget
+
+ $ mkdir /tmp/gtest
+ $ cd /tmp/gtest
+@@ -63,7 +63,7 @@ In a typical linux development environment, including Ubuntu, you can build the
+ ```
+ $ git clone https://github.com/Samsung/ONE.git one
+ $ cd one
+-$ cp -n Makefile.template Makefile; make install
++$ make -f Makefile.template install
+ ```
+
+ Unfortunately, the debug build on the x86_64 architecture currently has an error. To solve the problem, you must use gcc version 9 or higher. Another workaround is to do a release build rather than a debug build. This is not a suitable method for debugging during development, but it is enough to check the function of the runtime. To release build the runtime, add the environment variable `BUILD_TYPE=release` to the build command as follows.
+diff --git a/docs/nnfw/howto/CrossBuildForAndroid.md b/docs/nnfw/howto/CrossBuildForAndroid.md
+index d7e48c8..08d5fd6 100644
+--- a/docs/nnfw/howto/CrossBuildForAndroid.md
++++ b/docs/nnfw/howto/CrossBuildForAndroid.md
+@@ -44,11 +44,9 @@ Different from cross build for linux,
+ Here is an example of using Makefile.
+
+ ```bash
+-cp -n Makefile.template Makefile
+-
+ TARGET_OS=android \
+ CROSS_BUILD=1 \
+ NDK_DIR=/path/android-tools/r20/ndk \
+ EXT_ACL_FOLDER=/path/arm_compute-v19.11.1-bin-android/lib/android-arm64-v8a-neon-cl \
+-make install
++make -f Makefile.template install
+ ```
+diff --git a/docs/runtime/core.md b/docs/runtime/core.md
+index 42ba75f..64a6c62 100644
+--- a/docs/runtime/core.md
++++ b/docs/runtime/core.md
+@@ -68,7 +68,7 @@ Let's say we have some functions written in a certain programming language. Then
+
+ With generated tensors and kernels, the compiler creates executor objects. There are 3 types of executors supported - Linear, Dataflow, and Parallel. Linear executor is the default; Dataflow Executor and Parallel Executor are experimental.
+
+-For more about executors, please refer to [Executors](./executors.md) document.
++For more about executors, please refer to [Executors](executors.md) document.
+
+ ### Module `exec`
+
+@@ -83,4 +83,4 @@ For more about executors, please refer to [Executors](./executors.md) document.
+
+ Backends are plugins and they are loaded dynamically(via `dlopen`). So this module is a set of interface classes for backend implementation. `compiler` can compile with a variety of backends without knowing specific backend implementation.
+
+-Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](./backend-api.md) document.
++Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](backend-api.md) document.
+diff --git a/docs/runtime/heterogeneous-execution.md b/docs/runtime/heterogeneous-execution.md
+index dc39dae..e7a5e27 100644
+--- a/docs/runtime/heterogeneous-execution.md
++++ b/docs/runtime/heterogeneous-execution.md
+@@ -12,11 +12,11 @@ Here is another case. Let's say we have a model that is not sequential so there
+
+ ![Add-3Conv model](heterogeneous-execution-add-3-conv-model.png)
+
+-Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](./executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently.
++Say we have 3 backends that are based on CPU, GPU and NPU (Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](executors.md#parallel-executor-experimental). In this case we may get a performance gain regardless of the kernels' speed, as they run in parallel independently.
+
+ ## Graph Transformation
+
+-Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](./core.md#1-lowering) phase of compilation.
++Unfortunately it is not that simple to get a performance gain. As each backend has its own memory management module, a copy must be done across backend boundaries. Plus, it may require layout changes, so "Permute" operations are added by `PermutationInsertionPass`. This process is done in the [Lowering](core.md#1-lowering) phase of compilation.
+
+ Here is an example of that. Let's say we have assigned different backends for Add and Conv2D. So a Permute operation is inserted between them.
+
+diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake
+index 51a235a..adec1f9 100644
+--- a/infra/cmake/packages/ARMComputeSourceConfig.cmake
++++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake
+@@ -8,7 +8,7 @@ function(_ARMComputeSource_import)
+ nnas_include(OptionTools)
+
+ envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
+- set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v19.11.1.tar.gz)
++ set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz)
+ ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL})
+
+ set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE)
+diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake
+index ab0b770..da084e7 100644
+--- a/infra/cmake/packages/FlatBuffersConfig.cmake
++++ b/infra/cmake/packages/FlatBuffersConfig.cmake
+@@ -25,7 +25,8 @@ function(_FlatBuffers_build)
+ BUILD_DIR ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build
+ INSTALL_DIR ${EXT_OVERLAY_DIR}
+ BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS}
+- IDENTIFIER "1.10-fix1"
++ IDENTIFIER "1.10-fix2"
++ EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF"
+ PKG_NAME "FLATBUFFERS")
+
+ endfunction(_FlatBuffers_build)
+diff --git a/infra/cmake/packages/HDF5Config.cmake b/infra/cmake/packages/HDF5Config.cmake
+index e282e0b..19803f1 100644
+--- a/infra/cmake/packages/HDF5Config.cmake
++++ b/infra/cmake/packages/HDF5Config.cmake
+@@ -27,6 +27,7 @@ _HDF5_build()
+ find_path(HDF5_CONFIG_DIR "hdf5-config.cmake"
+ PATHS ${EXT_OVERLAY_DIR}
+ PATH_SUFFIXES
++ cmake
+ share/cmake
+ share/cmake/hdf5
+ cmake/hdf5
+diff --git a/infra/cmake/packages/Pybind11Config.cmake b/infra/cmake/packages/Pybind11Config.cmake
+new file mode 100644
+index 0000000..3061779
+--- /dev/null
++++ b/infra/cmake/packages/Pybind11Config.cmake
+@@ -0,0 +1,21 @@
++function(_Pybind11_import)
++ nnas_find_package(Pybind11Source QUIET)
++
++ if(NOT Pybind11Source_FOUND)
++ set(Pybind11_FOUND FALSE PARENT_SCOPE)
++ return()
++ endif(NOT Pybind11Source_FOUND)
++
++ nnas_include(ExternalBuildTools)
++ ExternalBuild_CMake(CMAKE_DIR ${Pybind11Source_DIR}
++ BUILD_DIR ${CMAKE_BINARY_DIR}/externals/PYBIND11/build
++ INSTALL_DIR ${EXT_OVERLAY_DIR}
++ IDENTIFIER "2.3.0"
++ PKG_NAME "PYBIND11")
++
++ find_path(Pybind11_INCLUDE_DIRS NAMES pybind11.h PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES include/pybind11)
++
++ set(Pybind11_FOUND TRUE PARENT_SCOPE)
++endfunction(_Pybind11_import)
++
++_Pybind11_import()
+diff --git a/infra/cmake/packages/Pybind11SourceConfig.cmake b/infra/cmake/packages/Pybind11SourceConfig.cmake
+new file mode 100644
+index 0000000..4a9c676
+--- /dev/null
++++ b/infra/cmake/packages/Pybind11SourceConfig.cmake
+@@ -0,0 +1,18 @@
++function(_Pybind11Source_import)
++ if(NOT DOWNLOAD_PYBIND11)
++ set(Pybind11Source_FOUND FALSE PARENT_SCOPE)
++ return()
++ endif(NOT DOWNLOAD_PYBIND11)
++
++ nnas_include(ExternalSourceTools)
++ nnas_include(OptionTools)
++
++ envoption(PYBIND11_URL https://github.com/pybind/pybind11/archive/v2.3.0.tar.gz)
++
++ ExternalSource_Download(PYBIND11 ${PYBIND11_URL})
++
++ set(Pybind11Source_DIR ${PYBIND11_SOURCE_DIR} PARENT_SCOPE)
++ set(Pybind11Source_FOUND TRUE PARENT_SCOPE)
++endfunction(_Pybind11Source_import)
++
++_Pybind11Source_import()
+diff --git a/infra/docker/Dockerfile b/infra/docker/Dockerfile
+index e675b53..052cc4f 100644
+--- a/infra/docker/Dockerfile
++++ b/infra/docker/Dockerfile
+@@ -1,8 +1,6 @@
+ FROM ubuntu:16.04
+
+ ARG UBUNTU_MIRROR
+-ENV http_proxy $http_proxy
+-ENV https_proxy $https_proxy
+
+ RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi
+ RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi
+@@ -22,6 +20,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler
+
+ # Additional tools
+ RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint
++RUN pip3 install --upgrade pip
+ RUN pip3 install yapf==0.22.0 numpy
+
+ # Install google test (source)
+diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804
+index fc6fc9a..cc31bba 100644
+--- a/infra/docker/Dockerfile.1804
++++ b/infra/docker/Dockerfile.1804
+@@ -1,12 +1,6 @@
+ FROM ubuntu:18.04
+
+ ARG UBUNTU_MIRROR
+-ENV http_proxy $http_proxy
+-ENV https_proxy $https_proxy
+-
+-RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi
+-RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi
+-RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi
+
+ # Install 'add-apt-repository'
+ RUN apt-get update && apt-get -qqy install software-properties-common
+@@ -22,6 +16,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler
+
+ # Additional tools
+ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint
++RUN pip3 install --upgrade pip
+ RUN pip3 install yapf==0.22.0 numpy
+
+ # Install google test (source)
+diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt
+index 3ac6680..0be6885 100644
+--- a/infra/nncc/CMakeLists.txt
++++ b/infra/nncc/CMakeLists.txt
+@@ -98,6 +98,7 @@ option(DOWNLOAD_CAFFE "Download Caffe source" ON)
+ option(DOWNLOAD_PYTORCH "Download Pytorch source" ON)
+ option(DOWNLOAD_ONNX "Download ONNX source" ON)
+ option(DOWNLOAD_ABSEIL "Download Abseil-cpp source" ON)
++option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON)
+
+ option(DOWNLOAD_GTEST "Download Google Test source" ON)
+ option(BUILD_GTEST "Build Google Test from the downloaded source" ON)
+diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount
+index d4610e3..d06c5c9 100644
+--- a/infra/nncc/command/utcount
++++ b/infra/nncc/command/utcount
+@@ -13,7 +13,7 @@ BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \
+ oops pepper-assert \
+ hermes hermes-std \
+ loco locop locomotiv logo-core logo \
+-foder souschef arser \
++foder souschef arser vconone \
+ safemain mio-circle mio-tflite \
+ tflite2circle \
+ luci \
+diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt
+index 8e7f78e..2442a2d 100644
+--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt
++++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt
+@@ -100,7 +100,7 @@ target_include_directories(tensorflow-lite-2.2.0 SYSTEM PUBLIC ${TFLITE_INCLUDES
+ target_compile_definitions(tensorflow-lite-2.2.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV")
+ set_property(TARGET tensorflow-lite-2.2.0 PROPERTY POSITION_INDEPENDENT_CODE ON)
+ target_link_libraries(tensorflow-lite-2.2.0 eigen ${LIB_PTHREAD} dl)
+-if(${BUILD_WITH_NNAPI})
++if(NOT ANDROID AND ${BUILD_WITH_NNAPI})
+ target_link_libraries(tensorflow-lite-2.2.0 rt)
+ endif()
+
+diff --git a/infra/nnfw/config/gbs.conf b/infra/nnfw/config/gbs.conf
+index 515cada..bad9eb2 100644
+--- a/infra/nnfw/config/gbs.conf
++++ b/infra/nnfw/config/gbs.conf
+@@ -5,7 +5,7 @@ profile = profile.tizen
+ [profile.tizen]
+ user=obs_viewer
+ obs = obs.tizen
+-repos = repo.tizen_base,repo.tizen_mobile
++repos = repo.tizen_one,repo.tizen_base,repo.tizen_mobile
+ buildroot = /home/GBS-ROOT/
+
+ [obs.tizen]
+@@ -15,6 +15,8 @@ url = http://api.tizen.org
+ url = http://download.tizen.org/snapshots/tizen/unified/latest/repos/standard/packages/
+
+ [repo.tizen_base]
+-url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/
++url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/
+
++[repo.tizen_one]
++url = http://nnfw.mooo.com/archive/tizen/
+
+diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630
+index e159935..c3ca4b6 100644
+--- a/infra/packaging/preset/20200630
++++ b/infra/packaging/preset/20200630
+@@ -14,6 +14,7 @@ function preset_configure()
+ REQUIRED_UNITS+=("souschef")
+ REQUIRED_UNITS+=("safemain")
+ REQUIRED_UNITS+=("arser")
++ REQUIRED_UNITS+=("vconone")
+ # Hermes Logging Framework
+ REQUIRED_UNITS+=("hermes" "hermes-std")
+ # loco IR and related utilities
+@@ -28,11 +29,14 @@ function preset_configure()
+ REQUIRED_UNITS+=("record-minmax" "circle-quantizer")
+ REQUIRED_UNITS+=("one-cmds")
+
++ NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)}
++
+ # TODO Use "nncc configure" and "nncc build"
+ cmake \
+ -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
+ -DCMAKE_BUILD_TYPE=release \
+ -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
++ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
+ ${EXTRA_OPTIONS[@]} \
+ "${NNAS_PROJECT_PATH}/infra/nncc"
+ }
+@@ -44,14 +48,4 @@ function preset_install()
+
+ # Install tf2nnpkg
+ install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
+-
+- # Create python virtual enviornment
+- python3 -m venv "${NNAS_INSTALL_PREFIX}/bin/venv"
+-
+- # Install tensorflow
+- source "${NNAS_INSTALL_PREFIX}/bin/venv/bin/activate"
+- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
+- install -U pip setuptools
+- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
+- install tensorflow-cpu==2.3.0rc0
+ }
+diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630
+index 9101f82..7846fd3 100644
+--- a/infra/packaging/res/tf2nnpkg.20200630
++++ b/infra/packaging/res/tf2nnpkg.20200630
+@@ -14,10 +14,16 @@ command_exists() {
+ usage()
+ {
+ echo "Convert TensorFlow model to nnpackage."
+- echo "Usage: tf2nnpkg --info <path/to/info> --graphdef <path/to/pb> [OPTION] -o <path/to/nnpkg/directory>"
+- exit 0
++ echo "Usage: tf2nnpkg"
++ echo " --info <path/to/info>"
++ echo " --graphdef <path/to/pb>"
++ echo " -o <path/to/nnpkg/directory>"
++ echo " --v2 (optional) Use TF 2.x interface"
++ exit 255
+ }
+
++TF_INTERFACE="--v1"
++
+ # Parse command-line arguments
+ #
+ while [ "$#" -ne 0 ]; do
+@@ -39,6 +45,10 @@ while [ "$#" -ne 0 ]; do
+ export OUTPUT_DIR="$2"
+ shift 2
+ ;;
++ '--v2')
++ TF_INTERFACE="--v2"
++ shift
++ ;;
+ *)
+ echo "${CUR}"
+ shift
+@@ -83,10 +93,7 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' '
+ INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':')
+
+ # generate tflite file
+-python "${ROOT}/bin/tf2tfliteV2.py" --v2 --input_path ${GRAPHDEF_FILE} \
+---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
+---input_arrays ${INPUT} --output_arrays ${OUTPUT} || \
+-python "${ROOT}/bin/tf2tfliteV2.py" --v1 --input_path ${GRAPHDEF_FILE} \
++python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \
+ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
+ --input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \
+ --output_arrays ${OUTPUT}
+diff --git a/infra/scripts/build-tcm.sh b/infra/scripts/build-tcm.sh
+new file mode 100755
+index 0000000..22fb335
+--- /dev/null
++++ b/infra/scripts/build-tcm.sh
+@@ -0,0 +1,24 @@
++#!/bin/bash
++#
++# STEP 1
++# Download latest TCM tool from
++# https://github.sec.samsung.net/RS-TCM/tca-standalone/releases/download/v0.0.8/tca-standalone-0.0.8.jar
++#
++# STEP 2
++# Create a symbolic link `./src` to the source directory to be analyzed, which has the `.ahub` configuration.
++#
++# STEP 3
++# run this `build-tcm.sh` script.
++#
++# See the following link for additional details.
++# https://github.sec.samsung.net/RS-TCM/tca-standalone/wiki/Tutorials-CPP-Gtest
++#
++
++echo ${PROJECT_DIR:=${PWD}}
++
++java -jar $PROJECT_DIR/tca-standalone-0.0.8.jar \
++ --outdir=$PROJECT_DIR/tcm-output \
++ --config=$PROJECT_DIR/.ahub/tcchecker-tca/config.yaml \
++ --local=$PROJECT_DIR/src \
++ --logfile=$PROJECT_DIR/tcm-output/tcm.log \
++ --debug
+diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh
+index d436e8a..a0323e0 100644
+--- a/infra/scripts/compiler_modules.sh
++++ b/infra/scripts/compiler_modules.sh
+@@ -7,7 +7,7 @@ DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex"
+ DEBUG_BUILD_ITEMS+=";oops;pepper-assert"
+ DEBUG_BUILD_ITEMS+=";hermes;hermes-std"
+ DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo"
+-DEBUG_BUILD_ITEMS+=";foder;souschef;arser"
++DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone"
+ DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite"
+ DEBUG_BUILD_ITEMS+=";tflite2circle"
+ DEBUG_BUILD_ITEMS+=";luci"
+diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh
+index 7da6736..011d14c 100755
+--- a/infra/scripts/docker_build_cross_aarch64_runtime.sh
++++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh
+@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ROOT_PATH="$CURRENT_PATH/../../"
+
+ # prepare rootfs
+-if [ ! -d $ROOTFS_DIR ]; then
++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
+ echo "It will use default rootfs path"
+ else
+ DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
+diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh
+index f1f666a..551fb57 100755
+--- a/infra/scripts/docker_build_cross_arm_runtime.sh
++++ b/infra/scripts/docker_build_cross_arm_runtime.sh
+@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ROOT_PATH="$CURRENT_PATH/../../"
+
+ # prepare rootfs
+-if [ ! -d $ROOTFS_DIR ]; then
++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
+ echo "It will use default rootfs path"
+ else
+ DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
+diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh
+index ea66f17..876f318 100755
+--- a/infra/scripts/docker_build_cross_arm_runtime_release.sh
++++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh
+@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ROOT_PATH="$CURRENT_PATH/../../"
+
+ # prepare rootfs
+-if [ ! -d $ROOTFS_DIR ]; then
++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
+ echo "It will use default rootfs path"
+ else
+ DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
+diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh
+index 08244e5..f42251b 100755
+--- a/infra/scripts/docker_build_cross_coverage.sh
++++ b/infra/scripts/docker_build_cross_coverage.sh
+@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ROOT_PATH="$CURRENT_PATH/../../"
+
+ # prepare rootfs
+-if [ ! -d $ROOTFS_DIR ]; then
++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
+ echo "It will use default rootfs path"
+ else
+ DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
+diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh
+index 418b50d..5b12531 100755
+--- a/infra/scripts/docker_build_nncc.sh
++++ b/infra/scripts/docker_build_nncc.sh
+@@ -54,6 +54,16 @@ pushd $ROOT_PATH > /dev/null
+ mkdir -p ${NNCC_INSTALL_PREFIX}
+ ./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}"
+
++# create python virtual environment
++./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv"
++
++./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
++ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
++ install -U pip setuptools
++./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
++ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
++ install tensorflow-cpu==2.3.0rc0
++
+ mkdir -p ${ARCHIVE_PATH}
+ tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./
+
+diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh
+index 18809ad..ee0f183 100755
+--- a/infra/scripts/docker_build_tizen_cross.sh
++++ b/infra/scripts/docker_build_tizen_cross.sh
+@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ROOT_PATH="$CURRENT_PATH/../../"
+
+ # prepare rootfs
+-if [ ! -d $ROOTFS_DIR ]; then
++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
+ echo "It will use default rootfs path"
+ else
+ DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
+diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh
+index 556c5bd..55adaa1 100755
+--- a/infra/scripts/docker_collect_nnpkg_resources.sh
++++ b/infra/scripts/docker_collect_nnpkg_resources.sh
+@@ -60,7 +60,7 @@ pushd $ROOT_PATH > /dev/null
+ REQUIRED_UNITS=()
+ # Common Libraries
+ REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex")
+-REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "oops")
++REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone")
+ # Hermes Logging Framework
+ REQUIRED_UNITS+=("hermes" "hermes-std")
+ # loco IR and related utilities
+diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh
+index 5521b5f..640a0e0 100755
+--- a/infra/scripts/tizen_xu4_test.sh
++++ b/infra/scripts/tizen_xu4_test.sh
+@@ -23,7 +23,7 @@ function install_model()
+ {
+ # download tflite model files
+ pushd $HOST_HOME
+- tests/scripts/framework/run_test.sh --download=on
++ tests/scripts/framework/run_test.sh --download=on --run=off
+ # TODO Since this command removes model file(.zip),
+ # We must always download the file unlike model file(.tflite).
+ # Because caching applies only to tflite file.
+diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec
+index ce1cd0b..e26ffcb 100644
+--- a/packaging/nnfw.spec
++++ b/packaging/nnfw.spec
+@@ -30,7 +30,7 @@ BuildRequires: flatbuffers-devel
+ %ifarch %{arm} aarch64
+ # Require python for acl-ex library build pre-process
+ BuildRequires: python
+-BuildRequires: libarmcl-devel
++BuildRequires: libarmcl-devel >= v20.05
+ %endif
+
+ Requires(post): /sbin/ldconfig
+diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe
+new file mode 100644
+index 0000000..7322e90
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe
+@@ -0,0 +1,26 @@
++operand {
++ name: "ifm"
++ type: UINT8
++ shape { dim: 1 dim: 8 dim: 8 dim: 1 }
++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
++}
++operand {
++ name: "ofm"
++ type: UINT8
++ shape { dim: 1 dim: 7 dim: 7 dim: 1 }
++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
++}
++operation {
++ type: "AveragePool2D"
++ averagepool2d_options {
++ padding: VALID
++ stride_w: 1
++ stride_h: 1
++ filter_width: 2
++ filter_height: 2
++ }
++ input: "ifm"
++ output: "ofm"
++}
++input: "ifm"
++output: "ofm"
+diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe
+new file mode 100644
+index 0000000..a09afc1
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe
+@@ -0,0 +1,44 @@
++operand {
++ name: "ifm"
++ type: FLOAT32
++ shape { dim: 1 dim: 4 dim: 5 dim: 5 }
++}
++operand {
++ name: "ker"
++ type: FLOAT32
++ shape { dim: 1 dim: 1 dim: 2 dim: 25 }
++}
++operand {
++ name: "bias"
++ type: FLOAT32
++ shape { dim: 25 }
++ filler {
++ tag: "constant"
++ arg: "1.1"
++ }
++}
++operand {
++ name: "ofm"
++ type: FLOAT32
++ shape { dim: 1 dim: 2 dim: 2 dim: 25 }
++}
++operation {
++ type: "DepthwiseConv2D"
++ version: 2
++ depthwiseconv2d_options {
++ padding: VALID
++ stride_w: 2
++ stride_h: 2
++ dilation_w_factor: 2
++ dilation_h_factor: 1
++ depth_multiplier: 5
++ activation : RELU6
++ }
++ input: "ifm"
++ input: "ker"
++ input: "bias"
++ output: "ofm"
++}
++input: "ifm"
++input: "ker"
++output: "ofm"
+diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule
+new file mode 100644
+index 0000000..edfabc6
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule
+@@ -0,0 +1,3 @@
++# To check if DEPTHWISE_CONV_2D version is 2
++
++RULE "OP_VERSION_CHECK" $(op_version DEPTHWISE_CONV_2D) '=' 2
+diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe
+new file mode 100644
+index 0000000..5e0b6b5
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe
+@@ -0,0 +1,61 @@
++operand {
++ name: "ifm"
++ type: UINT8
++ shape { dim: 1 dim: 112 dim: 112 dim: 4 }
++ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 }
++}
++operand {
++ name: "ker"
++ type: UINT8
++ shape { dim: 1 dim: 3 dim: 3 dim: 4 }
++ filler {
++ tag: "gaussian"
++ arg: "0.0"
++ arg: "1.0"
++ }
++ quant {
++ min: -30.3175 min: -0.779597 min: -10.2751 min: -10.8594
++ max: 4.35049 max: 2.70807 max: 11.0269 max: 20.97
++ scale:0.135953 scale: 0.0136771 scale: 0.0835375 scale: 0.124821
++ zero_point:223 zero_point: 57 zero_point: 123 zero_point: 87
++ quantized_dimension: 3
++ }
++}
++operand {
++ name: "bias"
++ type: INT32
++ shape { dim: 4 }
++ filler {
++ tag: "gaussian"
++ arg: "0"
++ arg: "1.0"
++ }
++ quant {
++ scale: 1.4758e-16 scale: 3.15185e-05 scale: 2.20685e-05 scale: 1.72205e-16
++ zero_point: 0 zero_point: 0 zero_point: 0 zero_point: 0
++ }
++}
++operand {
++ name: "ofm"
++ type: UINT8
++ shape { dim: 1 dim: 112 dim: 112 dim: 4 }
++ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 }
++
++}
++operation {
++ type: "DepthwiseConv2D"
++ depthwiseconv2d_options {
++ padding: SAME
++ stride_w: 1
++ stride_h: 1
++ depth_multiplier: 1
++ activation : RELU6
++ }
++ input: "ifm"
++ input: "ker"
++ input: "bias"
++ output: "ofm"
++}
++input: "ifm"
++input: "ker"
++output: "ofm"
+diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe
+new file mode 100644
+index 0000000..3fff5cd
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe
+@@ -0,0 +1,22 @@
++operand {
++ name: "ifm1"
++ type: UINT8
++ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
++ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128}
++}
++operand {
++ name: "ofm"
++ type: UINT8
++ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
++ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128}
++}
++operation {
++ type: "L2Normalize"
++ l2norm_options {
++ activation: NONE
++ }
++ input: "ifm1"
++ output: "ofm"
++}
++input: "ifm1"
++output: "ofm"
+diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe
+new file mode 100644
+index 0000000..7b2a84d
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe
+@@ -0,0 +1,19 @@
++operand {
++ name: "ifm"
++ type: UINT8
++ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
++ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 }
++}
++operand {
++ name: "ofm"
++ type: UINT8
++ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
++ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 }
++}
++operation {
++ type: "Logistic"
++ input: "ifm"
++ output: "ofm"
++}
++input: "ifm"
++output: "ofm"
+diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
+index 79271a4..1313e26 100644
+--- a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
++++ b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
+@@ -10,7 +10,7 @@ operand {
+ operand {
+ name: "ker"
+ type: FLOAT32
+- shape { dim: 1 dim: 3 dim: 3 dim: 1 }
++ shape { dim: 3 dim: 1 dim: 1 dim: 3 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
+new file mode 100644
+index 0000000..887380c
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
+@@ -0,0 +1,27 @@
++operand {
++ name: "ifm"
++ type: FLOAT32
++ shape { dim: 4 }
++}
++operand {
++ name: "ofm"
++ type: FLOAT32
++ shape { }
++}
++operand {
++ name: "ofm_idx"
++ type: INT32
++ shape { dim: 4 }
++}
++operation {
++ type: "Unique"
++ unique_options {
++ idx_out_type: INT32
++ }
++ input: "ifm"
++ output: "ofm"
++ output: "ofm_idx"
++}
++input: "ifm"
++output: "ofm"
++output: "ofm_idx"
+diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_000/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
+new file mode 100644
+index 0000000..9beb516
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
+@@ -0,0 +1,27 @@
++operand {
++ name: "ifm"
++ type: FLOAT32
++ shape { dim: 4 }
++}
++operand {
++ name: "ofm"
++ type: FLOAT32
++ shape { }
++}
++operand {
++ name: "ofm_idx"
++ type: INT64
++ shape { dim: 4 }
++}
++operation {
++ type: "Unique"
++ unique_options {
++ idx_out_type: INT64
++ }
++ input: "ifm"
++ output: "ofm"
++ output: "ofm_idx"
++}
++input: "ifm"
++output: "ofm"
++output: "ofm_idx"
+diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_001/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
+new file mode 100644
+index 0000000..67b947f
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
+@@ -0,0 +1,27 @@
++operand {
++ name: "ifm"
++ type: INT32
++ shape { dim: 5 }
++}
++operand {
++ name: "ofm"
++ type: INT32
++ shape { }
++}
++operand {
++ name: "ofm_idx"
++ type: INT32
++ shape { dim: 5 }
++}
++operation {
++ type: "Unique"
++ unique_options {
++ idx_out_type: INT32
++ }
++ input: "ifm"
++ output: "ofm"
++ output: "ofm_idx"
++}
++input: "ifm"
++output: "ofm"
++output: "ofm_idx"
+diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.reverse b/res/TensorFlowLiteRecipes/Unique_002/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
+new file mode 100644
+index 0000000..375db66
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
+@@ -0,0 +1,27 @@
++operand {
++ name: "ifm"
++ type: INT32
++ shape { dim: 5 }
++}
++operand {
++ name: "ofm"
++ type: INT32
++ shape { }
++}
++operand {
++ name: "ofm_idx"
++ type: INT64
++ shape { dim: 5 }
++}
++operation {
++ type: "Unique"
++ unique_options {
++ idx_out_type: INT64
++ }
++ input: "ifm"
++ output: "ofm"
++ output: "ofm_idx"
++}
++input: "ifm"
++output: "ofm"
++output: "ofm_idx"
+diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.reverse b/res/TensorFlowLiteRecipes/Unique_003/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
+new file mode 100644
+index 0000000..d3985e4
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
+@@ -0,0 +1,28 @@
++operand {
++ name: "ifm"
++ type: UINT8
++ shape { dim: 4 }
++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
++}
++operand {
++ name: "ofm"
++ type: UINT8
++ shape { }
++}
++operand {
++ name: "ofm_idx"
++ type: INT32
++ shape { dim: 4 }
++}
++operation {
++ type: "Unique"
++ unique_options {
++ idx_out_type: INT32
++ }
++ input: "ifm"
++ output: "ofm"
++ output: "ofm_idx"
++}
++input: "ifm"
++output: "ofm"
++output: "ofm_idx"
+diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
+new file mode 100644
+index 0000000..b08dd85
+--- /dev/null
++++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
+@@ -0,0 +1,28 @@
++operand {
++ name: "ifm"
++ type: UINT8
++ shape { dim: 5 }
++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
++}
++operand {
++ name: "ofm"
++ type: UINT8
++ shape { }
++}
++operand {
++ name: "ofm_idx"
++ type: INT64
++ shape { dim: 5 }
++}
++operation {
++ type: "Unique"
++ unique_options {
++ idx_out_type: INT64
++ }
++ input: "ifm"
++ output: "ofm"
++ output: "ofm_idx"
++}
++input: "ifm"
++output: "ofm"
++output: "ofm_idx"
+diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse
+new file mode 100644
+index 0000000..e69de29
+diff --git a/runtime/libs/benchmark/CMakeLists.txt b/runtime/libs/benchmark/CMakeLists.txt
+index 2af0ffa..748b2d1 100644
+--- a/runtime/libs/benchmark/CMakeLists.txt
++++ b/runtime/libs/benchmark/CMakeLists.txt
+@@ -1,6 +1,5 @@
+ file(GLOB_RECURSE SOURCES "src/*.cpp")
+
+-add_library(nnfw_lib_benchmark SHARED ${SOURCES})
++add_library(nnfw_lib_benchmark STATIC ${SOURCES})
+ target_include_directories(nnfw_lib_benchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+ target_link_libraries(nnfw_lib_benchmark PRIVATE ${LIB_PTHREAD})
+-install(TARGETS nnfw_lib_benchmark DESTINATION lib)
+diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp
+index 7a3f9a5..df573da 100644
+--- a/runtime/libs/benchmark/src/Result.cpp
++++ b/runtime/libs/benchmark/src/Result.cpp
+@@ -166,7 +166,7 @@ Result::Result(const Phases &phases)
+ if (option.memory)
+ {
+ print_memory = true;
+- for (int i = PhaseEnum::MODEL_LOAD; i <= PhaseEnum::EXECUTE; ++i)
++ for (int i = PhaseEnum::MODEL_LOAD; i < PhaseEnum::EXECUTE; ++i)
+ {
+ auto phase = phases.at(gPhaseStrings[i]);
+ for (int j = MemoryType::RSS; j <= MemoryType::PSS; ++j)
+diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h
+index 031aabd..03a3aed 100644
+--- a/runtime/onert/api/include/nnfw.h
++++ b/runtime/onert/api/include/nnfw.h
+@@ -99,6 +99,8 @@ typedef enum {
+ NNFW_STATUS_ERROR = 1,
+ /** Unexpected null argument is given. */
+ NNFW_STATUS_UNEXPECTED_NULL = 2,
++  /** A function was called that is not valid for the current session state. */
++ NNFW_STATUS_INVALID_STATE = 3,
+ } NNFW_STATUS;
+
+ /**
+@@ -432,10 +434,10 @@ NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index,
+ *
+ * <p>Supported backends differs on each platforms.
+ * For example, `x86_64` supports "cpu" only.
+- * Can set multiple backends by semicolon (ex: "acl_cl;cpu").
+- * Among the multiple backends, the 1st element is used as default backend.</p>
+- *
+- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon", "srcn"
++ * Multiple backends can be set and they must be separated by a semicolon (ex: "acl_cl;cpu").
++ * For each backend string, `libbackend_{backend}.so` will be dynamically loaded during
++ * {@link nnfw_prepare}.
++ * Among the multiple backends, the 1st element is used as the default backend.</p>
+ *
+ * @param[in] session session to which available backends are set
+ * @param[in] backends available backends which nnfw uses
+@@ -449,12 +451,10 @@ NNFW_STATUS nnfw_set_available_backends(nnfw_session *session, const char *backe
+ *
+ * This function should be called before {@link nnfw_prepare} is invoked.
+ *
+- * <p>Supported backends differs on each platforms.
+- * For example, `x86_64` supports "cpu" only.
+- * The backend for op has higher priority than available backends specified by
+- * nnfw_set_available_backends.</p>
++ * <p>The backend for op has higher priority than available backends specified by
++ * {@link nnfw_set_available_backends}.</p>
+ *
+- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon"
++ * @deprecated Deprecated since 1.8.0.
+ *
+ * @param[in] session session to be modified
+ * @param[in] op operation to be set
+diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc
+index 0747583..34a46ed 100644
+--- a/runtime/onert/api/src/nnfw_api.cc
++++ b/runtime/onert/api/src/nnfw_api.cc
+@@ -31,6 +31,7 @@ STATIC_ASSERT_ENUM_CHECK(NNFW_TYPE_TENSOR_INT64, 5);
+ STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_NO_ERROR, 0);
+ STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_ERROR, 1);
+ STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_UNEXPECTED_NULL, 2);
++STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_INVALID_STATE, 3);
+
+ STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_NONE, 0);
+ STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_CHANNELS_LAST, 1);
+diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc
+index d03ddd4..b3390fa 100644
+--- a/runtime/onert/api/src/nnfw_api_internal.cc
++++ b/runtime/onert/api/src/nnfw_api_internal.cc
+@@ -76,7 +76,7 @@ nnfw_session::~nnfw_session() = default;
+ NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir)
+ {
+ if (!isStateInitialized())
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ if (!package_dir)
+ {
+@@ -156,7 +156,7 @@ NNFW_STATUS nnfw_session::prepare()
+ std::cerr << "invalid state";
+ }
+ std::cerr << std::endl;
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+ }
+
+ if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase())
+@@ -188,7 +188,7 @@ NNFW_STATUS nnfw_session::run()
+ {
+ std::cerr << "Error during nnfw_session::run : "
+ << "run should be run after prepare" << std::endl;
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+ }
+
+ try
+@@ -211,7 +211,7 @@ NNFW_STATUS nnfw_session::run_async()
+ {
+ std::cerr << "Error during nnfw_session::run_async : "
+ << "run_async should be run after prepare" << std::endl;
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+ }
+
+ _execution->startExecute();
+@@ -241,7 +241,7 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo
+ if (!isStatePreparedOrFinishedRun())
+ {
+ std::cerr << "Error during nnfw_session::set_input : invalid state" << std::endl;
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+ }
+
+ if (!buffer && length != 0)
+@@ -270,7 +270,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b
+ if (!isStatePreparedOrFinishedRun())
+ {
+ std::cerr << "Error during nnfw_session::set_output : invalid state" << std::endl;
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+ }
+
+ if (!buffer && length != 0)
+@@ -296,7 +296,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b
+ NNFW_STATUS nnfw_session::input_size(uint32_t *number)
+ {
+ if (isStateInitialized()) // Model is not loaded
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ try
+ {
+@@ -318,7 +318,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number)
+ NNFW_STATUS nnfw_session::output_size(uint32_t *number)
+ {
+ if (isStateInitialized()) // Model is not loaded
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ try
+ {
+@@ -410,7 +410,7 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti)
+ {
+ std::cerr << "Error during set_input_tensorinfo : should be run after load_model"
+ << std::endl;
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+ }
+
+ if (ti.rank <= 0 || ti.rank > NNFW_MAX_RANK)
+@@ -463,6 +463,9 @@ NNFW_STATUS nnfw_session::set_input_tensorinfo(uint32_t index, const nnfw_tensor
+
+ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
+ {
++ if (isStateInitialized())
++ return NNFW_STATUS_INVALID_STATE;
++
+ try
+ {
+ if (ti == nullptr)
+@@ -499,7 +502,7 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
+ NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
+ {
+ if (isStateInitialized())
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ if (ti == nullptr)
+ {
+@@ -570,7 +573,7 @@ static std::string get_op_backend_string(std::string op)
+ NNFW_STATUS nnfw_session::set_available_backends(const char *backends)
+ {
+ if (!isStateModelLoaded())
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ try
+ {
+@@ -596,7 +599,7 @@ NNFW_STATUS nnfw_session::set_available_backends(const char *backends)
+ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend)
+ {
+ if (!isStateModelLoaded())
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ try
+ {
+@@ -627,7 +630,7 @@ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend)
+ NNFW_STATUS nnfw_session::set_config(const char *key, const char *value)
+ {
+ if (!isStateModelLoaded())
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ auto &options = _compiler->options();
+
+@@ -693,7 +696,7 @@ onert::ir::Graph *nnfw_session::primary_subgraph()
+ NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size)
+ {
+ if (!isStateModelLoaded())
+- return NNFW_STATUS_ERROR;
++ return NNFW_STATUS_INVALID_STATE;
+
+ auto &options = _compiler->options();
+
+diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc
+index 3ca4058..4ab2d4c 100644
+--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc
++++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc
+@@ -31,6 +31,7 @@
+ #include "exec/FunctionSequence.h"
+ #include "util/logging.h"
+ #include "util/Utils.h"
++#include "AclKernelGen.h"
+
+ namespace onert
+ {
+@@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
+ const auto block_size_index{
+ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
+
+ assert(_ctx.at(block_size_index).data());
+
+ auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>();
+
+- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
+- ? arm_compute::SubDataType::BOOL
+- : arm_compute::SubDataType::NONE;
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+- auto fn = std::make_unique<::arm_compute::CLCast>();
++ std::unique_ptr<::arm_compute::IFunction> fn;
++ if (ifm_tensor->data_type() == ofm_tensor->data_type())
++ {
++ auto l = std::make_unique<::arm_compute::CLCopy>();
++
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);
++ fn = std::move(l);
++ }
++ else
++ {
++ auto l = std::make_unique<::arm_compute::CLCast>();
++
++ // TODO Support converting float to int32 as round down
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
++
++ fn = std::move(l);
++ }
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
+ ker_width, ker_height);
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto ker_alloc = _tensor_builder->at(ker_index).get();
+- auto bias_alloc = _tensor_builder->at(bias_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto ker_tensor = _tensor_builder->at(ker_index).get();
++ auto bias_tensor = _tensor_builder->at(bias_index).get();
+
+ const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
+ const auto act_info = acl_common::asActivationLayerInfo(activation);
+@@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
+ auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
+- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
++ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(),
++ ::arm_compute::Size2D(1U, 1U), act_info);
+
+ _return_fn = asAclClFunction(std::move(fn));
+ }
+@@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
+ const auto multiplier = node.param().multiplier;
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto ker_alloc = _tensor_builder->at(ker_index).get();
+- auto bias_alloc = _tensor_builder->at(bias_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto ker_tensor = _tensor_builder->at(ker_index).get();
++ auto bias_tensor = _tensor_builder->at(bias_index).get();
+
+ const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
+ const auto act_info = acl_common::asActivationLayerInfo(activation);
+@@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
+ {
+ auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();
+
+- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
+- ofm_alloc->handle(), conv_info, multiplier, act_info);
++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
++ ofm_tensor->handle(), conv_info, multiplier, act_info);
+
+ _return_fn = asAclClFunction(std::move(fn));
+ }
+@@ -217,19 +231,20 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
+ VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
+ VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
+- ::arm_compute::Size2D{kw, kh},
+- acl_common::asPadStrideInfo(padding, stride)};
++ ::arm_compute::PoolingLayerInfo info{
++ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh},
++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)};
+
+ auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
+@@ -260,19 +275,21 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
+ VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
+ VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
+- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride),
++ true /* exclude_padding */};
+
+ auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
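In both pooling hunks the PoolingLayerInfo is now built with the input tensor's data layout as an explicit argument, matching the newer ARM Compute Library constructor of the form PoolingLayerInfo(PoolingType, Size2D, DataLayout, PadStrideInfo[, exclude_padding]); the same adjustment appears in the L2Pool2D hunk further down. The pooling parameters themselves are unchanged; only the data layout is passed explicitly instead of being implied.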
+
+ void KernelGenerator::visit(const ir::operation::Concat &node)
+@@ -296,7 +313,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+ return;
+ }
+
+- auto output_alloc = _tensor_builder->at(ofm_index).get();
++ auto output_tensor = _tensor_builder->at(ofm_index).get();
+ std::vector<::arm_compute::ICLTensor *> input_tensors;
+ for (auto &ifm_ind : input_indexes)
+ input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());
+@@ -305,7 +322,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+ if (input_indexes.size() < 2)
+ {
+ auto l = std::make_unique<::arm_compute::CLCopy>();
+- l->configure(input_tensors.at(0), output_alloc->handle());
++ l->configure(input_tensors.at(0), output_tensor->handle());
+ fn = std::move(l);
+ }
+ else
+@@ -313,10 +330,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+ auto l = std::make_unique<::arm_compute::CLConcatenateLayer>();
+ const auto rank = _ctx.at(ofm_index).shape().rank();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = output_alloc->layout();
++ const auto backend_layout = output_tensor->layout();
+ const auto fixed_axis =
+ acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
+- l->configure(input_tensors, output_alloc->handle(), fixed_axis);
++ l->configure(input_tensors, output_tensor->handle(), fixed_axis);
+ fn = std::move(l);
+ }
+
+@@ -327,75 +344,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+
+ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
+ {
+- using ir::operation::FullyConnected;
+-
+ const auto output_index{node.getOutputs().at(0)};
+- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
+- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
+- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
+-
+- const auto input_rank = _ctx.at(input_index).shape().rank();
+-
+- const auto output_size =
+- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
+- UNUSED_RELEASE(output_size);
+- assert(_ctx.at(bias_index).shape().dim(0) == output_size);
+- assert(_ctx.at(weight_index).shape().dim(0) == output_size);
+- const auto batch_size =
+- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
+- const auto input_size =
+- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);
+-
+- // Check for reshaping input's shape into rank-2
+- bool needs_reshape = false;
+- ir::Shape reshape(2);
+- if (input_rank == 3 || input_rank == 4)
+- {
+- const auto &ifm_shape = _ctx.at(input_index).shape();
+- auto feature_size = 1;
+- for (int i = 0; i < ifm_shape.rank(); ++i)
+- {
+- feature_size *= ifm_shape.dim(i);
+- }
+-
+- UNUSED_RELEASE(feature_size);
+- assert(feature_size == batch_size * input_size);
+-
+- // for reshaping
+- needs_reshape = true;
+- reshape.dim(0) = batch_size; /* H */
+- reshape.dim(1) = input_size; /* W */
+- }
+-
++ auto output_tensor = _tensor_builder->at(output_index).get();
+ const auto activation = node.param().activation;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- const auto input_alloc = _tensor_builder->at(input_index).get();
+- const auto weight_alloc = _tensor_builder->at(weight_index).get();
+- const auto bias_alloc = _tensor_builder->at(bias_index).get();
+- const auto frontend_layout = _current_op_seq_layout;
+- const auto acl_layout = output_alloc->handle()->info()->data_layout();
+-
+- auto fn = std::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
+- _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+-
+- arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type =
+- arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL;
+- if (_ctx.at(weight_index).isConstant())
+- {
+- kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
+- assert(_ctx.at(weight_index).data());
+- }
+- fn->configure(
+- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
+- needs_reshape,
+- ::onert::backend::acl_common::asTensorShape(
+- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
+- kernel_type);
+-
++ auto fn = acl_common::kernelGenFullyConnected<acl_common::AclClFunction, ::arm_compute::ICLTensor,
++ ::arm_compute::CLFullyConnectedReshapingLayer>(
++ node, _ctx, _tensor_builder, _current_op_seq_layout);
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)),
+- ActivationBuilder::generate(activation, output_alloc->handle()));
++ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
+ }
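The FullyConnected lowering collapses into a call to the shared acl_common::kernelGenFullyConnected<...> template, presumably provided by the AclKernelGen.h header newly included at the top of this file. The deleted block above shows what that helper is expected to cover for the CL backend: deciding whether a rank-3/4 input must be reshaped to rank 2, choosing between the GENERAL and PREPROCESSED_WEIGHTS kernel types depending on whether the weights are constant, and configuring CLFullyConnectedReshapingLayer accordingly. The visitor itself now only resolves the output tensor and appends the fused activation.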
+
+ void KernelGenerator::visit(const ir::operation::Mul &node)
+@@ -406,17 +363,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
+ arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Reduce &node)
+@@ -427,14 +385,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ const auto keep_dims{node.param().keep_dims};
+ const auto reduce_type = node.param().reduce_type;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ // Convert to ACL axes taking into account negative values and possible duplicates.
+ const auto &axes = _ctx.at(axes_index);
+ const auto input_rank = _ctx.at(input_index).shape().rank();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = input_alloc->layout();
++ const auto backend_layout = input_tensor->layout();
+
+ std::unique_ptr<arm_compute::IFunction> fn;
+ if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
+@@ -443,7 +401,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+
+ const auto acl_axes =
+ acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
+- l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle());
++ l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle());
+
+ fn = std::move(l);
+ }
+@@ -453,7 +411,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+ const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
+- l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims,
++ l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims,
+ acl_common::convertReduceType(reduce_type));
+
+ fn = std::move(l);
+@@ -469,13 +427,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ // NOTE This operation must not be changed the layout from frontend to backend
+ // So, PermutationOperationPass makes layouts of frontend and backend the same.
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = output_alloc->layout();
++ const auto backend_layout = output_tensor->layout();
+ assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
+ frontend_layout == backend_layout);
+ UNUSED_RELEASE(frontend_layout);
+@@ -483,7 +441,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
+
+ auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -503,10 +461,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
+ (void)dims;
+ (void)ndim;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+ auto fn = std::make_unique<arm_compute::CLReshapeLayer>();
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+ auto acl_fn = asAclClFunction(std::move(fn));
+ _return_fn = std::move(acl_fn);
+ }
+@@ -516,15 +474,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<arm_compute::CLActivationLayer>();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -538,13 +496,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
+
+ const auto beta = node.param().beta;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), beta);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -558,10 +516,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
+ const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
+ const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
+
+- auto outputData_alloc = _tensor_builder->at(output_index).get();
+- auto inputData_alloc = _tensor_builder->at(input_index).get();
++ auto outputData_tensor = _tensor_builder->at(output_index).get();
++ auto inputData_tensor = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = inputData_alloc->layout();
++ const auto backend_layout = inputData_tensor->layout();
+
+ // Set initializers for indices data such as order of inputData
+ int input_rank = _ctx.at(input_index).shape().rank();
+@@ -613,7 +571,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
+
+ auto fn = std::make_unique<::arm_compute::CLSlice>();
+
+- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);
++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -628,10 +586,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
+ const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
+ const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
+
+- auto outputData_alloc = _tensor_builder->at(output_index).get();
+- auto inputData_alloc = _tensor_builder->at(input_index).get();
++ auto outputData_tensor = _tensor_builder->at(output_index).get();
++ auto inputData_tensor = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = inputData_alloc->layout();
++ const auto backend_layout = inputData_tensor->layout();
+
+ // Set initializers for indices data such as order of inputData
+ int input_rank = _ctx.at(input_index).shape().rank();
+@@ -704,7 +662,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
+
+ auto fn = std::make_unique<::arm_compute::CLStridedSlice>();
+
+- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set,
+ strides_set, begin_mask, end_mask, shrink_axis_mask);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+@@ -720,10 +678,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
+
+ const auto rank = _ctx.at(ifm_idx).shape().rank();
+
+- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = ifm_alloc->layout();
++ const auto backend_layout = ifm_tensor->layout();
+
+ std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
+ // Reversed
+@@ -732,7 +690,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
+
+ auto fn = std::make_unique<::arm_compute::CLPermute>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -747,17 +705,18 @@ void KernelGenerator::visit(const ir::operation::Add &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Sub &node)
+@@ -768,17 +727,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Div &node)
+@@ -789,16 +749,17 @@ void KernelGenerator::visit(const ir::operation::Div &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Exp &node)
+@@ -806,12 +767,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLExpLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -823,12 +784,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -842,20 +803,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
+ const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
+ const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto gamma_alloc = _tensor_builder->at(gamma_index).get();
+- auto beta_alloc = _tensor_builder->at(beta_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto gamma_tensor = _tensor_builder->at(gamma_index).get();
++ auto beta_tensor = _tensor_builder->at(beta_index).get();
+ auto epsilon = node.param().epsilon;
+ auto activation = node.param().activation;
+
+ auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
+- beta_alloc->handle(), epsilon);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(),
++ beta_tensor->handle(), epsilon);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Logistic &node)
+@@ -863,15 +825,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
+
+ auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -884,13 +846,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input0_alloc = _tensor_builder->at(input0_index).get();
+- auto input1_alloc = _tensor_builder->at(input1_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input0_tensor = _tensor_builder->at(input0_index).get();
++ auto input1_tensor = _tensor_builder->at(input1_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>();
+
+- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
+ ::arm_compute::BinaryLogicalOperation::AND);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+@@ -900,159 +862,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
+
+ void KernelGenerator::visit(const ir::operation::LSTM &node)
+ {
+- // TODO Support dynamic rnn
+- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
+- const auto scratch_buffer_index{
+- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+- const auto output_state_out_index{
+- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+- const auto cell_state_out_index{
+- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+-
+- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+- const auto input_to_input_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
+- const auto input_to_forget_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+- const auto input_to_cell_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+- const auto input_to_output_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+- const auto recurrent_to_input_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
+- const auto recurrent_to_forget_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+- const auto recurrent_to_cell_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+- const auto recurrent_to_output_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+- const auto cell_to_input_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
+- const auto cell_to_forget_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
+- const auto cell_to_output_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
+- const auto input_gate_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+- const auto forget_gate_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
+- const auto output_gate_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+- const auto projection_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
+- const auto projection_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
+- const auto output_state_in_index{
+- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
+- const auto cell_threshold = node.param().cell_threshold;
+- const auto projection_threshold = node.param().projection_threshold;
+-
+- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
+- bool has_recurrent_to_input_weights =
+- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
+- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
+- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
+- _ctx.at(projection_weights_index).shape().dim(1) != 0;
+- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
+-
+- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
+- // true: no CIFG
+- // false: CIFG
+- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
+- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+-
+- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
+- // But the cell_to_input_weights does not exist in regular CIFG although peephole.
+- // true: peephole
+- // false: no peephole
+- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
+-
+- // NOTE Although the projection weights has data the projection bias may not have data.
+- bool has_projection_param = has_projection_weights;
+-
+- const auto activation = node.param().activation;
+- const auto cell_clip = cell_threshold;
+- const auto projection_clip = projection_threshold;
+- assert(cell_clip >= 0.f && projection_clip >= 0.f);
+-
+- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
+- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
+- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
+- auto output_alloc = _tensor_builder->at(output_index).get();
+-
+- auto input_alloc = _tensor_builder->at(input_index).get();
+-
+- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
+- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
+- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
+- auto recurrent_to_forget_weights_alloc =
+- _tensor_builder->at(recurrent_to_forget_weights_index).get();
+- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
+- auto recurrent_to_output_weights_alloc =
+- _tensor_builder->at(recurrent_to_output_weights_index).get();
+-
+- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
+- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
+- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
+- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
+- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
+-
+- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
+-
+- auto fn = std::make_unique<::arm_compute::CLLSTMLayer>();
+-
+- ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
+- if (has_cifg_param)
+- {
+- auto input_to_input_weights_alloc =
+- _tensor_builder->at(input_to_input_weights_index).get(); // optional
+- auto recurrent_to_input_weights_alloc =
+- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
+- auto cell_to_input_weights_handle =
+- has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
+- : nullptr; // optional (non-cifg && peephole)
+- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
+- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
+- recurrent_to_input_weights_alloc->handle(),
+- cell_to_input_weights_handle, input_gate_bias_alloc->handle());
+- }
+- if (has_peephole_param)
+- {
+- auto cell_to_forget_weights_alloc =
+- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
+- auto cell_to_output_weights_alloc =
+- _tensor_builder->at(cell_to_output_weights_index).get(); // optional
+- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
+- cell_to_output_weights_alloc->handle());
+- }
+- if (has_projection_param)
+- {
+- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
+- auto projection_bias_handle = has_projection_bias
+- ? _tensor_builder->at(projection_bias_index).get()->handle()
+- : nullptr; // optional
+- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
+- }
+-
+- fn->configure(
+- input_alloc->handle(), input_to_forget_weights_alloc->handle(),
+- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
+- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
+- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
+- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
+- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
+- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
+- lstm_params, act_info, cell_clip, projection_clip);
+-
+- auto acl_fn = asAclClFunction(std::move(fn));
+-
+- _return_fn = std::move(acl_fn);
++ _return_fn = acl_common::kernelGenLSTM<acl_common::AclClFunction, ::arm_compute::ICLTensor,
++ ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_builder);
+ }
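LSTM gets the same treatment: the entire configuration block deleted above (resolving the many operand indices, detecting the CIFG, peephole and projection variants from the weight shapes, filling LSTMParams and configuring CLLSTMLayer) moves into the shared acl_common::kernelGenLSTM<...> helper, parameterised on the function wrapper, tensor type and layer class so that, presumably, the NEON backend can reuse the same code path.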
+
+ void KernelGenerator::visit(const ir::operation::Comparison &node)
+@@ -1063,13 +874,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
+
+ const auto comparison_type = node.param().comparison_type;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input0_alloc = _tensor_builder->at(input0_index).get();
+- auto input1_alloc = _tensor_builder->at(input1_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input0_tensor = _tensor_builder->at(input0_index).get();
++ auto input1_tensor = _tensor_builder->at(input1_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLComparison>();
+
+- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
+ (arm_compute::ComparisonOperation)comparison_type);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+@@ -1107,13 +918,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
+ for (const auto &input_index : input_indexes)
+ {
+ size_t input_rank = _ctx.at(input_index).shape().rank();
+- const auto &input_alloc = _tensor_builder->at(input_index);
+- orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape());
+- assert(input_rank == input_alloc->num_dimensions());
+- if (input_rank != input_alloc->info()->num_dimensions())
++ const auto &input_tensor = _tensor_builder->at(input_index);
++ orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape());
++ assert(input_rank == input_tensor->num_dimensions());
++ if (input_rank != input_tensor->info()->num_dimensions())
+ {
+ // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
+- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
+ _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
+ }
+ }
+@@ -1135,8 +946,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+ const auto ofm_idx{node.getOutputs().at(0)};
+ const auto ifm_idx{node.getInputs().at(0)};
+ const auto permute_type = node.getPermuteType();
+- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
+ const auto rank = _ctx.at(ofm_idx).shape().rank();
+ assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
+
+@@ -1149,7 +960,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+
+ auto l = std::make_unique<::arm_compute::CLPermute>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
+
+ fn = std::move(l);
+ }
+@@ -1160,7 +971,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+
+ auto l = std::make_unique<::arm_compute::CLPermute>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
+
+ fn = std::move(l);
+ }
+@@ -1168,7 +979,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+ {
+ auto l = std::make_unique<::arm_compute::CLCopy>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ fn = std::move(l);
+ }
+@@ -1183,12 +994,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ _return_fn = asAclClFunction(std::move(fn));
+ }
+@@ -1198,15 +1009,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<arm_compute::CLActivationLayer>();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1219,12 +1030,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
+
+ const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLScale>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(),
+ ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
+ ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
+
+@@ -1238,15 +1049,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
+
+ auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1258,15 +1069,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
+
+ auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1288,25 +1099,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
+
+ const auto activation = node.param().activation;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get();
+
+- auto input_alloc = _tensor_builder->at(input_index).get();
+- auto weights_alloc = _tensor_builder->at(weights_index).get();
+- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
+- auto bias_alloc = _tensor_builder->at(bias_index).get();
+- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
++ auto weights_tensor = _tensor_builder->at(weights_index).get();
++ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get();
++ auto bias_tensor = _tensor_builder->at(bias_index).get();
++ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get();
+ auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
+
+ auto copy_layer = std::make_unique<::arm_compute::CLCopy>();
+- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
++ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
+ _return_fn = asAclClFunction(std::move(copy_layer));
+
+- auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>(
++ auto fn = std::make_unique<::arm_compute::CLRNNLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
+- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
+- act_info);
++ fn->configure(input_tensor->handle(), weights_tensor->handle(),
++ recurrent_weights_tensor->handle(), bias_tensor->handle(),
++ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
+ _return_fn = asAclClFunction(std::move(fn));
+ }
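Besides the rename, the RNN visitor drops the custom ::arm_compute::CLRNNLayerEx in favour of the stock ::arm_compute::CLRNNLayer; the constructor still receives the backend's internal buffer manager and the configure arguments are unchanged, only reflowed.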
+
+@@ -1315,12 +1126,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLFloor>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1335,10 +1146,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
+ node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+ const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+- auto paddings_alloc = _tensor_builder->at(paddings_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
++ auto paddings_tensor = _tensor_builder->at(paddings_index).get();
+
+ assert(_ctx.at(block_size_index).data());
+ assert(_ctx.at(paddings_index).data());
+@@ -1346,8 +1157,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
+ std::unique_ptr<::arm_compute::IFunction> fn;
+
+ auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>();
+- l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
+- ofm_alloc->handle());
++ l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
++ ofm_tensor->handle());
+ fn = std::move(l);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+@@ -1362,12 +1173,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
+
+ auto block_size = node.param().block_size;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+- auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>();
++ auto fn = std::make_unique<::arm_compute::CLSpaceToDepthLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1389,19 +1200,21 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
++ ifm_tensor->info()->data_layout(),
+ ::onert::backend::acl_common::asPadStrideInfo(padding, stride)};
+
+ auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclClFunction(std::move(fn)),
++ ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
+@@ -1410,13 +1223,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
+ const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
+ const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
+- auto values_alloc = _tensor_builder->at(values_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
++ auto values_tensor = _tensor_builder->at(values_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>();
+
+- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
++ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1442,15 +1255,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
+ float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
+ float bias = 0.0f; // Don't offset the reduction.
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
+ radius, alpha, beta, bias, false);
+
+ auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1466,17 +1279,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
+ const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
+ const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto hits_alloc = _tensor_builder->at(hits_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto hits_tensor = _tensor_builder->at(hits_index).get();
+
+- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
+- auto keys_alloc = _tensor_builder->at(keys_index).get();
+- auto values_alloc = _tensor_builder->at(values_index).get();
++ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
++ auto keys_tensor = _tensor_builder->at(keys_index).get();
++ auto values_tensor = _tensor_builder->at(values_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLHashtableLookup>();
+
+- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
+- output_alloc->handle(), hits_alloc->handle());
++ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
++ output_tensor->handle(), hits_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1489,13 +1302,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
+ const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
+ const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto alpha_alloc = _tensor_builder->at(alpha_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto alpha_tensor = _tensor_builder->at(alpha_index).get();
+
+- auto fn = std::make_unique<::arm_compute::CLPReLU>();
++ auto fn = std::make_unique<::arm_compute::CLPReluLayer>();
+
+- fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
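Two more visitors move from custom kernels to classes the ARM Compute Library now ships upstream: SpaceToDepth switches from CLSpaceToDepth to CLSpaceToDepthLayer and PReLU from CLPReLU to CLPReluLayer. In both cases the configure calls keep the same operands, so these appear to be straight substitutions tracking an ACL version bump.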
+@@ -1518,7 +1331,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
+ (node.param().padding.type == ir::PaddingType::VALID));
+ auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
+ ker_shape.W, ker_shape.H);
+-
+ uint32_t invalid_horizontal = 0;
+ uint32_t invalid_vertical = 0;
+ if (node.param().padding.type == ir::PaddingType::VALID)
+@@ -1528,17 +1340,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
+ invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
+ }
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto ker_alloc = _tensor_builder->at(ker_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto ker_tensor = _tensor_builder->at(ker_index).get();
+
+ const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
+
+ auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
+- invalid_horizontal, invalid_vertical);
++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(),
++ tconv_info, invalid_horizontal, invalid_vertical);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1550,15 +1362,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
+
+ auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1571,13 +1383,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input0_alloc = _tensor_builder->at(input0_index).get();
+- auto input1_alloc = _tensor_builder->at(input1_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input0_tensor = _tensor_builder->at(input0_index).get();
++ auto input1_tensor = _tensor_builder->at(input1_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLBitwiseOr>();
+
+- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1589,12 +1401,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLBitwiseNot>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1607,13 +1419,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1634,13 +1446,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node)
+
+ const auto k = node.param().k;
+
+- auto values_alloc = _tensor_builder->at(outputValues_index).get();
+- auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
+- auto input_alloc = _tensor_builder->at(inputData_index).get();
++ auto values_tensor = _tensor_builder->at(outputValues_index).get();
++ auto indices_tensor = _tensor_builder->at(outputIndices_index).get();
++ auto input_tensor = _tensor_builder->at(inputData_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLTopKV2>();
+
+- fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());
++ fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1659,9 +1471,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
+ const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
+ const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto indices_alloc = _tensor_builder->at(indices_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto indices_tensor = _tensor_builder->at(indices_index).get();
+
+ // NOTE The frontend layout and backend layout must be the same for this operation.
+ // If not the same, we have to add a stage(?) to perform permutation of output tensor. It
+@@ -1671,43 +1483,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
+ // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
+ // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
+ // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
+- const auto backend_layout = ofm_alloc->layout();
++ const auto backend_layout = ofm_tensor->layout();
+ UNUSED_RELEASE(backend_layout);
+- assert(backend_layout == ifm_alloc->layout());
+- assert(backend_layout == indices_alloc->layout());
++ assert(backend_layout == ifm_tensor->layout());
++ assert(backend_layout == indices_tensor->layout());
+ assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
+
+ auto fn = std::make_unique<::arm_compute::CLGatherEx>();
+
+ // input is n-D, indices k-D, output is (n + k - 1)-D
+ size_t n = ifm_rank;
+- assert(n == ifm_alloc->num_dimensions());
++ assert(n == ifm_tensor->num_dimensions());
+ size_t k = _ctx.at(indices_index).shape().rank();
+- assert(k == indices_alloc->num_dimensions());
++ assert(k == indices_tensor->num_dimensions());
+
+ // Disable applied dim_correction
+- const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape();
+- if (n != ifm_alloc->info()->num_dimensions())
++ const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape();
++ if (n != ifm_tensor->info()->num_dimensions())
+ {
+ // This means that the higher dimensions are 1 and dim_correction has been applied to the ifm tensor
+ const auto ifm = _ctx.at(ifm_index);
+- ifm_alloc->info()->set_tensor_shape(
++ ifm_tensor->info()->set_tensor_shape(
+ acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
+ }
+- const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape();
+- if (k != indices_alloc->info()->num_dimensions())
++ const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape();
++ if (k != indices_tensor->info()->num_dimensions())
+ {
+ // This means that the higher dimensions are 1 and dim_correction has been applied to the indices tensor
+ const auto indices = _ctx.at(indices_index);
+- indices_alloc->info()->set_tensor_shape(
++ indices_tensor->info()->set_tensor_shape(
+ acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
+ }
+
+- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
++ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
+
+ // Revert disabling applied dim_correction
+- ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
+- indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
++ ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
++ indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1719,12 +1531,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLNeg>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1736,15 +1548,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
+
+ auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1761,11 +1573,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
+
+ assert((ifm_shape.rank() - 1) == ofm_shape.rank());
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
+ auto frontend_layout = _current_op_seq_layout;
+- auto backend_layout = ifm_alloc->layout();
++ auto backend_layout = ifm_tensor->layout();
+
+ int axis_value = node.param().axis;
+ if (axis_value < 0)
+@@ -1776,10 +1588,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
+ auto acl_axis =
+ acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
+
+- auto fn = std::make_unique<::arm_compute::CLArgOperation>();
++ auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
+- ::arm_compute::ArgOperation::MAX);
++ fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(),
++ ::arm_compute::ReductionOperation::ARG_IDX_MAX);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1791,12 +1603,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+- auto fn = std::make_unique<::arm_compute::CLCast>();
++ auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE);
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1814,15 +1626,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
+ auto beta = node.param().beta;
+ auto bias = node.param().bias;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const auto norm_info = ::arm_compute::NormalizationLayerInfo(
+ ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
+
+ auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1837,12 +1649,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
+ auto block_size = node.param().block_size;
+ assert(block_size > 0);
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+- auto fn = std::make_unique<::arm_compute::CLDepthToSpace>();
++ auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -1860,13 +1672,13 @@ void KernelGenerator::visit(const ir::operation::Split &node)
+ for (const auto &output : node.getOutputs())
+ output_indexes.emplace_back(output);
+
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- std::vector<arm_compute::ICLTensor *> output_allocs;
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ std::vector<arm_compute::ICLTensor *> output_tensors;
+ for (const auto &ofm_ind : output_indexes)
+- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
++ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
+
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = ifm_alloc->layout();
++ const auto backend_layout = ifm_tensor->layout();
+ auto axis = node.param().axis;
+ if (axis < 0)
+ axis += ifm_rank;
+@@ -1874,7 +1686,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
+
+ auto fn = std::make_unique<::arm_compute::CLSplit>();
+
+- fn->configure(ifm_alloc->handle(), output_allocs, axis);
++ fn->configure(ifm_tensor->handle(), output_tensors, axis);
+
+ _return_fn = asAclClFunction(std::move(fn));
+ }
+@@ -1906,13 +1718,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
+ for (const auto &output_index : output_indexes)
+ {
+ size_t output_rank = _ctx.at(output_index).shape().rank();
+- const auto &output_alloc = _tensor_builder->at(output_index);
+- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
+- assert(output_rank == output_alloc->num_dimensions());
+- if (output_rank != output_alloc->info()->num_dimensions())
++ const auto &output_tensor = _tensor_builder->at(output_index);
++ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
++ assert(output_rank == output_tensor->num_dimensions());
++ if (output_rank != output_tensor->info()->num_dimensions())
+ {
+ // This means that the higher dimensions are 1 and dim_correction has been applied to the output tensor
+- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
++ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
+ _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
+ }
+ }
+@@ -1959,12 +1771,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
+
+ // Disable applied dim_correction
+ size_t input_rank = _ctx.at(input_index).shape().rank();
+- const auto &input_alloc = _tensor_builder->at(input_index);
+- assert(input_rank == input_alloc->num_dimensions());
+- if (input_rank != input_alloc->info()->num_dimensions())
++ const auto &input_tensor = _tensor_builder->at(input_index);
++ assert(input_rank == input_tensor->num_dimensions());
++ if (input_rank != input_tensor->info()->num_dimensions())
+ {
+ // This means that the higher dimensions are 1 and dim_correction has been applied to the input tensor
+- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
+ _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
+ }
+
+@@ -1982,13 +1794,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLElementwiseMin>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -2001,13 +1813,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLElementwiseMax>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+
+@@ -2019,12 +1831,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE,
+ 0);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+@@ -2037,12 +1849,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE,
+ 0);
+
+ auto acl_fn = asAclClFunction(std::move(fn));
+diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h
+new file mode 100644
+index 0000000..6253434
+--- /dev/null
++++ b/runtime/onert/backend/acl_common/AclKernelGen.h
+@@ -0,0 +1,269 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
++#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
++
++#include <exec/IFunction.h>
++#include <ir/Operands.h>
++
++#include <ir/operation/LSTM.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace acl_common
++{
++
++template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
++ typename T_TensorBuilder>
++std::unique_ptr<exec::IFunction>
++kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands,
++ const std::shared_ptr<T_TensorBuilder> &tensor_builder)
++{
++ // TODO Support dynamic rnn
++ // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
++ const auto scratch_buffer_index{
++ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
++ const auto output_state_out_index{
++ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
++ const auto cell_state_out_index{
++ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
++ const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
++
++ const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
++ const auto input_to_input_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
++ const auto input_to_forget_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
++ const auto input_to_cell_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
++ const auto input_to_output_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
++ const auto recurrent_to_input_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
++ const auto recurrent_to_forget_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
++ const auto recurrent_to_cell_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
++ const auto recurrent_to_output_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
++ const auto cell_to_input_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
++ const auto cell_to_forget_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
++ const auto cell_to_output_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
++ const auto input_gate_bias_index{
++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
++ const auto forget_gate_bias_index{
++ node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
++ const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
++ const auto output_gate_bias_index{
++ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
++ const auto projection_weights_index{
++ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
++ const auto projection_bias_index{
++ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
++ const auto output_state_in_index{
++ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
++ const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
++ const auto cell_threshold = node.param().cell_threshold;
++ const auto projection_threshold = node.param().projection_threshold;
++
++ bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
++ operands.at(input_to_input_weights_index).shape().dim(1) != 0;
++ bool has_recurrent_to_input_weights =
++ operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
++ operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
++ bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0;
++ bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0;
++ bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 &&
++ operands.at(projection_weights_index).shape().dim(1) != 0;
++ bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0);
++
++ // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
++ // true: no CIFG
++ // false: CIFG
++ // NOTE The cell_to_input_weights does not exist in non-peephole mode, even for a regular (non-CIFG) LSTM.
++ bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
++
++ // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole mode.
++ // However, the cell_to_input_weights does not exist in CIFG mode even when peephole is enabled.
++ // true: peephole
++ // false: no peephole
++ bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
++
++ // NOTE Although the projection weights have data, the projection bias may not.
++ bool has_projection_param = has_projection_weights;
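++
++ // For illustration: a CIFG LSTM ships without input-gate weights, so the shape checks above
++ // leave has_cifg_param false and the set_cifg_params() call below is skipped; a peephole LSTM
++ // sets has_peephole_param and passes its cell-to-gate weights via set_peephole_params().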
++
++ const auto activation = node.param().activation;
++ const auto cell_clip = cell_threshold;
++ const auto projection_clip = projection_threshold;
++ assert(cell_clip >= 0.f && projection_clip >= 0.f);
++
++ auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get();
++ auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get();
++ auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get();
++ auto output_tensor = tensor_builder->at(output_index).get();
++
++ auto input_tensor = tensor_builder->at(input_index).get();
++
++ auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get();
++ auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get();
++ auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get();
++ auto recurrent_to_forget_weights_tensor =
++ tensor_builder->at(recurrent_to_forget_weights_index).get();
++ auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get();
++ auto recurrent_to_output_weights_tensor =
++ tensor_builder->at(recurrent_to_output_weights_index).get();
++
++ auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get();
++ auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get();
++ auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get();
++ auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get();
++ auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get();
++
++ auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
++
++ auto fn = std::make_unique<T_ACLLayer>();
++
++ ::arm_compute::LSTMParams<T_Tensor> lstm_params{};
++ if (has_cifg_param)
++ {
++ auto input_to_input_weights_tensor =
++ tensor_builder->at(input_to_input_weights_index).get(); // optional
++ auto recurrent_to_input_weights_tensor =
++ tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
++ auto cell_to_input_weights_handle =
++ has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle()
++ : nullptr; // optional (non-cifg && peephole)
++ auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional
++ lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(),
++ recurrent_to_input_weights_tensor->handle(),
++ cell_to_input_weights_handle, input_gate_bias_tensor->handle());
++ }
++ if (has_peephole_param)
++ {
++ auto cell_to_forget_weights_tensor =
++ tensor_builder->at(cell_to_forget_weights_index).get(); // optional
++ auto cell_to_output_weights_tensor =
++ tensor_builder->at(cell_to_output_weights_index).get(); // optional
++ lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(),
++ cell_to_output_weights_tensor->handle());
++ }
++ if (has_projection_param)
++ {
++ auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional
++ auto projection_bias_handle = has_projection_bias
++ ? tensor_builder->at(projection_bias_index).get()->handle()
++ : nullptr; // optional
++ lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle);
++ }
++
++ fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(),
++ input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(),
++ recurrent_to_forget_weights_tensor->handle(),
++ recurrent_to_cell_weights_tensor->handle(),
++ recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(),
++ cell_bias_tensor->handle(), output_gate_bias_tensor->handle(),
++ output_state_in_tensor->handle(), cell_state_in_tensor->handle(),
++ scratch_buffer_tensor->handle(), output_state_out_tensor->handle(),
++ cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info,
++ cell_clip, projection_clip);
++
++ return std::make_unique<T_FunctionWrapper>(std::move(fn));
++}
++
++template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
++ typename T_TensorBuilder>
++std::unique_ptr<exec::IFunction>
++kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands,
++ const std::shared_ptr<T_TensorBuilder> &tensor_builder, ir::Layout layout)
++{
++ using ir::operation::FullyConnected;
++
++ const auto output_index{node.getOutputs().at(0)};
++ const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
++ const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
++ const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
++
++ const auto input_rank = operands.at(input_index).shape().rank();
++
++ const auto output_size =
++ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1);
++ UNUSED_RELEASE(output_size);
++ assert(operands.at(bias_index).shape().dim(0) == output_size);
++ assert(operands.at(weight_index).shape().dim(0) == output_size);
++ const auto batch_size =
++ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2);
++ const auto input_size =
++ operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1);
++
++ // Check for reshaping input's shape into rank-2
++ bool needs_reshape = false;
++ ir::Shape reshape(2);
++ if (input_rank == 3 || input_rank == 4)
++ {
++ const auto &ifm_shape = operands.at(input_index).shape();
++ auto feature_size = 1;
++ for (int i = 0; i < ifm_shape.rank(); ++i)
++ {
++ feature_size *= ifm_shape.dim(i);
++ }
++
++ UNUSED_RELEASE(feature_size);
++ assert(feature_size == batch_size * input_size);
++
++ // for reshaping
++ needs_reshape = true;
++ reshape.dim(0) = batch_size; /* H */
++ reshape.dim(1) = input_size; /* W */
++ }
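++ // Worked example (hypothetical shapes): a rank-3 input of [2, 3, 4] with batch_size 2 and
++ // input_size 12 gives feature_size 24 == 2 * 12, so the input is reshaped to the rank-2
++ // shape [2, 12] before the fully connected layer runs.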
++
++ auto output_tensor = tensor_builder->at(output_index).get();
++ const auto input_tensor = tensor_builder->at(input_index).get();
++ const auto weight_tensor = tensor_builder->at(weight_index).get();
++ const auto bias_tensor = tensor_builder->at(bias_index).get();
++ const auto frontend_layout = layout;
++ const auto acl_layout = output_tensor->handle()->info()->data_layout();
++
++ auto fn =
++ std::make_unique<T_ACLLayer>(tensor_builder->acl_tensor_manager()->internal_buffer_manager());
++
++ typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL;
++ if (operands.at(weight_index).isConstant())
++ {
++ kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS;
++ assert(operands.at(weight_index).data());
++ }
++
++ fn->configure(
++ input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(),
++ output_tensor->handle(), needs_reshape,
++ ::onert::backend::acl_common::asTensorShape(
++ reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
++ kernel_type);
++
++ return std::make_unique<T_FunctionWrapper>(std::move(fn));
++}
++
++} // namespace acl_common
++} // namespace backend
++} // namespace onert
++
++#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
+diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc
+index e471867..37ec993 100644
+--- a/runtime/onert/backend/acl_neon/KernelGenerator.cc
++++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc
+@@ -31,6 +31,7 @@
+ #include "exec/NopFunction.h"
+ #include "util/logging.h"
+ #include "util/Utils.h"
++#include "AclKernelGen.h"
+
+ namespace onert
+ {
+@@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
+
+ auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
+
+ const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto frontend_layout = _current_op_seq_layout;
+- auto backend_layout = ifm_alloc->layout();
++ auto backend_layout = ifm_tensor->layout();
+
+ int axis_value = node.param().axis;
+ if (axis_value < 0)
+@@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
+
+ auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>();
+
+- fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(),
++ fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(),
+ arm_compute::ReductionOperation::ARG_IDX_MAX);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+@@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
+ const auto block_size_index{
+ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
+
+ assert(_ctx.at(block_size_index).data());
+
+ auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>();
+
+- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+- auto fn = std::make_unique<::arm_compute::NECast>();
++ std::unique_ptr<::arm_compute::IFunction> fn;
++ if (ifm_tensor->data_type() == ofm_tensor->data_type())
++ {
++ auto l = std::make_unique<::arm_compute::NECopy>();
++
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
++
++ fn = std::move(l);
++ }
++ else
++ {
++ auto l = std::make_unique<::arm_compute::NECast>();
+
+- auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
+- ? arm_compute::SubDataType::BOOL
+- : arm_compute::SubDataType::NONE;
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
++
++ fn = std::move(l);
++ }
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
+ ker_width, ker_height);
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto ker_alloc = _tensor_builder->at(ker_index).get();
+- auto bias_alloc = _tensor_builder->at(bias_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto ker_tensor = _tensor_builder->at(ker_index).get();
++ auto bias_tensor = _tensor_builder->at(bias_index).get();
+
+ const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
+ const auto act_info = acl_common::asActivationLayerInfo(activation);
+@@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
+ auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
+- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
++ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(),
++ ::arm_compute::Size2D(1U, 1U), act_info);
+
+ _return_fn = asAclFunction(std::move(fn));
+ }
+@@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
+ auto block_size = node.param().block_size;
+ assert(block_size > 0);
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+- auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>();
++ auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
+ const auto multiplier = node.param().multiplier;
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto ker_alloc = _tensor_builder->at(ker_index).get();
+- auto bias_alloc = _tensor_builder->at(bias_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto ker_tensor = _tensor_builder->at(ker_index).get();
++ auto bias_tensor = _tensor_builder->at(bias_index).get();
+
+ const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
+ const auto act_info = acl_common::asActivationLayerInfo(activation);
+@@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
+ {
+ auto fn = std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>();
+
+- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
+- ofm_alloc->handle(), conv_info, multiplier, act_info);
++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
++ ofm_tensor->handle(), conv_info, multiplier, act_info);
+
+ _return_fn = asAclFunction(std::move(fn));
+ }
+@@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -305,19 +318,19 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
+ VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
+ VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
+- ::arm_compute::Size2D{kw, kh},
+- acl_common::asPadStrideInfo(padding, stride)};
++ ::arm_compute::PoolingLayerInfo info{
++ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh},
++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)};
+
+ auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
+@@ -348,19 +361,20 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
+ VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
+ VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
+- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride),
++ true /* exclude_padding */};
+
+ auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Concat &node)
+@@ -383,7 +397,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+ return;
+ }
+
+- auto output_alloc = _tensor_builder->at(ofm_index).get();
++ auto output_tensor = _tensor_builder->at(ofm_index).get();
+ std::vector<::arm_compute::ITensor *> input_tensors;
+ for (const auto &ifm_ind : input_indexes)
+ input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());
+@@ -392,7 +406,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+ if (input_indexes.size() < 2)
+ {
+ auto l = std::make_unique<::arm_compute::NECopy>();
+- l->configure(input_tensors.at(0), output_alloc->handle());
++ l->configure(input_tensors.at(0), output_tensor->handle());
+ fn = std::move(l);
+ }
+ else
+@@ -400,10 +414,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+ auto l = std::make_unique<::arm_compute::NEConcatenateLayer>();
+ const auto rank = _ctx.at(ofm_index).shape().rank();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = output_alloc->layout();
++ const auto backend_layout = output_tensor->layout();
+ const auto fixed_axis =
+ acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
+- l->configure(input_tensors, output_alloc->handle(), fixed_axis);
++ l->configure(input_tensors, output_tensor->handle(), fixed_axis);
+ fn = std::move(l);
+ }
+
+@@ -418,13 +432,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
+ const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
+ const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
+- auto values_alloc = _tensor_builder->at(values_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
++ auto values_tensor = _tensor_builder->at(values_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>();
+
+- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
++ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -436,12 +450,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEFloor>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -450,76 +464,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
+
+ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
+ {
+- using ir::operation::FullyConnected;
+-
+ const auto output_index{node.getOutputs().at(0)};
+- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
+- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
+- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
+-
+- const auto input_rank = _ctx.at(input_index).shape().rank();
+-
+- const auto output_size =
+- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
+- UNUSED_RELEASE(output_size);
+- assert(_ctx.at(bias_index).shape().dim(0) == output_size);
+- assert(_ctx.at(weight_index).shape().dim(0) == output_size);
+- const auto batch_size =
+- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
+- const auto input_size =
+- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);
+-
+- // Check for reshaping input's shape into rank-2
+- bool needs_reshape = false;
+- ir::Shape reshape(2);
+- if (input_rank == 3 || input_rank == 4)
+- {
+- const auto &ifm_shape = _ctx.at(input_index).shape();
+- auto feature_size = 1;
+- for (int i = 0; i < ifm_shape.rank(); ++i)
+- {
+- feature_size *= ifm_shape.dim(i);
+- }
+-
+- UNUSED_RELEASE(feature_size);
+- assert(feature_size == batch_size * input_size);
+-
+- // for reshaping
+- needs_reshape = true;
+- reshape.dim(0) = batch_size; /* H */
+- reshape.dim(1) = input_size; /* W */
+- }
+-
++ auto output_tensor = _tensor_builder->at(output_index).get();
+ const auto activation = node.param().activation;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- const auto input_alloc = _tensor_builder->at(input_index).get();
+- const auto weight_alloc = _tensor_builder->at(weight_index).get();
+- const auto bias_alloc = _tensor_builder->at(bias_index).get();
+- const auto frontend_layout = _current_op_seq_layout;
+- const auto acl_layout = output_alloc->handle()->info()->data_layout();
+-
+- auto fn = std::make_unique<arm_compute::NEFullyConnectedReshapingLayer>(
+- _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+-
+- arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type =
+- arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL;
+- if (_ctx.at(weight_index).isConstant())
+- {
+- kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
+- assert(_ctx.at(weight_index).data());
+- }
+-
+- fn->configure(
+- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
+- needs_reshape,
+- ::onert::backend::acl_common::asTensorShape(
+- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
+- kernel_type);
+-
++ auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
++ ::arm_compute::NEFullyConnectedReshapingLayer>(
++ node, _ctx, _tensor_builder, _current_op_seq_layout);
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)),
+- ActivationBuilder::generate(activation, output_alloc->handle()));
++ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
+@@ -531,17 +484,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
+ const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
+ const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto hits_alloc = _tensor_builder->at(hits_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto hits_tensor = _tensor_builder->at(hits_index).get();
+
+- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
+- auto keys_alloc = _tensor_builder->at(keys_index).get();
+- auto values_alloc = _tensor_builder->at(values_index).get();
++ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
++ auto keys_tensor = _tensor_builder->at(keys_index).get();
++ auto values_tensor = _tensor_builder->at(values_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEHashtableLookup>();
+
+- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
+- output_alloc->handle(), hits_alloc->handle());
++ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
++ output_tensor->handle(), hits_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -561,10 +514,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
+ // Converting in reverse order
+ const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto indices_alloc = _tensor_builder->at(indices_index).get();
+- const auto backend_layout = ofm_alloc->layout();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto indices_tensor = _tensor_builder->at(indices_index).get();
++ const auto backend_layout = ofm_tensor->layout();
+ UNUSED_RELEASE(backend_layout);
+
+ // NOTE The frontend layout and backend layout must be the same for this operation.
+@@ -575,35 +528,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
+ // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
+ // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
+ // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
+- assert(backend_layout == ifm_alloc->layout());
+- assert(backend_layout == indices_alloc->layout());
++ assert(backend_layout == ifm_tensor->layout());
++ assert(backend_layout == indices_tensor->layout());
+ assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
+
+ auto fn = std::make_unique<::arm_compute::NEGatherEx>();
+
+ // input is n-D, indices k-D, output is (n + k - 1)-D
+ size_t n = ifm_rank;
+- assert(n == ifm_alloc->num_dimensions());
++ assert(n == ifm_tensor->num_dimensions());
+ size_t k = _ctx.at(indices_index).shape().rank();
+- assert(k == indices_alloc->num_dimensions());
++ assert(k == indices_tensor->num_dimensions());
+
+ // Disable applied dim_correction
+- if (n != ifm_alloc->info()->num_dimensions())
++ if (n != ifm_tensor->info()->num_dimensions())
+ {
+ // This means that the higher dimensions are 1 and dim_correction has been applied to the ifm tensor
+ const auto ifm = _ctx.at(ifm_index);
+- ifm_alloc->info()->set_tensor_shape(
++ ifm_tensor->info()->set_tensor_shape(
+ acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
+ }
+- if (k != indices_alloc->info()->num_dimensions())
++ if (k != indices_tensor->info()->num_dimensions())
+ {
+ // This means that the higher dimensions are 1 and dim_correction has been applied to the indices tensor
+ const auto indices = _ctx.at(indices_index);
+- indices_alloc->info()->set_tensor_shape(
++ indices_tensor->info()->set_tensor_shape(
+ acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
+ }
+
+- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
++ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
+
+ // acl_neon does not revert disabling applied dim_correction because acl_neon's kernels would
+ // use arm_compute::TensorInfo::offset_element_in_bytes()
+@@ -621,20 +574,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
+ const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
+ const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto gamma_alloc = _tensor_builder->at(gamma_index).get();
+- auto beta_alloc = _tensor_builder->at(beta_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto gamma_tensor = _tensor_builder->at(gamma_index).get();
++ auto beta_tensor = _tensor_builder->at(beta_index).get();
+ auto epsilon = node.param().epsilon;
+ auto activation = node.param().activation;
+
+ auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
+- beta_alloc->handle(), epsilon);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(),
++ beta_tensor->handle(), epsilon);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
+@@ -656,15 +609,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
+ float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
+ float bias = 0.0f; // Don't offset the reduction.
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
+ radius, alpha, beta, bias, false);
+
+ auto fn = std::make_unique<::arm_compute::NENormalizationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -686,19 +639,20 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
++ ifm_tensor->info()->data_layout(),
+ ::onert::backend::acl_common::asPadStrideInfo(padding, stride)};
+
+ auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
+@@ -712,15 +666,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
+ auto beta = node.param().beta;
+ auto bias = node.param().bias;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const auto norm_info = ::arm_compute::NormalizationLayerInfo(
+ ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
+
+ auto fn = std::make_unique<::arm_compute::NENormalizationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -733,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input0_alloc = _tensor_builder->at(input0_index).get();
+- auto input1_alloc = _tensor_builder->at(input1_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input0_tensor = _tensor_builder->at(input0_index).get();
++ auto input1_tensor = _tensor_builder->at(input1_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NELogicalAnd>();
+
+- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -751,12 +705,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEBitwiseNot>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -769,13 +723,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input0_alloc = _tensor_builder->at(input0_index).get();
+- auto input1_alloc = _tensor_builder->at(input1_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input0_tensor = _tensor_builder->at(input0_index).get();
++ auto input1_tensor = _tensor_builder->at(input1_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NELogicalOr>();
+
+- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -787,8 +741,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
+@@ -798,7 +752,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
+ // instead of 'INF', and then the result of this op will contain errors due to the 'NaN'.
+ auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -807,159 +761,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
+
+ void KernelGenerator::visit(const ir::operation::LSTM &node)
+ {
+- // TODO Support dynamic rnn
+- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
+- const auto scratch_buffer_index{
+- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+- const auto output_state_out_index{
+- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+- const auto cell_state_out_index{
+- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+-
+- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+- const auto input_to_input_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
+- const auto input_to_forget_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+- const auto input_to_cell_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+- const auto input_to_output_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+- const auto recurrent_to_input_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
+- const auto recurrent_to_forget_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+- const auto recurrent_to_cell_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+- const auto recurrent_to_output_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+- const auto cell_to_input_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
+- const auto cell_to_forget_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
+- const auto cell_to_output_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
+- const auto input_gate_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+- const auto forget_gate_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
+- const auto output_gate_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+- const auto projection_weights_index{
+- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
+- const auto projection_bias_index{
+- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
+- const auto output_state_in_index{
+- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
+- const auto cell_threshold = node.param().cell_threshold;
+- const auto projection_threshold = node.param().projection_threshold;
+-
+- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
+- bool has_recurrent_to_input_weights =
+- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
+- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
+- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
+- _ctx.at(projection_weights_index).shape().dim(1) != 0;
+- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
+-
+- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
+- // true: no CIFG
+- // false: CIFG
+- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
+- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+-
+- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
+- // But the cell_to_input_weights does not exist in regular CIFG although peephole.
+- // true: peephole
+- // false: no peephole
+- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
+-
+- // NOTE Although the projection weights has data the projection bias may not have data.
+- bool has_projection_param = has_projection_weights;
+-
+- const auto activation = node.param().activation;
+- const auto cell_clip = cell_threshold;
+- const auto projection_clip = projection_threshold;
+- assert(cell_clip >= 0.f && projection_clip >= 0.f);
+-
+- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
+- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
+- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
+- auto output_alloc = _tensor_builder->at(output_index).get();
+-
+- auto input_alloc = _tensor_builder->at(input_index).get();
+-
+- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
+- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
+- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
+- auto recurrent_to_forget_weights_alloc =
+- _tensor_builder->at(recurrent_to_forget_weights_index).get();
+- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
+- auto recurrent_to_output_weights_alloc =
+- _tensor_builder->at(recurrent_to_output_weights_index).get();
+-
+- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
+- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
+- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
+- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
+- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
+-
+- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
+-
+- auto fn = std::make_unique<::arm_compute::NELSTMLayer>();
+-
+- ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{};
+- if (has_cifg_param)
+- {
+- auto input_to_input_weights_alloc =
+- _tensor_builder->at(input_to_input_weights_index).get(); // optional
+- auto recurrent_to_input_weights_alloc =
+- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
+- auto cell_to_input_weights_handle =
+- has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
+- : nullptr; // optional (non-cifg && peephole)
+- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
+- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
+- recurrent_to_input_weights_alloc->handle(),
+- cell_to_input_weights_handle, input_gate_bias_alloc->handle());
+- }
+- if (has_peephole_param)
+- {
+- auto cell_to_forget_weights_alloc =
+- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
+- auto cell_to_output_weights_alloc =
+- _tensor_builder->at(cell_to_output_weights_index).get(); // optional
+- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
+- cell_to_output_weights_alloc->handle());
+- }
+- if (has_projection_param)
+- {
+- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
+- auto projection_bias_handle = has_projection_bias
+- ? _tensor_builder->at(projection_bias_index).get()->handle()
+- : nullptr; // optional
+- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
+- }
+-
+- fn->configure(
+- input_alloc->handle(), input_to_forget_weights_alloc->handle(),
+- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
+- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
+- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
+- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
+- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
+- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
+- lstm_params, act_info, cell_clip, projection_clip);
+-
+- auto acl_fn = asAclFunction(std::move(fn));
+-
+- _return_fn = std::move(acl_fn);
++ _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
++ ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_builder);
+ }
+
+ void KernelGenerator::visit(const ir::operation::Mul &node)
+@@ -970,18 +773,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>();
+
+ // For scale 1.0, the only allowed RoundingPolicy is RoundingPolicy::TO_ZERO
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
+ arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Neg &node)
+@@ -989,12 +792,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NENegLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1030,12 +833,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
+ for (const auto &input_index : input_indexes)
+ {
+ size_t input_rank = _ctx.at(input_index).shape().rank();
+- const auto &input_alloc = _tensor_builder->at(input_index);
+- assert(input_rank == input_alloc->num_dimensions());
+- if (input_rank != input_alloc->info()->num_dimensions())
++ const auto &input_tensor = _tensor_builder->at(input_index);
++ assert(input_rank == input_tensor->num_dimensions());
++ if (input_rank != input_tensor->info()->num_dimensions())
+ {
+ // This means that the high dimension's value is 1 and dim_correction has been applied to the ifm tensor
+- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
+ _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
+ }
+ }
+@@ -1094,8 +897,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+ const auto ofm_idx{node.getOutputs().at(0)};
+ const auto ifm_idx{node.getInputs().at(0)};
+ const auto permute_type = node.getPermuteType();
+- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
+ const auto rank = _ctx.at(ofm_idx).shape().rank();
+ assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
+
+@@ -1108,7 +911,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+
+ auto l = std::make_unique<::arm_compute::NEPermute>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
+
+ fn = std::move(l);
+ }
+@@ -1119,7 +922,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+
+ auto l = std::make_unique<::arm_compute::NEPermute>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
+
+ fn = std::move(l);
+ }
+@@ -1127,7 +930,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
+ {
+ auto l = std::make_unique<::arm_compute::NECopy>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ fn = std::move(l);
+ }
+@@ -1143,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
+ const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
+ const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto alpha_alloc = _tensor_builder->at(alpha_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto alpha_tensor = _tensor_builder->at(alpha_index).get();
+
+ std::unique_ptr<::arm_compute::IFunction> fn;
+
+- auto l = std::make_unique<::arm_compute::NEPReLU>();
++ auto l = std::make_unique<::arm_compute::NEPReluLayer>();
+
+- l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
++ l->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
+
+ fn = std::move(l);
+
+@@ -1166,14 +969,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
+ const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ // Convert to ACL axes taking into account negative values and possible duplicates.
+ const auto &axes = _ctx.at(axes_index);
+ const auto input_rank = _ctx.at(input_index).shape().rank();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = input_alloc->layout();
++ const auto backend_layout = input_tensor->layout();
+ const auto reduce_axes =
+ acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
+ const auto reduce_type = node.param().reduce_type;
+@@ -1182,11 +985,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ std::unique_ptr<::arm_compute::IFunction> fn;
+ if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
+ {
+- // NOTE NEReduceMean has a bug that does not support NHWC layout
+- // NEReduceMean intermediate tensors are always NCHW layout
+- auto l = std::make_unique<::arm_compute::NEReduceMeanEx>();
++ auto l = std::make_unique<::arm_compute::NEReduceMean>();
+
+- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle());
++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle());
+
+ fn = std::move(l);
+ }
+@@ -1194,7 +995,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ {
+ auto l = std::make_unique<::arm_compute::NEReduceSum>();
+
+- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle());
++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle());
+
+ fn = std::move(l);
+ }
+@@ -1202,7 +1003,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ {
+ auto l = std::make_unique<::arm_compute::NEReduceOperation>();
+
+- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(),
++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
+ acl_common::convertReduceType(reduce_type));
+
+ fn = std::move(l);
+@@ -1218,15 +1019,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<arm_compute::NEActivationLayer>();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1238,15 +1039,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
+
+ auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1258,15 +1059,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
+
+ auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1278,13 +1079,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ // NOTE This operation must not change the layout from frontend to backend
+ // So, PermutationOperationPass makes the frontend and backend layouts the same.
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = output_alloc->layout();
++ const auto backend_layout = output_tensor->layout();
+ assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
+ frontend_layout == backend_layout);
+ UNUSED_RELEASE(frontend_layout);
+@@ -1292,7 +1093,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
+
+ auto fn = std::make_unique<arm_compute::NEReshapeLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1305,12 +1106,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
+
+ const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEScale>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(),
+ ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
+ ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
+
+@@ -1334,25 +1135,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
+
+ const auto activation = node.param().activation;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get();
+
+- auto input_alloc = _tensor_builder->at(input_index).get();
+- auto weights_alloc = _tensor_builder->at(weights_index).get();
+- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
+- auto bias_alloc = _tensor_builder->at(bias_index).get();
+- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
++ auto weights_tensor = _tensor_builder->at(weights_index).get();
++ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get();
++ auto bias_tensor = _tensor_builder->at(bias_index).get();
++ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get();
+ auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
+
+ auto copy_layer = std::make_unique<::arm_compute::NECopy>();
+- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
++ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
+ _return_fn = asAclFunction(std::move(copy_layer));
+
+- auto fn = std::make_unique<::arm_compute::NERNNLayerEx>(
++ auto fn = std::make_unique<::arm_compute::NERNNLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
+- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
+- act_info);
++ fn->configure(input_tensor->handle(), weights_tensor->handle(),
++ recurrent_weights_tensor->handle(), bias_tensor->handle(),
++ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
+ _return_fn = asAclFunction(std::move(fn));
+ }
+
+@@ -1361,12 +1162,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NERsqrtLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ _return_fn = asAclFunction(std::move(fn));
+ }
+@@ -1383,10 +1184,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
+ (void)dims;
+ (void)ndim;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+ auto fn = std::make_unique<arm_compute::NEReshapeLayer>();
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+ auto acl_fn = asAclFunction(std::move(fn));
+ _return_fn = std::move(acl_fn);
+ }
+@@ -1396,15 +1197,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<arm_compute::NEActivationLayer>();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1417,13 +1218,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
+ const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
+ const auto beta = node.param().beta;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
++ const auto frontend_layout = _current_op_seq_layout;
++ const auto backend_layout = input_tensor->layout();
++
++ // Disable applied dim_correction
++ const size_t input_rank = _ctx.at(input_index).shape().rank();
++ if (input_rank != input_tensor->info()->num_dimensions())
++ {
++ // This means that the high dimension's value is 1 and dim_correction has been applied to the input tensor
++ const auto input = _ctx.at(input_index);
++ input_tensor->info()->set_tensor_shape(
++ acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false));
++ }
+
+ auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), beta);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1438,20 +1251,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
+ node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+ const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+- auto paddings_alloc = _tensor_builder->at(paddings_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
++ auto paddings_tensor = _tensor_builder->at(paddings_index).get();
+
+ assert(_ctx.at(block_size_index).data());
+ assert(_ctx.at(paddings_index).data());
+
+- // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is
+- // not 0.
+- auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>();
++ auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>();
+
+- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
+- ofm_alloc->handle());
++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
++ ofm_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1465,12 +1276,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
+
+ auto block_size = node.param().block_size;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+
+- auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>();
++ auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>();
+
+- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1489,13 +1300,13 @@ void KernelGenerator::visit(const ir::operation::Split &node)
+ for (const auto &output : node.getOutputs())
+ output_indexes.emplace_back(output);
+
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- std::vector<arm_compute::ITensor *> output_allocs;
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ std::vector<arm_compute::ITensor *> output_tensors;
+ for (const auto &ofm_ind : output_indexes)
+- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
++ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
+
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = ifm_alloc->layout();
++ const auto backend_layout = ifm_tensor->layout();
+ auto axis = node.param().axis;
+ if (axis < 0)
+ axis += ifm_rank;
+@@ -1503,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
+
+ auto fn = std::make_unique<::arm_compute::NESplit>();
+
+- fn->configure(ifm_alloc->handle(), output_allocs, axis);
++ fn->configure(ifm_tensor->handle(), output_tensors, axis);
+
+ _return_fn = asAclFunction(std::move(fn));
+ }
+@@ -1513,15 +1324,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
+
+ auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1534,13 +1345,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1555,17 +1366,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Slice &node)
+@@ -1575,10 +1386,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
+ const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
+ const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
+
+- auto outputData_alloc = _tensor_builder->at(output_index).get();
+- auto inputData_alloc = _tensor_builder->at(input_index).get();
++ auto outputData_tensor = _tensor_builder->at(output_index).get();
++ auto inputData_tensor = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = inputData_alloc->layout();
++ const auto backend_layout = inputData_tensor->layout();
+
+ // Set initializers for indices data such as order of inputData
+ int input_rank = _ctx.at(input_index).shape().rank();
+@@ -1628,7 +1439,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
+
+ auto fn = std::make_unique<::arm_compute::NESlice>();
+
+- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);
++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1643,10 +1454,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
+ const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
+ const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
+
+- auto outputData_alloc = _tensor_builder->at(output_index).get();
+- auto inputData_alloc = _tensor_builder->at(input_index).get();
++ auto outputData_tensor = _tensor_builder->at(output_index).get();
++ auto inputData_tensor = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = inputData_alloc->layout();
++ const auto backend_layout = inputData_tensor->layout();
+
+ // Set initializers for indices data such as order of inputData
+ int input_rank = _ctx.at(input_index).shape().rank();
+@@ -1715,7 +1526,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
+
+ auto fn = std::make_unique<::arm_compute::NEStridedSlice>();
+
+- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set,
+ strides_set, begin_mask, end_mask, shrink_axis_mask);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+@@ -1749,16 +1560,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
+ invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
+ }
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+- auto ker_alloc = _tensor_builder->at(ker_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
++ auto ker_tensor = _tensor_builder->at(ker_index).get();
+
+ const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
+
+ auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>();
+
+- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
+- invalid_horizontal, invalid_vertical);
++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(),
++ tconv_info, invalid_horizontal, invalid_vertical);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1771,10 +1582,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
+ const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
+ const auto &perm{node.param().perm};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
+- const auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
++ const auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
+ const auto frontend_layout = _current_op_seq_layout;
+- const auto backend_layout = ifm_alloc->layout();
++ const auto backend_layout = ifm_tensor->layout();
+
+ const auto rank = _ctx.at(ifm_idx).shape().rank();
+ std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
+@@ -1783,11 +1594,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
+
+ std::unique_ptr<::arm_compute::IFunction> fn;
+
+- if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2)
++ if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2)
+ {
+ auto l = std::make_unique<::arm_compute::NETranspose>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ fn = std::move(l);
+ }
+@@ -1795,7 +1606,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
+ {
+ auto l = std::make_unique<::arm_compute::NEPermute>();
+
+- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv);
+
+ fn = std::move(l);
+ }
+@@ -1834,13 +1645,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
+ for (const auto &output_index : output_indexes)
+ {
+ size_t output_rank = _ctx.at(output_index).shape().rank();
+- const auto &output_alloc = _tensor_builder->at(output_index);
+- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
+- assert(output_rank == output_alloc->num_dimensions());
+- if (output_rank != output_alloc->info()->num_dimensions())
++ const auto &output_tensor = _tensor_builder->at(output_index);
++ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
++ assert(output_rank == output_tensor->num_dimensions());
++ if (output_rank != output_tensor->info()->num_dimensions())
+ {
+ // This means that the high dimension's value is 1 and dim_correction has been applied to the output tensor
+- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
++ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
+ _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
+ }
+ }
+@@ -1858,17 +1669,17 @@ void KernelGenerator::visit(const ir::operation::Add &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Div &node)
+@@ -1879,16 +1690,16 @@ void KernelGenerator::visit(const ir::operation::Div &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ _return_fn = std::make_unique<exec::FunctionSequence>(
+- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ }
+
+ void KernelGenerator::visit(const ir::operation::Exp &node)
+@@ -1896,12 +1707,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEExpLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1913,12 +1724,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input_tensor = _tensor_builder->at(input_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEReshapeLayer>();
+
+- fn->configure(input_alloc->handle(), output_alloc->handle());
++ fn->configure(input_tensor->handle(), output_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1933,13 +1744,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
+
+ const auto comparison_type = node.param().comparison_type;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input0_alloc = _tensor_builder->at(input0_index).get();
+- auto input1_alloc = _tensor_builder->at(input1_index).get();
++ auto output_tensor = _tensor_builder->at(output_index).get();
++ auto input0_tensor = _tensor_builder->at(input0_index).get();
++ auto input1_tensor = _tensor_builder->at(input1_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>();
+
+- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
+ (arm_compute::ComparisonOperation)comparison_type);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+@@ -1953,13 +1764,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEElementwiseMin>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+@@ -1972,13 +1783,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
+
+ auto fn = std::make_unique<::arm_compute::NEElementwiseMax>();
+
+- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+diff --git a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc
+index 71e3136..deb27f0 100644
+--- a/runtime/onert/backend/cpu/ConstantInitializer.cc
++++ b/runtime/onert/backend/cpu/ConstantInitializer.cc
+@@ -15,6 +15,7 @@
+ */
+
+ #include "ConstantInitializer.h"
++#include "Tensor.h"
+
+ namespace onert
+ {
+@@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
+ // DO NOTHING
+ }
+
++void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index,
++ const ir::Operand &obj)
++{
++ registerExternalInitializer(index, obj);
++}
++
++void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index,
++ const ir::Operand &obj)
++{
++ // For CONSTANT operands only
++ // TODO Add a check that the tensor has been allocated
++ if (!obj.isConstant())
++ return;
++
++ _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) {
++ auto data = model_obj.shareData();
++ assert(data && data->base());
++ ExternalTensor &tensor = dynamic_cast<ExternalTensor &>(itensor);
++ tensor.setData(data);
++ };
++}
++
+ void ConstantInitializer::visit(const ir::operation::Conv2D &node)
+ {
+ const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL);
+ const auto &kernel_obj = _operands.at(kernel_index);
+- registerCopyInitializer(kernel_index, kernel_obj);
++ registerExternalInitializer(kernel_index, kernel_obj);
+
+ const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS);
+ const auto &bias_obj = _operands.at(bias_index);
+- registerCopyInitializer(bias_index, bias_obj);
++ registerExternalInitializer(bias_index, bias_obj);
+ }
+
+ void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node)
+ {
+ const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL);
+ const auto &kernel_obj = _operands.at(kernel_index);
+- registerCopyInitializer(kernel_index, kernel_obj);
++ registerExternalInitializer(kernel_index, kernel_obj);
+
+ const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS);
+ const auto &bias_obj = _operands.at(bias_index);
+- registerCopyInitializer(bias_index, bias_obj);
++ registerExternalInitializer(bias_index, bias_obj);
+ }
+
+ void ConstantInitializer::visit(const ir::operation::FullyConnected &node)
+ {
+ const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT);
+ const auto &weight_obj = _operands.at(weight_index);
+- registerCopyInitializer(weight_index, weight_obj);
++ registerExternalInitializer(weight_index, weight_obj);
+
+ const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS);
+ if (!bias_index.undefined())
+ {
+ const auto &bias_obj = _operands.at(bias_index);
+- registerCopyInitializer(bias_index, bias_obj);
++ registerExternalInitializer(bias_index, bias_obj);
+ }
+ }
+
+diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h
+index bd06c64..de03a69 100644
+--- a/runtime/onert/backend/cpu/ConstantInitializer.h
++++ b/runtime/onert/backend/cpu/ConstantInitializer.h
+@@ -36,6 +36,15 @@ public:
+ const std::shared_ptr<TensorBuilder> &tensor_builder);
+
+ public:
++ void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override;
++
++ // TODO: For now only the cpu backend supports constant tensors that use external data.
++ // If other backends support this as well (to do so,
++ // ExternalTensor should probably be abstracted, e.g. as IExternal),
++ // this could become an interface of IConstantInitializer
++ void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &);
++
++public:
+ void visit(const ir::operation::Conv2D &) override;
+ void visit(const ir::operation::DepthwiseConv2D &) override;
+ void visit(const ir::operation::FullyConnected &) override;
+diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc
+index 72f9606..2766aa2 100644
+--- a/runtime/onert/backend/cpu/KernelGenerator.cc
++++ b/runtime/onert/backend/cpu/KernelGenerator.cc
+@@ -60,6 +60,7 @@
+ #include "ops/SoftMaxLayer.h"
+ #include "ops/StridedSliceLayer.h"
+ #include "ops/SpaceToBatchNDLayer.h"
++#include "ops/SpaceToDepthLayer.h"
+ #include "ops/SplitLayer.h"
+ #include "ops/SubLayer.h"
+ #include "ops/TanhLayer.h"
+@@ -70,11 +71,13 @@
+ #include "ops/ZerosLikeLayer.h"
+ #include "ops/SquaredDiffLayer.h"
+ #include "ops/LogicalOrLayer.h"
++#include "ops/L2NormLayer.h"
+ #include "ops/MatrixBandPartLayer.h"
+ #include "ops/BatchMatMulLayer.h"
+ #include "ops/BroadcastToLayer.h"
+ #include "ops/FusedBatchNormLayer.h"
+ #include "ops/LogSoftMaxLayer.h"
++#include "ops/QuantizeLayer.h"
+
+ #include <backend/Backend.h>
+ #include <backend/IConfig.h>
+@@ -184,10 +187,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
+ const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
+ const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+- auto ker_alloc = _tensor_builder->portableAt(ker_index).get();
+- auto bias_alloc = _tensor_builder->portableAt(bias_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
++ auto ker_tensor = _tensor_builder->portableAt(ker_index).get();
++ auto bias_tensor = _tensor_builder->portableAt(bias_index).get();
+
+ const auto stride = node.param().stride;
+ const auto activation = node.param().activation;
+@@ -196,9 +199,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
+
+ if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic())
+ {
+- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left,
++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left,
+ param_padding.param.right, param_padding.param.top, param_padding.param.bottom,
+- stride.horizontal, stride.vertical, activation, ofm_alloc);
++ stride.horizontal, stride.vertical, activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ return;
+@@ -213,9 +216,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
+ const auto padding =
+ ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
+
+- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right,
+- padding.top, padding.bottom, stride.horizontal, stride.vertical, activation,
+- ofm_alloc);
++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left,
++ padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical,
++ activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -241,16 +244,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
+ const auto multiplier = node.param().multiplier;
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+- auto ker_alloc = _tensor_builder->portableAt(ker_index).get();
+- auto bias_alloc = _tensor_builder->portableAt(bias_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
++ auto ker_tensor = _tensor_builder->portableAt(ker_index).get();
++ auto bias_tensor = _tensor_builder->portableAt(bias_index).get();
+
+ auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>();
+
+- fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top,
++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top,
+ padding.bottom, stride.horizontal, stride.vertical, multiplier, activation,
+- ofm_alloc);
++ ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -270,13 +273,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::MaxPoolLayer>();
+
+- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom,
+- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc);
++ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom,
++ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -295,13 +298,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::AvgPoolLayer>();
+
+- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom,
+- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc);
++ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom,
++ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -313,7 +316,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+ const auto rank = _ctx.at(ofm_index).shape().rank();
+ const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout);
+
+- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
+
+ std::vector<const IPortableTensor *> input_tensors;
+ for (auto &ifm_idx : node.getInputs())
+@@ -321,7 +324,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
+
+ auto fn = std::make_unique<ops::ConcatLayer>();
+
+- fn->configure(input_tensors, axis, output_alloc);
++ fn->configure(input_tensors, axis, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -332,13 +335,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node)
+ const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)};
+ const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto value_alloc = _tensor_builder->portableAt(value_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto value_tensor = _tensor_builder->portableAt(value_index).get();
+
+ auto fn = std::make_unique<ops::FillLayer>();
+
+- fn->configure(input_alloc, value_alloc, output_alloc);
++ fn->configure(input_tensor, value_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -353,15 +356,15 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
+ const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
+ const auto activation = node.param().activation;
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto weight_alloc = _tensor_builder->portableAt(weight_index).get();
+- auto bias_alloc =
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto weight_tensor = _tensor_builder->portableAt(weight_index).get();
++ auto bias_tensor =
+ bias_index.undefined() ? nullptr : _tensor_builder->portableAt(bias_index).get();
+
+ auto fn = std::make_unique<ops::FullyConnectedLayer>();
+
+- fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc);
++ fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -371,21 +374,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ // optional 2nd input
+- IPortableTensor *shape_alloc = nullptr;
++ IPortableTensor *shape_tensor = nullptr;
+
+ if (node.getInputs().size() == 2)
+ {
+ const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)};
+- shape_alloc = _tensor_builder->portableAt(shape_index).get();
++ shape_tensor = _tensor_builder->portableAt(shape_index).get();
+ }
+
+ auto fn = std::make_unique<ops::ReshapeLayer>();
+
+- fn->configure(input_alloc, shape_alloc, output_alloc);
++ fn->configure(input_tensor, shape_tensor, output_tensor);
+ _return_fn = std::move(fn);
+ }
+
+@@ -394,13 +397,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ // Squeeze can share same kernel with reshape
+ auto fn = std::make_unique<ops::ReshapeLayer>();
+
+- fn->configure(input_alloc, nullptr, output_alloc);
++ fn->configure(input_tensor, nullptr, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -412,12 +415,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
+
+ const auto beta = node.param().beta;
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::SoftMaxLayer>();
+
+- fn->configure(input_alloc, beta, output_alloc);
++ fn->configure(input_tensor, beta, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -430,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Add &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::AddLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -447,15 +450,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto comparison_type = node.param().comparison_type;
+
+ auto fn = std::make_unique<ops::CompareLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -466,11 +469,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
+ const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
+ const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto indices_alloc = _tensor_builder->portableAt(indices_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto indices_tensor = _tensor_builder->portableAt(indices_index).get();
+
+- const auto backend_layout = output_alloc->layout();
++ const auto backend_layout = output_tensor->layout();
+ UNUSED_RELEASE(backend_layout);
+
+ // NOTE The frontend layout and backend layout must be the same for this operation.
+@@ -481,8 +484,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
+ // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
+ // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
+ // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
+- assert(backend_layout == input_alloc->layout());
+- assert(backend_layout == indices_alloc->layout());
++ assert(backend_layout == input_tensor->layout());
++ assert(backend_layout == indices_tensor->layout());
+ const auto &input_shape = _ctx.at(input_index).shape();
+ UNUSED_RELEASE(input_shape);
+ assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout);
+@@ -492,7 +495,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
+
+ auto fn = std::make_unique<ops::GatherLayer>();
+
+- fn->configure(input_alloc, indices_alloc, output_alloc, axis_value);
++ fn->configure(input_tensor, indices_tensor, output_tensor, axis_value);
+
+ _return_fn = std::move(fn);
+ }
+@@ -506,13 +509,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::SubLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -526,13 +529,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::MulLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -547,18 +550,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
+
+ const auto axis = node.param().axis;
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto indices_alloc = _tensor_builder->portableAt(indices_index).get();
+- auto depth_alloc = _tensor_builder->portableAt(depth_index).get();
+- auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get();
+- auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto indices_tensor = _tensor_builder->portableAt(indices_index).get();
++ auto depth_tensor = _tensor_builder->portableAt(depth_index).get();
++ auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get();
++ auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get();
+
+- assert(indices_alloc->data_type() == OperandType::INT32);
+- assert(axis <= static_cast<int>(indices_alloc->num_dimensions()));
++ assert(indices_tensor->data_type() == OperandType::INT32);
++ assert(axis <= static_cast<int>(indices_tensor->num_dimensions()));
+
+ auto fn = std::make_unique<ops::OneHotLayer>();
+
+- fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis);
++ fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis);
+
+ _return_fn = std::move(fn);
+ }
+@@ -572,13 +575,13 @@ void KernelGenerator::visit(const ir::operation::Div &node)
+
+ const auto activation = node.param().activation;
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::DivLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -587,16 +590,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node)
+ {
+ const auto ofm_index{node.getOutputs().at(0)};
+
+- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
+- std::vector<const IPortableTensor *> input_allocs;
++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
++ std::vector<const IPortableTensor *> input_tensors;
+ for (auto &ifm_idx : node.getInputs())
+- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
++ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
+
+ const auto equation = node.param().equation;
+
+ auto fn = std::make_unique<ops::EinsumLayer>();
+
+- fn->configure(input_allocs, equation, output_alloc);
++ fn->configure(input_tensors, equation, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -605,14 +608,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node)
+ {
+ auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq,
+ std::vector<custom::TypeInfo> &types,
+- std::vector<std::shared_ptr<IPortableTensor>> &allocs) {
++ std::vector<std::shared_ptr<IPortableTensor>> &tensors) {
+ for (auto &idx : opSeq)
+ {
+ const auto &operand = _ctx.at(idx);
+ // TODO make sure using `_current_op_seq_layout` is correct for custom operations
+ types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()});
+- auto in_alloc = _tensor_builder->portableAt(idx);
+- allocs.emplace_back(in_alloc);
++ auto in_tensor = _tensor_builder->portableAt(idx);
++ tensors.emplace_back(in_tensor);
+ }
+ };
+
+@@ -634,12 +637,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::ExpLayer>();
+
+- fn->configure(input_alloc, output_alloc);
++ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -650,13 +653,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
+ const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
+ const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto axis_alloc = _tensor_builder->portableAt(axis_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto axis_tensor = _tensor_builder->portableAt(axis_index).get();
+
+ auto fn = std::make_unique<ops::ExpandDimsLayer>();
+
+- fn->configure(input_alloc, axis_alloc, output_alloc);
++ fn->configure(input_tensor, axis_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -666,12 +669,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::LogisticLayer>();
+
+- fn->configure(input_alloc, output_alloc);
++ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -681,12 +684,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::TanhLayer>();
+
+- fn->configure(input_alloc, output_alloc);
++ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -700,7 +703,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
+
+ assert(-rank <= axis && axis < rank);
+
+- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
+
+ std::vector<const IPortableTensor *> input_tensors;
+ for (auto &ifm_idx : node.getInputs())
+@@ -708,7 +711,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
+
+ auto fn = std::make_unique<ops::PackLayer>();
+
+- fn->configure(input_tensors, axis, output_alloc);
++ fn->configure(input_tensors, axis, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -722,7 +725,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
+
+ assert(rank == 0 || (-rank <= axis && axis < rank));
+
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ std::vector<IPortableTensor *> output_tensors;
+ for (auto &output_idx : node.getOutputs())
+@@ -732,7 +735,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
+
+ uint32_t axis_resolved = (axis < 0 ? axis + rank : axis);
+
+- fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors);
++ fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors);
+
+ _return_fn = std::move(fn);
+ }
+@@ -751,8 +754,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
+
+ auto fn = std::make_unique<ops::PadLayer>();
+
+- fn->configure(input, output, pad_base, pad_rank);
++ bool isPadV2 = node.getInputs().size() == 3;
++ const void *value = nullptr;
+
++ if (isPadV2)
++ {
++ const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)};
++ value = reinterpret_cast<const void *>(_ctx.at(value_index).data()->base());
++ }
++
++ fn->configure(input, output, pad_base, pad_rank, value);
+ _return_fn = std::move(fn);
+ }
+
+@@ -762,13 +773,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::MaxLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -779,13 +790,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::MinLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -795,12 +806,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::CastLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -810,12 +821,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::TransposeLayer>();
+
+- fn->configure(input_alloc, output_alloc, node.param().perm);
++ fn->configure(input_tensor, output_tensor, node.param().perm);
+
+ _return_fn = std::move(fn);
+ }
+@@ -827,15 +838,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
+
+ const auto keep_dims = node.param().keep_dims;
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto axes_alloc = _tensor_builder->portableAt(axes_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto axes_tensor = _tensor_builder->portableAt(axes_index).get();
+
+ if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN)
+ {
+ auto fn = std::make_unique<ops::MeanLayer>();
+
+- fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims);
++ fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims);
+
+ _return_fn = std::move(fn);
+ }
+@@ -844,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
+ auto fn = std::make_unique<ops::ReduceLayer>();
+
+ const auto reduce_type = convertReduceType(node.param().reduce_type);
+- fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims);
++ fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims);
+
+ _return_fn = std::move(fn);
+ }
+@@ -855,12 +866,12 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(0)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::ReLULayer>();
+
+- fn->configure(input_alloc, output_alloc);
++ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -872,14 +883,14 @@ void KernelGenerator::visit(const ir::operation::Select &node)
+ const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)};
+ const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto condition_alloc = _tensor_builder->portableAt(condition_index).get();
+- auto true_alloc = _tensor_builder->portableAt(true_index).get();
+- auto false_alloc = _tensor_builder->portableAt(false_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto condition_tensor = _tensor_builder->portableAt(condition_index).get();
++ auto true_tensor = _tensor_builder->portableAt(true_index).get();
++ auto false_tensor = _tensor_builder->portableAt(false_index).get();
+
+ auto fn = std::make_unique<ops::SelectLayer>();
+
+- fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc);
++ fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -891,14 +902,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
+ const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
+ const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto begins_alloc = _tensor_builder->portableAt(begins_index).get();
+- auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto begins_tensor = _tensor_builder->portableAt(begins_index).get();
++ auto sizes_tensor = _tensor_builder->portableAt(sizes_index).get();
+
+ auto fn = std::make_unique<ops::SliceLayer>();
+
+- fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc);
++ fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -911,11 +922,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
+ const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
+ const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto starts_alloc = _tensor_builder->portableAt(starts_index).get();
+- auto ends_alloc = _tensor_builder->portableAt(ends_index).get();
+- auto strides_alloc = _tensor_builder->portableAt(strides_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto starts_tensor = _tensor_builder->portableAt(starts_index).get();
++ auto ends_tensor = _tensor_builder->portableAt(ends_index).get();
++ auto strides_tensor = _tensor_builder->portableAt(strides_index).get();
+
+ auto begin_mask = node.param().begin_mask;
+ auto end_mask = node.param().end_mask;
+@@ -923,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
+
+ auto fn = std::make_unique<ops::StridedSliceLayer>();
+
+- fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask,
++ fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask,
+ end_mask, shrink_axis_mask);
+
+ _return_fn = std::move(fn);
+@@ -957,12 +968,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::AbsLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -972,12 +983,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::SinLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -987,12 +998,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::CosLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1002,12 +1013,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::RsqrtLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1017,12 +1028,12 @@ void KernelGenerator::visit(const ir::operation::Shape &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::ShapeLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1033,13 +1044,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node)
+ const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)};
+ const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto axis_alloc = _tensor_builder->portableAt(axis_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto axis_tensor = _tensor_builder->portableAt(axis_index).get();
+
+ auto fn = std::make_unique<ops::ReverseLayer>();
+
+- fn->configure(input_alloc, axis_alloc, output_alloc);
++ fn->configure(input_tensor, axis_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1049,12 +1060,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::NegLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1066,12 +1077,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
+
+ const auto axis = node.param().axis;
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::ArgMinMaxLayer>();
+
+- fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true);
++ fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1082,13 +1093,13 @@ void KernelGenerator::visit(const ir::operation::Pow &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::PowLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1098,12 +1109,12 @@ void KernelGenerator::visit(const ir::operation::Log &node)
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+
+ auto fn = std::make_unique<ops::LogLayer>();
+
+- fn->configure(ifm_alloc, ofm_alloc);
++ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1113,12 +1124,12 @@ void KernelGenerator::visit(const ir::operation::Round &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::RoundLayer>();
+
+- fn->configure(input_alloc, output_alloc);
++ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1128,12 +1139,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::LogicalNotLayer>();
+
+- fn->configure(input_alloc, output_alloc);
++ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1144,28 +1155,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
+ const auto lhs_index{node.getInputs().at(0)};
+ const auto rhs_index{node.getInputs().at(1)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::LogicalOrLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+ }
+
+-void KernelGenerator::visit(const ir::operation::ZerosLike &node)
++void KernelGenerator::visit(const ir::operation::L2Normalization &node)
+ {
+ const auto output_index{node.getOutputs().at(0)};
+- const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)};
++ const auto input_index{node.getInputs().at(0)};
+
+ auto output_alloc = _tensor_builder->portableAt(output_index).get();
+ auto input_alloc = _tensor_builder->portableAt(input_index).get();
+
+- auto fn = std::make_unique<ops::ZerosLikeLayer>();
++ auto fn = std::make_unique<ops::L2NormLayer>();
+
+ fn->configure(input_alloc, output_alloc);
++
++ _return_fn = std::move(fn);
++}
++
++void KernelGenerator::visit(const ir::operation::ZerosLike &node)
++{
++ const auto output_index{node.getOutputs().at(0)};
++ const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)};
++
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++
++ auto fn = std::make_unique<ops::ZerosLikeLayer>();
++
++ fn->configure(input_tensor, output_tensor);
+ _return_fn = std::move(fn);
+ }
+
+@@ -1176,14 +1202,14 @@ void KernelGenerator::visit(const ir::operation::Range &node)
+ const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)};
+ const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto start_alloc = _tensor_builder->portableAt(start_index).get();
+- auto limit_alloc = _tensor_builder->portableAt(limit_index).get();
+- auto delta_alloc = _tensor_builder->portableAt(delta_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto start_tensor = _tensor_builder->portableAt(start_index).get();
++ auto limit_tensor = _tensor_builder->portableAt(limit_index).get();
++ auto delta_tensor = _tensor_builder->portableAt(delta_index).get();
+
+ auto fn = std::make_unique<ops::RangeLayer>();
+
+- fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc);
++ fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor);
+ _return_fn = std::move(fn);
+ }
+
+@@ -1193,13 +1219,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
+
+- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ auto fn = std::make_unique<ops::SqDiffLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
+ _return_fn = std::move(fn);
+ }
+
+@@ -1209,13 +1235,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node)
+ const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)};
+ const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get();
+
+ auto fn = std::make_unique<ops::TileLayer>();
+
+- fn->configure(input_alloc, multiples_alloc, output_alloc);
++ fn->configure(input_tensor, multiples_tensor, output_tensor);
+ _return_fn = std::move(fn);
+ }
+
+@@ -1226,14 +1252,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node)
+ const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)};
+ const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get();
+- auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get();
++ auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get();
+
+ auto fn = std::make_unique<ops::MatrixBandPartLayer>();
+
+- fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc);
++ fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor);
+ _return_fn = std::move(fn);
+ }
+
+@@ -1243,16 +1269,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node)
+ const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
+- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
+
+ const auto adj_x = node.param().adj_x;
+ const auto adj_y = node.param().adj_y;
+
+ auto fn = std::make_unique<ops::BatchMatMulLayer>();
+
+- fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc);
++ fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor);
+ _return_fn = std::move(fn);
+ }
+
+@@ -1262,13 +1288,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node)
+ const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)};
+ const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto shape_alloc = _tensor_builder->portableAt(shape_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto shape_tensor = _tensor_builder->portableAt(shape_index).get();
+
+ auto fn = std::make_unique<ops::BroadcastToLayer>();
+
+- fn->configure(input_alloc, shape_alloc, output_alloc);
++ fn->configure(input_tensor, shape_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1277,10 +1303,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
+ {
+ const auto ofm_index{node.getOutputs().at(0)};
+
+- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
+- std::vector<const IPortableTensor *> input_allocs;
++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
++ std::vector<const IPortableTensor *> input_tensors;
+ for (auto &ifm_idx : node.getInputs())
+- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
++ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
+
+ const auto epsilon = node.param().epsilon;
+ const auto is_training = node.param().is_training;
+@@ -1288,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
+
+ auto fn = std::make_unique<ops::FusedBatchNormLayer>();
+
+- fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc);
++ fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1301,12 +1327,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node)
+ const auto beta = node.param().beta;
+ const auto axis = node.param().axis;
+
+- auto output_alloc = _tensor_builder->at(output_index).get();
+- auto input_alloc = _tensor_builder->at(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::LogSoftMaxLayer>();
+
+- fn->configure(input_alloc, beta, axis, output_alloc);
++ fn->configure(input_tensor, beta, axis, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
+@@ -1318,14 +1344,45 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
+ const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)};
+ const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)};
+
+- auto output_alloc = _tensor_builder->portableAt(output_index).get();
+- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+- auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get();
+- auto padding_alloc = _tensor_builder->portableAt(padding_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get();
++ auto padding_tensor = _tensor_builder->portableAt(padding_index).get();
+
+ auto fn = std::make_unique<ops::SpaceToBatchNDLayer>();
+
+- fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc);
++ fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor);
++
++ _return_fn = std::move(fn);
++}
++
++void KernelGenerator::visit(const ir::operation::Quantize &node)
++{
++ const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)};
++ const auto output_index{node.getOutputs().at(0)};
++
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++
++ auto fn = std::make_unique<ops::QuantizeLayer>();
++
++ fn->configure(input_tensor, output_tensor);
++
++ _return_fn = std::move(fn);
++}
++
++void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
++{
++ const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
++ const auto output_index{node.getOutputs().at(0)};
++ auto block_size = node.param().block_size;
++
++ auto input_tensor = _tensor_builder->portableAt(input_index).get();
++ auto output_tensor = _tensor_builder->portableAt(output_index).get();
++
++ auto fn = std::make_unique<ops::SpaceToDepthLayer>();
++
++ fn->configure(input_tensor, block_size, output_tensor);
+
+ _return_fn = std::move(fn);
+ }
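
Aside from the new Quantize, SpaceToDepth and L2Normalization visitors, the KernelGenerator.cc changes are a mechanical rename of the `*_alloc` locals to `*_tensor`; every visit() keeps the same shape. The sketch below shows that shape for a hypothetical unary operation; `Identity` and `ops::IdentityLayer` are invented placeholders, not operations that exist in onert.

// Hypothetical example only: the common pattern each unary visit() in this file follows.
void KernelGenerator::visit(const ir::operation::Identity &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(0)};

  // portableAt() yields the backend tensor; locals are now named *_tensor, not *_alloc.
  auto output_tensor = _tensor_builder->portableAt(output_index).get();
  auto input_tensor = _tensor_builder->portableAt(input_index).get();

  auto fn = std::make_unique<ops::IdentityLayer>();
  fn->configure(input_tensor, output_tensor);

  _return_fn = std::move(fn);
}
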
+diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h
+index d6f4c28..f564bf8 100644
+--- a/runtime/onert/backend/cpu/KernelGenerator.h
++++ b/runtime/onert/backend/cpu/KernelGenerator.h
+@@ -94,6 +94,7 @@ public:
+ void visit(const ir::operation::SquaredDifference &) override;
+ void visit(const ir::operation::Tile &) override;
+ void visit(const ir::operation::LogicalOr &) override;
++ void visit(const ir::operation::L2Normalization &) override;
+ void visit(const ir::operation::Range &) override;
+ void visit(const ir::operation::MatrixBandPart &) override;
+ void visit(const ir::operation::BatchMatMul &) override;
+@@ -101,6 +102,8 @@ public:
+ void visit(const ir::operation::FusedBatchNorm &) override;
+ void visit(const ir::operation::LogSoftmax &) override;
+ void visit(const ir::operation::SpaceToBatchND &) override;
++ void visit(const ir::operation::Quantize &) override;
++ void visit(const ir::operation::SpaceToDepth &) override;
+
+ private:
+ const ir::Operands &_ctx;
+diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc
+new file mode 100644
+index 0000000..8723072
+--- /dev/null
++++ b/runtime/onert/backend/cpu/StaticTensorManager.cc
+@@ -0,0 +1,104 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "StaticTensorManager.h"
++#include "Tensor.h"
++
++#include <util/logging.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++
++StaticTensorManager::StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> &reg)
++ : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg}
++{
++ // DO NOTHING
++}
++
++void StaticTensorManager::allocateNonconsts(void)
++{
++ _nonconst_mgr->allocate();
++
++ for (auto &pair : _tensors->native_tensors())
++ {
++ const auto &ind = pair.first;
++ auto tensor = pair.second;
++ if (!_as_constants[ind] && !tensor->is_dynamic())
++ {
++ auto *buffer = _nonconst_mgr->getBuffer(ind);
++ tensor->setBuffer(buffer);
++
++ VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value()
++ << "): " << static_cast<void *>(buffer) << std::endl;
++ }
++ }
++}
++
++void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); }
++
++void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
++ const ir::OperandInfo &tensor_info, ir::Layout backend_layout,
++ bool as_const)
++{
++ assert(!_tensors->getITensor(ind));
++ if (as_const)
++ {
++ auto tensor = std::make_shared<ExternalTensor>(tensor_info, backend_layout);
++ _tensors->setNativeTensor(ind, tensor);
++ }
++ else
++ {
++ auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
++ _tensors->setNativeTensor(ind, tensor);
++ }
++ _as_constants[ind] = as_const;
++}
++
++void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
++{
++ assert(_tensors->getITensor(ind));
++
++ // This method is called only when a tensor has proper shape
++ assert(!_tensors->getITensor(ind)->is_dynamic());
++
++ if (!_as_constants[ind])
++ _nonconst_mgr->claimPlan(ind, size);
++}
++
++void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
++{
++ assert(_tensors->getITensor(ind));
++
++ // This method is called only when a tensor has proper shape
++ assert(!_tensors->getITensor(ind)->is_dynamic());
++
++ if (!_as_constants[ind])
++ _nonconst_mgr->releasePlan(ind);
++}
++
++void StaticTensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn)
++{
++ for (const auto &it : _tensors->native_tensors())
++ fn(it.first);
++}
++
++} // namespace cpu
++} // namespace backend
++} // namespace onert
+diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h
+new file mode 100644
+index 0000000..66243a5
+--- /dev/null
++++ b/runtime/onert/backend/cpu/StaticTensorManager.h
+@@ -0,0 +1,61 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
++#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
++
++#include "backend/IStaticTensorManager.h"
++#include "backend/cpu_common/MemoryManager.h"
++#include "backend/cpu_common/TensorRegistry.h"
++#include "backend/ITensorManager.h"
++#include "ir/OperandIndexMap.h"
++#include "ir/OperandInfo.h"
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++
++class StaticTensorManager : public backend::IStaticTensorManager
++{
++public:
++ StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> &reg);
++ virtual ~StaticTensorManager() = default;
++
++ void allocateNonconsts(void);
++ void deallocateNonconsts(void);
++
++ void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info,
++ ir::Layout backend_layout, bool as_const);
++
++ void claimPlan(const ir::OperandIndex &ind, uint32_t size);
++ void releasePlan(const ir::OperandIndex &ind);
++
++ void iterate(const std::function<void(const ir::OperandIndex &)> &fn);
++
++private:
++ std::unique_ptr<cpu_common::MemoryManager> _nonconst_mgr;
++ const std::shared_ptr<cpu_common::TensorRegistry> _tensors;
++ ir::OperandIndexMap<bool> _as_constants;
++};
++
++} // namespace cpu
++} // namespace backend
++} // namespace onert
++
++#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
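
The manager added above separates planning from allocation: buildTensor registers a tensor, claimPlan/releasePlan declare how long its buffer is needed, and allocateNonconsts later hands out addresses from a single planned arena. A minimal standalone sketch of that claim-then-allocate flow, using a trivial bump planner rather than onert's cpu_common::MemoryManager (the class below is illustrative only):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Toy planner: every claimPlan() appends a region to one arena; allocate()
// reserves the arena and getBuffer() resolves a claimed index to its offset.
class ToyStaticPlanner
{
public:
  void claimPlan(int index, uint32_t size)
  {
    _offsets[index] = _total;
    _total += size;
  }

  void allocate() { _arena.resize(_total); }

  uint8_t *getBuffer(int index) { return _arena.data() + _offsets.at(index); }

private:
  std::unordered_map<int, uint32_t> _offsets;
  uint32_t _total = 0;
  std::vector<uint8_t> _arena;
};

A real planner would reuse regions released via releasePlan; the toy version above only grows.
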
+diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h
+index 4dd251b..da16d05 100644
+--- a/runtime/onert/backend/cpu/Tensor.h
++++ b/runtime/onert/backend/cpu/Tensor.h
+@@ -29,8 +29,14 @@ namespace cpu
+
+ using Tensor = cpu_common::Tensor;
+
+-// Tensor which has data from external. To support this, assume below things
+-// no padding, always NHWC layout, constant tensor and not dynamic
++/**
++ * @brief Tensor that wraps data in external memory which is not managed by this backend,
++ * instead of allocating and copying the data. Its data pointer refers to memory that is
++ * already allocated elsewhere, such as an mmapped area, which means an ExternalTensor
++ * can take any kind of ir::Data.
++ * To support this, the following are assumed: no padding, always NHWC layout,
++ * constant tensor, and not dynamic.
++ */
+ class ExternalTensor : public Tensor
+ {
+ public:
+@@ -45,6 +51,11 @@ public:
+ }
+
+ public:
++ /**
++ * @brief Set Data shared from the outside so that this ExternalTensor does not
++ * allocate its own buffer on the CPU backend
++ * @param[in] data Data of the Operand to be set
++ */
+ void setData(const std::shared_ptr<ir::Data> data)
+ {
+ assert(data != nullptr);
+diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc
+index 886e8d8..7eb3ce8 100644
+--- a/runtime/onert/backend/cpu/TensorBuilder.cc
++++ b/runtime/onert/backend/cpu/TensorBuilder.cc
+@@ -29,7 +29,7 @@ namespace cpu
+
+ TensorBuilder::TensorBuilder()
+ : _tensor_reg{new cpu_common::TensorRegistry()},
+- _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)},
++ _static_tensor_mgr{new StaticTensorManager(_tensor_reg)},
+ _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)}
+ {
+ /* empty */
+@@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
+ return _tensor_info_map.find(ind) != _tensor_info_map.end();
+ }
+
+-void TensorBuilder::prepare(void)
+-{
+- _static_tensor_mgr->allocateConsts();
+- _static_tensor_mgr->allocateNonconsts();
+-}
++void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); }
+
+ void TensorBuilder::allocate()
+ {
+@@ -99,17 +95,17 @@ std::shared_ptr<IPortableTensor> TensorBuilder::portableAt(const ir::OperandInde
+ return _tensor_reg->getPortableTensor(ind);
+ }
+
+-bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind,
+- const std::shared_ptr<IPortableTensor> &tensor)
++bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind,
++ const std::shared_ptr<IPortableTensor> &tensor)
+ {
+- return _tensor_reg->setExternalTensor(ind, tensor);
++ return _tensor_reg->setMigrantTensor(ind, tensor);
+ }
+
+ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); }
+
+-std::shared_ptr<cpu_common::Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
++std::shared_ptr<Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
+ {
+- return _tensor_reg->getManagedTensor(ind);
++ return _tensor_reg->getNativeTensor(ind);
+ }
+
+ std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
+diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h
+index ba25451..12ca28c 100644
+--- a/runtime/onert/backend/cpu/TensorBuilder.h
++++ b/runtime/onert/backend/cpu/TensorBuilder.h
+@@ -18,13 +18,14 @@
+ #define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__
+
+ #include <backend/cpu_common/DynamicTensorManager.h>
+-#include <backend/cpu_common/StaticTensorManager.h>
+ #include <backend/cpu_common/TensorRegistry.h>
+-#include <backend/cpu_common/Tensor.h>
+
+ #include <backend/ITensorBuilder.h>
+ #include <ir/OperandIndexMap.h>
+
++#include "StaticTensorManager.h"
++#include "Tensor.h"
++
+ #include <unordered_map>
+
+ namespace onert
+@@ -80,16 +81,16 @@ public:
+ * If not, program will crash with assert or exception.
+ * @return shared_ptr<Tensor>
+ */
+- std::shared_ptr<cpu_common::Tensor> at(const ir::OperandIndex &ind);
++ std::shared_ptr<Tensor> at(const ir::OperandIndex &ind);
+ std::shared_ptr<IPortableTensor> portableAt(const ir::OperandIndex &ind);
+- bool setExternalTensor(const ir::OperandIndex &ind,
+- const std::shared_ptr<IPortableTensor> &tensor) override;
++ bool setMigrantTensor(const ir::OperandIndex &ind,
++ const std::shared_ptr<IPortableTensor> &tensor) override;
+
+ std::shared_ptr<ITensorRegistry> tensorRegistry() override { return _tensor_reg; }
+
+ private:
+ const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
+- std::unique_ptr<cpu_common::StaticTensorManager> _static_tensor_mgr;
++ std::unique_ptr<StaticTensorManager> _static_tensor_mgr;
+ std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr;
+ ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
+ };
+diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc
+index f557f3a..adf902a 100644
+--- a/runtime/onert/backend/cpu/ops/CompareLayer.cc
++++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc
+@@ -17,6 +17,7 @@
+
+ #include "OperationUtils.h"
+
++#include <assert.h>
+ #include <cker/operation/Comparison.h>
+ using namespace nnfw::cker;
+ namespace onert
+@@ -34,6 +35,14 @@ namespace
+ using OpType = onert::ir::operation::Comparison::ComparisonType;
+ using namespace onert::backend::cpu;
+
++// Assumes the enum values are in the following order
++static_assert(static_cast<int>(OpType::Equal) == 0, "An OpType value has changed!");
++static_assert(static_cast<int>(OpType::NotEqual) == 1, "An OpType value has changed!");
++static_assert(static_cast<int>(OpType::Greater) == 2, "An OpType value has changed!");
++static_assert(static_cast<int>(OpType::GreaterEqual) == 3, "An OpType value has changed!");
++static_assert(static_cast<int>(OpType::Less) == 4, "An OpType value has changed!");
++static_assert(static_cast<int>(OpType::LessEqual) == 5, "An OpType value has changed!");
++
+ template <typename T>
+ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output,
+ OpType op_type)
+@@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
+ &params.input2_shift);
+ params.is_broadcast = !HaveSameShapes(lhs, rhs);
+
+- if (params.is_broadcast)
+- {
+- switch (op_type)
+- {
+- case OpType::Equal:
+- Broadcast4DSlowEqualWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::NotEqual:
+- Broadcast4DSlowNotEqualWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Greater:
+- Broadcast4DSlowGreaterWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::GreaterEqual:
+- Broadcast4DSlowGreaterEqualWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Less:
+- Broadcast4DSlowLessWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::LessEqual:
+- Broadcast4DSlowLessEqualWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- default:
+- throw std::runtime_error{"Invalid OpType for CompareLayer"};
+- }
+- }
+- else // if (requires_broadcast == false)
+- {
+- switch (op_type)
+- {
+- case OpType::Equal:
+- EqualWithScaling(params, getExtendedTensorShape(lhs),
+- reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs),
+- reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output),
+- reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::NotEqual:
+- NotEqualWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Greater:
+- GreaterWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::GreaterEqual:
+- GreaterEqualWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Less:
+- LessWithScaling(params, getExtendedTensorShape(lhs),
+- reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs),
+- reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output),
+- reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::LessEqual:
+- LessEqualWithScaling(
+- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- default:
+- throw std::runtime_error{"Invalid OpType for CompareLayer"};
+- }
+- }
+- return;
++ using CompareFunction =
++ void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data,
++ const Shape &input2_shape, const T *input2_data, const Shape &output_shape,
++ bool *output_data);
++
++ static const CompareFunction broadcast_fns[] = {
++ Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling,
++ Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling,
++ Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling,
++ };
++ static const CompareFunction non_broadcast_fns[] = {
++ EqualWithScaling, NotEqualWithScaling, GreaterWithScaling,
++ GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling,
++ };
++
++ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
++ "Sizes of broadcast_fns and non_broadcast_fns must match!");
++
++ auto index = static_cast<int>(op_type);
++ if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0])))
++ throw std::runtime_error{"Invalid OpType for CompareLayer"};
++
++ CompareFunction fn = (params.is_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
++
++ fn(params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
++ getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
++ getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+ }
+
+ template <typename T>
+@@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
+ {
+ bool requires_broadcast = !HaveSameShapes(lhs, rhs);
+
+- if (requires_broadcast)
+- {
+- switch (op_type)
+- {
+- case OpType::Equal:
+- Broadcast4DSlowEqual(
+- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::NotEqual:
+- Broadcast4DSlowNotEqual(
+- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Greater:
+- Broadcast4DSlowGreater(
+- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::GreaterEqual:
+- Broadcast4DSlowGreaterEqual(
+- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Less:
+- Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output),
+- reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::LessEqual:
+- Broadcast4DSlowLessEqual(
+- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- default:
+- throw std::runtime_error{"Invalid OpType for CompareLayer"};
+- }
+- }
+- else // if (requires_broadcast == false)
+- {
+- switch (op_type)
+- {
+- case OpType::Equal:
+- EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::NotEqual:
+- NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output),
+- reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Greater:
+- GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output),
+- reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::GreaterEqual:
+- GreaterEqualNoScaling(
+- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::Less:
+- LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+- break;
+- case OpType::LessEqual:
+- LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+- getExtendedTensorShape(output),
+- reinterpret_cast<bool *>(output->buffer()));
+- break;
+- default:
+- throw std::runtime_error{"Invalid OpType for CompareLayer"};
+- }
+- }
+- return;
++ using CompareFunction =
++ void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
++ const T *input2_data, const Shape &output_shape, bool *output_data);
++
++ static const CompareFunction broadcast_fns[] = {
++ Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater,
++ Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual,
++ };
++ static const CompareFunction non_broadcast_fns[] = {
++ EqualNoScaling, NotEqualNoScaling, GreaterNoScaling,
++ GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling,
++ };
++
++ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
++ "Sizes of broadcast_fns and non_broadcast_fns must match!");
++
++ auto index = static_cast<int>(op_type);
++ if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0])))
++ throw std::runtime_error{"Invalid OpType for CompareLayer"};
++
++ CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
++
++ fn(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
++ getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
++ getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+ }
++
+ } // namespace
+
+ CompareLayer::CompareLayer()
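
The CompareLayer change above replaces two large switch statements with tables of function pointers indexed by the comparison enum, with static_asserts pinning the enum order to the table order. A self-contained sketch of the same dispatch pattern (the enum and functions below are illustrative, not the onert or cker APIs):

#include <cstddef>
#include <stdexcept>

enum class CmpOp { Equal = 0, NotEqual = 1, Greater = 2 };

static bool cmpEqual(int a, int b) { return a == b; }
static bool cmpNotEqual(int a, int b) { return a != b; }
static bool cmpGreater(int a, int b) { return a > b; }

// The table order must follow the enum order; static_asserts guard against reordering.
static_assert(static_cast<int>(CmpOp::Equal) == 0, "enum order changed");
static_assert(static_cast<int>(CmpOp::NotEqual) == 1, "enum order changed");
static_assert(static_cast<int>(CmpOp::Greater) == 2, "enum order changed");

using CmpFn = bool (*)(int, int);
static const CmpFn kCmpFns[] = {cmpEqual, cmpNotEqual, cmpGreater};

bool compare(CmpOp op, int lhs, int rhs)
{
  const auto index = static_cast<std::size_t>(op);
  if (index >= sizeof(kCmpFns) / sizeof(kCmpFns[0]))
    throw std::runtime_error{"invalid comparison op"};
  return kCmpFns[index](lhs, rhs); // one dispatch point instead of a switch per call site
}

The bounds check plays the role of the old default: case, and reordering the enum now fails at compile time instead of silently selecting the wrong kernel.
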
+diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+index c00be64..ff22e32 100644
+--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
++++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+@@ -18,6 +18,7 @@
+
+ #include "../Tensor.h"
+ #include <cker/operation/FullyConnected.h>
++#include <cker/TensorUtils.h>
+
+ namespace onert
+ {
+@@ -112,15 +113,32 @@ void FullyConnectedLayer::fullyConnectedHybrid()
+ getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena);
+
+-// TODO Enable calling decrease_ref
+-#if 0
++// TODO Remove this ifdef
++#ifdef EXPERIMENTAL_RUY_FEATURE
+ if (_cached_weights == nullptr || _is_weights_freed)
+ return;
+
++ // Reaching here ('_cached_weights' is not nullptr and '_is_weights_freed' is false) means
++ // this weight shape satisfies the condition of the ruy kernel's prepack cache.
++ // Once this point is reached it will not be reached again, except in the case below:
++ // the input is a zero vector.
++
++ // If the input's elements are all zero, the ruy kernel path is bypassed,
++ // so that case is handled here.
++ const int input_size = getTensorShape(_input).FlatSize();
++ if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size))
++ return;
++
++ // This weight tensor could also be another op's constant tensor.
++ // Therefore, check its reference count as follows before freeing it.
+ auto weight_tensor = dynamic_cast<const Tensor *>(_weights);
+ if (weight_tensor)
+ {
+ auto tensor = const_cast<Tensor *>(weight_tensor);
++ if (tensor->buffer() == nullptr) // ref is already 0?
++ {
++ _is_weights_freed = true;
++ return;
++ }
+
+ tensor->decrease_ref();
+ if (tensor->buffer() == nullptr) // ref == 0?
+@@ -128,7 +146,7 @@ void FullyConnectedLayer::fullyConnectedHybrid()
+ _is_weights_freed = true;
+ }
+ }
+-#endif // if 0
++#endif
+ #endif
+ }
+
+@@ -167,7 +185,17 @@ void FullyConnectedLayer::run()
+
+ void FullyConnectedLayer::prepare()
+ {
++ if (_bias && _bias->is_constant())
++ {
++ const int bias_size = getTensorShape(_bias).FlatSize();
++ if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size))
++ {
++ _bias = nullptr;
++ }
++ }
++
+ #ifdef USE_RUY_GEMV
++#ifdef EXPERIMENTAL_RUY_FEATURE
+ // TODO This is workaround
+ // The only fc hybrid will use ruy kernel
+ if (_input->data_type() != OperandType::FLOAT32 ||
+@@ -199,6 +227,7 @@ void FullyConnectedLayer::prepare()
+ }
+ }
+ #endif
++#endif
+ }
+
+ } // namespace ops
+diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
+index dd5ef24..e405b24 100644
+--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
++++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
+@@ -72,6 +72,9 @@ private:
+
+ #ifdef USE_RUY_GEMV
+ uint8_t *_cached_weights = nullptr; // weights to be cached and a key
++#ifdef EXPERIMENTAL_RUY_FEATURE
++ bool _is_weights_freed = false; // is weights freed?
++#endif
+ #endif
+ };
+
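
The weight-freeing logic above relies on a reference-counting convention: decrease_ref() drops the count and, once it reaches zero, the underlying buffer is released and buffer() starts returning nullptr. A toy illustration of that convention (this is not the onert Tensor class, just a sketch of the contract the layer checks):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Toy buffer with manual reference counting: storage is released when the
// last reference is dropped, and buffer() == nullptr signals "already freed".
class RefCountedBuffer
{
public:
  explicit RefCountedBuffer(std::size_t size) : _data(size), _num_references(1) {}

  void increase_ref() { ++_num_references; }

  void decrease_ref()
  {
    assert(_num_references > 0);
    if (--_num_references == 0)
    {
      _data.clear();
      _data.shrink_to_fit(); // actually release the memory
    }
  }

  uint8_t *buffer() { return _num_references == 0 ? nullptr : _data.data(); }

private:
  std::vector<uint8_t> _data;
  uint32_t _num_references;
};

This is why the layer checks buffer() == nullptr both before and after decrease_ref(): the first check detects a tensor another op already freed, the second detects that this call dropped the last reference.
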
+diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
+new file mode 100644
+index 0000000..0d99b05
+--- /dev/null
++++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
+@@ -0,0 +1,71 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "L2NormLayer.h"
++
++#include "OperationUtils.h"
++
++#include <cker/operation/L2Normalize.h>
++#include <cker/Types.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++namespace ops
++{
++
++void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output)
++{
++ assert(input != nullptr);
++ assert(output != nullptr);
++
++ _input = input;
++ _output = output;
++}
++
++void L2NormLayer::run()
++{
++ switch (_input->data_type())
++ {
++ case OperandType::FLOAT32:
++ nnfw::cker::L2NormalizeFloat32(
++ getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
++ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
++ break;
++
++ case OperandType::QUANT_UINT8_ASYMM:
++ {
++ nnfw::cker::L2NormParams params;
++ assert(_input->data_offset() == 128);
++ params.input_zero_point = _input->data_offset();
++ nnfw::cker::L2NormalizeQuant8(
++ params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
++ getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
++ }
++ break;
++
++ default:
++ throw std::runtime_error{"L2Norm: Unsupported data type"};
++ }
++}
++
++} // namespace ops
++} // namespace cpu
++} // namespace backend
++} // namespace onert
+diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h
+new file mode 100644
+index 0000000..63f2d11
+--- /dev/null
++++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h
+@@ -0,0 +1,55 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
++#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
++
++#include <backend/IPortableTensor.h>
++
++#include <exec/IFunction.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++namespace ops
++{
++class L2NormLayer : public ::onert::exec::IFunction
++{
++public:
++ L2NormLayer() : _input(nullptr), _output(nullptr)
++ {
++ // Nothing
++ }
++
++public:
++ void configure(const IPortableTensor *input, IPortableTensor *output);
++
++ void run() override;
++
++private:
++ const IPortableTensor *_input;
++ IPortableTensor *_output;
++};
++
++} // namespace ops
++} // namespace cpu
++} // namespace backend
++} // namespace onert
++
++#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
+diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
+index d71e325..06dde4f 100644
+--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
++++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
+@@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8()
+ // NYI
+ }
+
+-void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis,
+- Tensor *output)
++void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis,
++ IPortableTensor *output)
+ {
+ _input = input;
+ _output = output;
+diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
+index bc145ce..ba9deca 100644
+--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
++++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
+@@ -40,13 +40,14 @@ public:
+
+ void logsoftmaxQuant8();
+
+- void configure(const Tensor *input, const float beta, const int axis, Tensor *output);
++ void configure(const IPortableTensor *input, const float beta, const int axis,
++ IPortableTensor *output);
+
+ void run();
+
+ private:
+- const Tensor *_input;
+- Tensor *_output;
++ const IPortableTensor *_input;
++ IPortableTensor *_output;
+
+ float _beta;
+ int _axis;
+diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h
+index 8d29374..9838552 100644
+--- a/runtime/onert/backend/cpu/ops/OperationUtils.h
++++ b/runtime/onert/backend/cpu/ops/OperationUtils.h
+@@ -52,6 +52,17 @@ union DataPtr {
+ void *v;
+ };
+
++union ConstDataPtr {
++ const uint8_t *u8;
++ const int8_t *i8;
++ const uint32_t *u32;
++ const int32_t *i32;
++ const bool *b;
++ const float *f;
++ const int64_t *i64;
++ const void *v;
++};
++
+ uint32_t getNumberOfDimensions(const IPortableTensor *tensor);
+
+ uint32_t getNumberOfElements(const IPortableTensor *tensor);
+diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc
+index fcfcf7b..6a2bf9d 100644
+--- a/runtime/onert/backend/cpu/ops/PadLayer.cc
++++ b/runtime/onert/backend/cpu/ops/PadLayer.cc
+@@ -33,33 +33,40 @@ PadLayer::PadLayer()
+ // DO NOTHING
+ }
+
+-void PadLayer::padFloat32()
++template <typename T> void PadLayer::padImpl(const T *constant_value_data)
+ {
+- nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input),
+- reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output),
+- reinterpret_cast<float *>(_output->buffer()), _constantValueData.f);
++ nnfw::cker::Pad<T>(_padData, _padRank, getTensorShape(_input),
++ reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
++ reinterpret_cast<T *>(_output->buffer()), constant_value_data);
+ }
+-void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); }
+
+ void PadLayer::configure(const IPortableTensor *input, IPortableTensor *output,
+- const int32_t *padData, int32_t padRank, uint8_t *constantValueData)
++ const int32_t *padData, int32_t padRank, const void *constantValueData)
+ {
+ _input = input;
+ _output = output;
+ memcpy(_padData, padData, sizeof(_padData));
+ _padRank = padRank;
+- _constantValueData.u8 = constantValueData;
++ _constantValueData.v = constantValueData;
+ }
+
+ void PadLayer::run()
+ {
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+- padFloat32();
++ padImpl<float>(_constantValueData.f);
+ }
+ else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
+ {
+- padQuant8();
++ if (_constantValueData.u8 == nullptr)
++ {
++ uint8_t pad_value = static_cast<uint8_t>(_output->data_offset());
++ padImpl<uint8_t>(&pad_value);
++ }
++ else
++ {
++ padImpl<uint8_t>(_constantValueData.u8);
++ }
+ }
+ else
+ {
+diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h
+index 85bd2e6..efd73d5 100644
+--- a/runtime/onert/backend/cpu/ops/PadLayer.h
++++ b/runtime/onert/backend/cpu/ops/PadLayer.h
+@@ -39,12 +39,10 @@ public:
+ PadLayer();
+
+ public:
+- void padFloat32();
+-
+- void padQuant8();
++ template <typename T> void padImpl(const T *constant_value_data);
+
+ void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData,
+- int32_t padRank, uint8_t *constantValueData = nullptr);
++ int32_t padRank, const void *constantValueData = nullptr);
+
+ void run() override;
+
+@@ -54,7 +52,7 @@ private:
+
+ int32_t _padData[8];
+ int32_t _padRank;
+- DataPtr _constantValueData;
++ ConstDataPtr _constantValueData;
+ };
+
+ } // namespace ops
+diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
+new file mode 100644
+index 0000000..45fc148
+--- /dev/null
++++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
+@@ -0,0 +1,63 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "QuantizeLayer.h"
++
++#include <cker/operation/Quantize.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++namespace ops
++{
++
++QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr)
++{
++ // DO NOTHING
++}
++
++template <typename InputT, typename OutputT> void QuantizeLayer::affineQuantize()
++{
++ nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast<const InputT *>(_input->buffer()),
++ getTensorShape(_output), reinterpret_cast<OutputT *>(_output->buffer()),
++ _output->data_scale(), _output->data_offset());
++}
++
++void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output)
++{
++ _input = input;
++ _output = output;
++}
++
++void QuantizeLayer::run()
++{
++ if (_input->data_type() == OperandType::FLOAT32)
++ {
++ affineQuantize<float, uint8_t>();
++ }
++ else
++ {
++ throw std::runtime_error{"Quantize: unsupported data type"};
++ }
++}
++
++} // namespace ops
++} // namespace cpu
++} // namespace backend
++} // namespace onert
+diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
+new file mode 100644
+index 0000000..b4e7aca
+--- /dev/null
++++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
+@@ -0,0 +1,56 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
++#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
++
++#include <backend/IPortableTensor.h>
++#include "OperationUtils.h"
++
++#include <exec/IFunction.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++namespace ops
++{
++
++class QuantizeLayer : public ::onert::exec::IFunction
++{
++public:
++ QuantizeLayer();
++
++public:
++ template <typename InputT, typename OutputT> void affineQuantize();
++
++ void configure(const IPortableTensor *input, IPortableTensor *output);
++
++ void run() override;
++
++private:
++ const IPortableTensor *_input;
++ IPortableTensor *_output;
++};
++
++} // namespace ops
++} // namespace cpu
++} // namespace backend
++} // namespace onert
++
++#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
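
QuantizeLayer::run dispatches float input to nnfw::cker::Quantize with the output tensor's scale and zero point. For reference, affine float-to-uint8 quantization generally follows the sketch below; the exact rounding and clamping used by cker are an assumption here, not taken from its source:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Affine quantization: q = zero_point + round(x / scale), clamped to the uint8 range.
std::vector<uint8_t> quantizeAffine(const std::vector<float> &input, float scale,
                                    int32_t zero_point)
{
  std::vector<uint8_t> output(input.size());
  for (std::size_t i = 0; i < input.size(); ++i)
  {
    const int32_t q = zero_point + static_cast<int32_t>(std::lround(input[i] / scale));
    output[i] = static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
  }
  return output;
}

Dequantization is the inverse, x ≈ scale * (q - zero_point), which is why the scale and zero point are read from the quantized output tensor in the layer above.
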
+diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc
+index a9106c1..449c073 100644
+--- a/runtime/onert/backend/cpu/ops/SliceLayer.cc
++++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc
+@@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b
+ }
+ }
+
+-void SliceLayer::sliceFloat32()
++template <typename T> void SliceLayer::sliceImpl()
+ {
+ const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize;
+
+@@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32()
+ }
+
+ nnfw::cker::Slice(op_params, getExtendedTensorShape(_input),
+- reinterpret_cast<const float *>(_input->buffer()),
+- reinterpret_cast<float *>(_output->buffer()));
+-}
+-
+-void SliceLayer::sliceQuant8()
+-{
+- // cker quant8 slice is not implemented yet
+- throw std::runtime_error{"NYI"};
++ reinterpret_cast<const T *>(_input->buffer()),
++ reinterpret_cast<T *>(_output->buffer()));
+ }
+
+ void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin,
+@@ -97,11 +91,11 @@ void SliceLayer::run()
+ {
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+- sliceFloat32();
++ sliceImpl<float>();
+ }
+ else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
+ {
+- sliceQuant8();
++ sliceImpl<uint8_t>();
+ }
+ else
+ {
+diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h
+index 9945d7e..650e2c9 100644
+--- a/runtime/onert/backend/cpu/ops/SliceLayer.h
++++ b/runtime/onert/backend/cpu/ops/SliceLayer.h
+@@ -42,8 +42,7 @@ public:
+ void run() override;
+
+ private:
+- void sliceFloat32();
+- void sliceQuant8();
++ template <typename T> void sliceImpl();
+
+ template <typename T>
+ void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin,
+diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
+new file mode 100644
+index 0000000..110b0bc
+--- /dev/null
++++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
+@@ -0,0 +1,70 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "SpaceToDepthLayer.h"
++
++#include "OperationUtils.h"
++
++#include <cker/operation/SpaceToDepth.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++namespace ops
++{
++SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr)
++{
++ // DO NOTHING
++}
++
++template <typename T> void SpaceToDepthLayer::spaceToDepth()
++{
++
++ nnfw::cker::SpaceToDepthParams params;
++ params.block_size = _block_size;
++
++ nnfw::cker::SpaceToDepth(params, getTensorShape(_input),
++ reinterpret_cast<const T *>(_input->buffer()),
++ getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()));
++}
++
++void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size,
++ IPortableTensor *output)
++{
++ _input = input;
++ _block_size = block_size;
++ _output = output;
++}
++
++void SpaceToDepthLayer::run()
++{
++ if (_input->data_type() == OperandType::FLOAT32)
++ {
++ spaceToDepth<float>();
++ }
++ else
++ {
++ throw std::runtime_error{"SpaceToDepth: unsupported data type"};
++ }
++}
++
++} // namespace ops
++} // namespace cpu
++} // namespace backend
++} // namespace onert
+diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h
+new file mode 100644
+index 0000000..c11ef2b
+--- /dev/null
++++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h
+@@ -0,0 +1,54 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
++#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
++
++#include <backend/IPortableTensor.h>
++
++#include <exec/IFunction.h>
++
++namespace onert
++{
++namespace backend
++{
++namespace cpu
++{
++namespace ops
++{
++class SpaceToDepthLayer : public ::onert::exec::IFunction
++{
++public:
++ SpaceToDepthLayer();
++
++ void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output);
++
++ void run() override;
++
++private:
++ template <typename T> void spaceToDepth();
++
++ const IPortableTensor *_input;
++ int32_t _block_size;
++ IPortableTensor *_output;
++};
++
++} // namespace ops
++} // namespace cpu
++} // namespace backend
++} // namespace onert
++
++#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
+diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h
+index a49525b..b760cda 100644
+--- a/runtime/onert/core/include/backend/ITensorBuilder.h
++++ b/runtime/onert/core/include/backend/ITensorBuilder.h
+@@ -112,12 +112,12 @@ public: // methods for static tensor allocation
+ virtual std::shared_ptr<ITensor> tensorAt(const ir::OperandIndex &ind) = 0;
+
+ /**
+- * @brief Set the External Tensor object
++ * @brief Set the migrant tensor object
+ *
+ * @return true if succeeded
+ * @return false if failed or unsupported
+ */
+- virtual bool setExternalTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &)
++ virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &)
+ {
+ return false;
+ }
+diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h
+index f5a95f4..8555131 100644
+--- a/runtime/onert/core/include/backend/ITensorRegistry.h
++++ b/runtime/onert/core/include/backend/ITensorRegistry.h
+@@ -35,17 +35,22 @@ struct ITensorRegistry
+ virtual ~ITensorRegistry() = default;
+
+ /**
+- * @brief Returns pointer of ITensor among managed and external tensors
++ * @brief Returns pointer of ITensor among native and migrant tensors
++ *
++ * A Native Tensor is a tensor that is managed by this backend.
++ * A Migrant Tensor is a tensor that is imported from another backend.
++ *
+ * @note Return tensor cannot be used longer than dynamic tensor manager
+ */
+ virtual std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &) = 0;
+ /**
+- * @brief Returns pointer of ITensor among managed tensors
++ * @brief Returns pointer of ITensor among native tensors
+ *
+- * Unlike @c getITensor , this function only searches from managed tensors
+- * @note Return tensor cannot be used longer than dynamic tensor manager
++ * Unlike @c getITensor , this function only searches from native tensors
++ *
++ * @note Returned tensor cannot be used longer than dynamic tensor manager
+ */
+- virtual std::shared_ptr<ITensor> getManagedITensor(const ir::OperandIndex &) = 0;
++ virtual std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &) = 0;
+ };
+
+ } // namespace backend
+@@ -73,68 +78,67 @@ public:
+ std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override
+ {
+ static_assert(std::is_base_of<ITensor, T_Tensor>::value, "T_Tensor must derive from ITensor.");
+- auto external_tensor = _external.find(ind);
+- if (external_tensor != _external.end())
++ auto external_tensor = _migrant.find(ind);
++ if (external_tensor != _migrant.end())
+ return external_tensor->second;
+- return getManagedTensor(ind);
++ return getNativeTensor(ind);
+ }
+
+- std::shared_ptr<ITensor> getManagedITensor(const ir::OperandIndex &ind) override
++ std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override
+ {
+- return getManagedTensor(ind);
++ return getNativeTensor(ind);
+ }
+
+ std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind)
+ {
+- auto external_tensor = _external.find(ind);
+- if (external_tensor != _external.end())
++ auto external_tensor = _migrant.find(ind);
++ if (external_tensor != _migrant.end())
+ {
+ if (external_tensor->second)
+ return external_tensor->second;
+ }
+- return getManagedTensor(ind);
++ return getNativeTensor(ind);
+ }
+
+- std::shared_ptr<T_Tensor> getManagedTensor(const ir::OperandIndex &ind)
++ std::shared_ptr<T_Tensor> getNativeTensor(const ir::OperandIndex &ind)
+ {
+- auto tensor = _managed.find(ind);
+- if (tensor != _managed.end())
++ auto tensor = _native.find(ind);
++ if (tensor != _native.end())
+ return tensor->second;
+ return nullptr;
+ }
+
+- bool setExternalTensor(const ir::OperandIndex &ind,
+- const std::shared_ptr<IPortableTensor> &tensor)
++ bool setMigrantTensor(const ir::OperandIndex &ind, const std::shared_ptr<IPortableTensor> &tensor)
+ {
+ // TODO Uncomment this as two tensors for an index is not allowed.
+ // But now it is temporarily allowed as a workaround. External one hides Managed one.
+- // auto itr = _managed.find(ind);
+- // if (itr != _managed.end() && itr->second != nullptr && tensor != nullptr)
++ // auto itr = _native.find(ind);
++ // if (itr != _native.end() && itr->second != nullptr && tensor != nullptr)
+ // throw std::runtime_error{
+- // "Tried to set an external tensor but an managed tensor already exists."};
+- _external[ind] = tensor;
++ // "Tried to set an migrant tensor but an native tensor already exists."};
++ _migrant[ind] = tensor;
+ return true;
+ }
+
+- void setManagedTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor)
++ void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor)
+ {
+- auto itr = _external.find(ind);
+- if (itr != _external.end() && itr->second != nullptr && tensor != nullptr)
++ auto itr = _migrant.find(ind);
++ if (itr != _migrant.end() && itr->second != nullptr && tensor != nullptr)
+ throw std::runtime_error{
+- "Tried to set a managed tensor but an external tensor already exists."};
+- _managed[ind] = tensor;
++ "Tried to set a native tensor but an migrant tensor already exists."};
++ _native[ind] = tensor;
+ }
+
+- const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &managed_tensors() { return _managed; }
++ const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &native_tensors() { return _native; }
+
+- const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &external_tensors()
++ const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &migrant_tensors()
+ {
+- return _external;
++ return _migrant;
+ }
+
+ private:
+- ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _external;
+- ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _managed;
++ ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _migrant;
++ ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _native;
+ };
+
+ } // namespace backend
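
The renamed registry above keeps two maps and resolves getITensor migrant-first, falling back to the backend's own native tensors. A stripped-down sketch of that lookup precedence with generic types (not the onert interfaces):

#include <memory>
#include <unordered_map>

struct ITensorLike
{
  virtual ~ITensorLike() = default;
};

// Minimal registry: migrant tensors (imported from another backend) shadow
// native tensors (owned by this backend) for the same index.
class MiniTensorRegistry
{
public:
  std::shared_ptr<ITensorLike> getTensor(int index) const
  {
    auto migrant = _migrant.find(index);
    if (migrant != _migrant.end())
      return migrant->second;
    auto native = _native.find(index);
    return native != _native.end() ? native->second : nullptr;
  }

  void setMigrantTensor(int index, std::shared_ptr<ITensorLike> tensor) { _migrant[index] = tensor; }
  void setNativeTensor(int index, std::shared_ptr<ITensorLike> tensor) { _native[index] = tensor; }

private:
  std::unordered_map<int, std::shared_ptr<ITensorLike>> _migrant;
  std::unordered_map<int, std::shared_ptr<ITensorLike>> _native;
};

Registering the same index in both maps means the migrant entry shadows the native one, which is exactly the temporary workaround the TODO above describes.
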
+diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
+index 6ddacc7..a7e034a 100644
+--- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
++++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
+@@ -19,7 +19,7 @@
+
+ #include "MemoryManager.h"
+
+-#include "backend/ITensorManager.h"
++#include "backend/IStaticTensorManager.h"
+ #include "ir/OperandIndexMap.h"
+ #include "ir/OperandInfo.h"
+ #include "TensorRegistry.h"
+@@ -31,7 +31,7 @@ namespace backend
+ namespace cpu_common
+ {
+
+-class StaticTensorManager : public backend::ITensorManager
++class StaticTensorManager : public backend::IStaticTensorManager
+ {
+ public:
+ StaticTensorManager(const std::shared_ptr<TensorRegistry> &reg);
+diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h
+index 379143b..b3391a3 100644
+--- a/runtime/onert/core/include/compiler/StaticShapeInference.h
++++ b/runtime/onert/core/include/compiler/StaticShapeInference.h
+@@ -99,6 +99,7 @@ private:
+ void visit(const ir::operation::LogicalNot &op) override;
+ void visit(const ir::operation::LogicalOr &op) override;
+ void visit(const ir::operation::Logistic &op) override;
++ void visit(const ir::operation::L2Normalization &op) override;
+ void visit(const ir::operation::MatrixBandPart &op) override;
+ void visit(const ir::operation::Max &op) override;
+ void visit(const ir::operation::Min &op) override;
+diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h
+index 113c348..601c1bf 100644
+--- a/runtime/onert/core/include/exec/DynamicShapeInference.h
++++ b/runtime/onert/core/include/exec/DynamicShapeInference.h
+@@ -72,6 +72,7 @@ public:
+ void visit(const ir::operation::LogicalNot &op) override;
+ void visit(const ir::operation::LogicalOr &op) override;
+ void visit(const ir::operation::Logistic &op) override;
++ void visit(const ir::operation::L2Normalization &op) override;
+ void visit(const ir::operation::MatrixBandPart &op) override;
+ void visit(const ir::operation::Max &op) override;
+ void visit(const ir::operation::Min &op) override;
+diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h
+index 5fac54e..e3b5d19 100644
+--- a/runtime/onert/core/include/ir/Operations.Include.h
++++ b/runtime/onert/core/include/ir/Operations.Include.h
+@@ -103,3 +103,4 @@
+ #include "ir/operation/BatchMatMul.h"
+ #include "ir/operation/FusedBatchNorm.h"
+ #include "ir/operation/LogSoftmax.h"
++#include "ir/operation/Quantize.h"
+diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst
+index 9d0642f..03a2aa2 100644
+--- a/runtime/onert/core/include/ir/Operations.lst
++++ b/runtime/onert/core/include/ir/Operations.lst
+@@ -106,3 +106,4 @@ OP(MatrixBandPart)
+ OP(BatchMatMul)
+ OP(FusedBatchNorm)
+ OP(LogSoftmax)
++OP(Quantize)
+diff --git a/runtime/onert/core/include/ir/operation/LogSoftmax.h b/runtime/onert/core/include/ir/operation/LogSoftmax.h
+index 26a92d7..391b4ba 100644
+--- a/runtime/onert/core/include/ir/operation/LogSoftmax.h
++++ b/runtime/onert/core/include/ir/operation/LogSoftmax.h
+@@ -48,7 +48,7 @@ public:
+
+ public:
+ void accept(OperationVisitor &v) const override;
+- OpCode opcode() const final { return OpCode::Softmax; }
++ OpCode opcode() const final { return OpCode::LogSoftmax; }
+
+ public:
+ const Param &param() const { return _param; }
+diff --git a/runtime/onert/core/include/ir/operation/Pad.h b/runtime/onert/core/include/ir/operation/Pad.h
+index a486061..00481cd 100644
+--- a/runtime/onert/core/include/ir/operation/Pad.h
++++ b/runtime/onert/core/include/ir/operation/Pad.h
+@@ -33,7 +33,7 @@ public:
+ {
+ INPUT = 0,
+ PAD = 1,
+- // VALUE = 2 Not allow padding value operand yet
++ VALUE = 2
+ };
+
+ public:
+diff --git a/runtime/onert/core/include/ir/operation/Quantize.h b/runtime/onert/core/include/ir/operation/Quantize.h
+new file mode 100644
+index 0000000..2533ce4
+--- /dev/null
++++ b/runtime/onert/core/include/ir/operation/Quantize.h
+@@ -0,0 +1,49 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#ifndef __ONERT_IR_OPERATION_QUANTIZE_H__
++#define __ONERT_IR_OPERATION_QUANTIZE_H__
++
++#include "ir/Operation.h"
++
++namespace onert
++{
++namespace ir
++{
++namespace operation
++{
++
++class Quantize : public Operation
++{
++public:
++ enum Input
++ {
++ INPUT = 0,
++ };
++
++public:
++ Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs);
++
++public:
++ void accept(OperationVisitor &v) const override;
++ OpCode opcode() const final { return OpCode::Quantize; }
++};
++
++} // namespace operation
++} // namespace ir
++} // namespace onert
++
++#endif // __ONERT_IR_OPERATION_QUANTIZE_H__
+diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
+index 32a8041..c374aba 100644
+--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
++++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
+@@ -36,7 +36,7 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<cpu_common::Ten
+ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape)
+ {
+ // NOTE Handle user tensors first
+- auto user_tensor = _user_tensors->getManagedTensor(ind);
++ auto user_tensor = _user_tensors->getNativeTensor(ind);
+ if (user_tensor)
+ {
+ // User tensors cannot be reallocated.
+@@ -47,8 +47,8 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha
+ user_tensor->setShape(new_shape);
+ }
+
+- // NOTE Then handle managed tensors
+- auto tensor = _tensors->getManagedTensor(ind);
++ // NOTE Then handle native tensors
++ auto tensor = _tensors->getNativeTensor(ind);
+ assert(tensor);
+
+ bool previously_dynamic = tensor->is_dynamic();
+@@ -101,9 +101,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
+ const ir::OperandInfo &tensor_info,
+ ir::Layout backend_layout)
+ {
+- assert(_tensors->getManagedTensor(ind) == nullptr);
++ assert(_tensors->getNativeTensor(ind) == nullptr);
+ auto tensor = std::make_shared<cpu_common::Tensor>(tensor_info, backend_layout);
+- _tensors->setManagedTensor(ind, tensor);
++ _tensors->setNativeTensor(ind, tensor);
+ }
+
+ void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
+@@ -130,7 +130,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
+ auto &input_set = find->second;
+ for (auto input_ind : input_set)
+ {
+- if (!_tensors->getManagedTensor(input_ind)->is_dynamic())
++ if (!_tensors->getNativeTensor(input_ind)->is_dynamic())
+ continue;
+
+ _dynamic_mem_mgr->deallocate(input_ind);
+@@ -141,7 +141,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
+
+ void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
+ {
+- if (!_tensors->getManagedTensor(output_ind)->is_dynamic())
++ if (!_tensors->getNativeTensor(output_ind)->is_dynamic())
+ return;
+
+ _dynamic_mem_mgr->deallocate(output_ind);
+diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
+index 4b683fb..eb83b7d 100644
+--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
++++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
+@@ -81,23 +81,23 @@ void KernelGenerator::visit(const ir::operation::If &node)
+ std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
+ for (const auto input_index : node.getInputs())
+ {
+- auto input_alloc = getTensor(input_index);
++ auto input_tensor = getTensor(input_index);
+
+- input_tensors.emplace_back(input_alloc);
++ input_tensors.emplace_back(input_tensor);
+ }
+
+ std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
+ exec::DynAllocInfoMap outputs_dyn_alloc_info;
+ for (const auto output_index : node.getOutputs())
+ {
+- auto output_alloc = getTensor(output_index);
++ auto output_tensor = getTensor(output_index);
+
+- output_tensors.emplace_back(output_alloc);
++ output_tensors.emplace_back(output_tensor);
+ const auto output_tensor_builder = getTensorBuilder(output_index);
+ if (output_tensor_builder->supportDynamicTensor())
+ {
+ auto output_dyn_manager = output_tensor_builder->dynamicTensorManager();
+- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager};
++ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager};
+ }
+ }
+
+@@ -146,24 +146,24 @@ void KernelGenerator::visit(const ir::operation::While &node)
+ std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
+ for (const auto input_index : node.getInputs())
+ {
+- auto input_alloc = getTensor(input_index);
++ auto input_tensor = getTensor(input_index);
+
+- input_tensors.emplace_back(input_alloc);
++ input_tensors.emplace_back(input_tensor);
+ }
+
+ std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
+ std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info;
+ for (const auto output_index : node.getOutputs())
+ {
+- auto output_alloc = getTensor(output_index);
++ auto output_tensor = getTensor(output_index);
+
+- output_tensors.emplace_back(output_alloc);
++ output_tensors.emplace_back(output_tensor);
+
+ const auto output_tensor_builder = getTensorBuilder(output_index);
+ if (output_tensor_builder->supportDynamicTensor())
+ {
+ auto output_dyn_manager = output_tensor_builder->dynamicTensorManager();
+- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager};
++ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager};
+ }
+ }
+
+@@ -199,7 +199,7 @@ KernelGenerator::getTensorBuilder(const ir::OperandIndex &index)
+ for (auto tensor_builder : _tensor_builder_set)
+ {
+ auto reg = tensor_builder->tensorRegistry();
+- auto tensor = reg ? reg->getManagedITensor(index) : tensor_builder->tensorAt(index);
++ auto tensor = reg ? reg->getNativeITensor(index) : tensor_builder->tensorAt(index);
+ if (tensor)
+ {
+ ret = tensor_builder;
+diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
+index 16cd3ec..5bddb91 100644
+--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
++++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
+@@ -92,7 +92,7 @@ void TensorBuilder::allocate()
+ std::shared_ptr<ITensor> TensorBuilder::tensorAt(const ir::OperandIndex &ind)
+ {
+ // NOTE Find from User Tensor Registry first
+- // FIXME There may be both user tensor and managed tensor for a `ind` which is a waste
++ // FIXME There may be both user tensor and native tensor for a `ind` which is a waste
+ auto user_tensor = _user_tensor_reg->getITensor(ind);
+ auto tensor = _tensor_reg->getITensor(ind);
+ if (user_tensor)
+@@ -107,7 +107,7 @@ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->ite
+
+ std::shared_ptr<cpu_common::Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
+ {
+- return _tensor_reg->getManagedTensor(ind);
++ return _tensor_reg->getNativeTensor(ind);
+ }
+
+ std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
+@@ -123,7 +123,7 @@ std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void)
+ void TensorBuilder::setUserTensor(const ir::OperandIndex &ind,
+ const std::shared_ptr<UserTensor> &tensor)
+ {
+- _user_tensor_reg->setManagedTensor(ind, tensor);
++ _user_tensor_reg->setNativeTensor(ind, tensor);
+ }
+
+ } // namespace controlflow
+diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h
+index ce94ea0..b9b2d52 100644
+--- a/runtime/onert/core/src/backend/controlflow/UserTensor.h
++++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h
+@@ -68,6 +68,7 @@ public:
+ void set_dynamic() override { _dynamic = true; }
+ ir::Shape getShape() const override { return _info.shape(); }
+ void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); }
++ bool is_constant() const override { return false; }
+
+ private:
+ ir::OperandInfo _info;
+diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
+index 0ccf700..ede403b 100644
+--- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
++++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
+@@ -35,7 +35,7 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha
+ {
+ VERBOSE_F() << ind << std::endl;
+
+- auto tensor = _tensors->getManagedTensor(ind);
++ auto tensor = _tensors->getNativeTensor(ind);
+ assert(tensor);
+
+ bool previously_dynamic = tensor->is_dynamic();
+@@ -88,9 +88,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
+ const ir::OperandInfo &tensor_info,
+ ir::Layout backend_layout)
+ {
+- assert(_tensors->getManagedTensor(ind) == nullptr);
++ assert(_tensors->getNativeTensor(ind) == nullptr);
+ auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
+- _tensors->setManagedTensor(ind, tensor);
++ _tensors->setNativeTensor(ind, tensor);
+ }
+
+ void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
+@@ -117,7 +117,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
+ auto &input_set = find->second;
+ for (auto input_ind : input_set)
+ {
+- auto *tensor = _tensors->getManagedTensor(input_ind).get();
++ auto *tensor = _tensors->getNativeTensor(input_ind).get();
+ if (!tensor->is_dynamic())
+ continue;
+
+@@ -131,7 +131,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
+
+ void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
+ {
+- auto *tensor = _tensors->getManagedTensor(output_ind).get();
++ auto *tensor = _tensors->getNativeTensor(output_ind).get();
+ if (!tensor->is_dynamic())
+ return;
+
+diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
+index 47bea35..8604542 100644
+--- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
++++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
+@@ -33,7 +33,7 @@ StaticTensorManager::StaticTensorManager(const std::shared_ptr<TensorRegistry> &
+
+ void StaticTensorManager::allocateConsts(void)
+ {
+- for (auto &pair : _tensors->managed_tensors())
++ for (auto &pair : _tensors->native_tensors())
+ {
+ const auto &ind = pair.first;
+ auto tensor = pair.second;
+@@ -42,9 +42,9 @@ void StaticTensorManager::allocateConsts(void)
+ auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size());
+ tensor->setBuffer(mem_alloc);
+ auto buffer = mem_alloc->base();
+- VERBOSE(CPU_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value()
+- << "): " << static_cast<void *>(buffer)
+- << "size : " << tensor->total_size() << std::endl;
++ VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value()
++ << "): " << static_cast<void *>(buffer)
++ << "size : " << tensor->total_size() << std::endl;
+ }
+ }
+ }
+@@ -53,7 +53,7 @@ void StaticTensorManager::allocateNonconsts(void)
+ {
+ _nonconst_mgr->allocate();
+
+- for (auto &pair : _tensors->managed_tensors())
++ for (auto &pair : _tensors->native_tensors())
+ {
+ const auto &ind = pair.first;
+ auto tensor = pair.second;
+@@ -62,8 +62,8 @@ void StaticTensorManager::allocateNonconsts(void)
+ auto *buffer = _nonconst_mgr->getBuffer(ind);
+ tensor->setBuffer(buffer);
+
+- VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value()
+- << "): " << static_cast<void *>(buffer) << std::endl;
++ VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value()
++ << "): " << static_cast<void *>(buffer) << std::endl;
+ }
+ }
+ }
+@@ -76,18 +76,18 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
+ const ir::OperandInfo &tensor_info, ir::Layout backend_layout,
+ bool as_const)
+ {
+- assert(!_tensors->getManagedTensor(ind));
++ assert(!_tensors->getNativeTensor(ind));
+ auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
+- _tensors->setManagedTensor(ind, tensor);
++ _tensors->setNativeTensor(ind, tensor);
+ _as_constants[ind] = as_const;
+ }
+
+ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
+ {
+- assert(_tensors->getManagedTensor(ind));
++ assert(_tensors->getNativeTensor(ind));
+
+ // This method is called only when a tensor has proper shape
+- assert(!_tensors->getManagedTensor(ind)->is_dynamic());
++ assert(!_tensors->getNativeTensor(ind)->is_dynamic());
+
+ if (!_as_constants[ind])
+ _nonconst_mgr->claimPlan(ind, size);
+@@ -95,10 +95,10 @@ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
+
+ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
+ {
+- assert(_tensors->getManagedTensor(ind));
++ assert(_tensors->getNativeTensor(ind));
+
+ // This method is called only when a tensor has proper shape
+- assert(!_tensors->getManagedTensor(ind)->is_dynamic());
++ assert(!_tensors->getNativeTensor(ind)->is_dynamic());
+
+ if (!_as_constants[ind])
+ _nonconst_mgr->releasePlan(ind);
+@@ -106,7 +106,7 @@ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
+
+ void StaticTensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn)
+ {
+- for (const auto &it : _tensors->managed_tensors())
++ for (const auto &it : _tensors->native_tensors())
+ fn(it.first);
+ }
+
+diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
+index f3f69ad..8439b6a 100644
+--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc
++++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
+@@ -201,18 +201,35 @@ ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
+ // Add tensor to controlflow TensorRegistry.
+ cf_tensor_builder->setUserTensor(ind, tensor);
+ ret.push_back(tensor);
+-
+- // Set other tensors as external tensors
+- for (auto &tensor_builder : tensor_builders)
+- {
+- // FIXME This is a workaround registering all user tensors to all backends
+- // FIXME Handle when it is failed
+- tensor_builder->setExternalTensor(ind, tensor);
+- }
+ }
+ return ret;
+ }
+
++void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph,
++ TensorBuilders &tensor_builders)
++{
++ lowered_graph.op_seqs().iterate(
++ [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
++ auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
++ auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
++ for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
++ ir::Remove::UNDEFINED)
++ {
++          // If an OpSequence input/output tensor does not have its own tensor object,
++          // it must be using an external tensor, so find the tensor in the other tensor builders
++          // and set it on this tensor builder if it is portable
++ if (!backend_ctx->tensor_builder->tensorAt(ind))
++ {
++ auto tensor = tensor_builders.getITensor(ind);
++ assert(tensor); // The tensor must have been created in one of TensorBuilders
++ auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
++ if (ptensor)
++ backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor);
++ }
++ }
++ });
++}
++
+ exec::IExecutor *
+ ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
+ const compiler::CompilerOptions &options,
+@@ -265,6 +282,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_
+ tensor_builder->prepare();
+ }
+
++ prepareExternalTensors(*lowered_graph, tensor_builders);
++
+ ExecutionBuilder builder;
+
+ // Generate kernels
+@@ -367,6 +386,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
+ tensor_builder->prepare();
+ }
+
++ prepareExternalTensors(*lowered_graph, tensor_builders);
++
+ ExecutionBuilder builder;
+
+ // Generate kernels
+diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h
+index 1e82b98..418e5a7 100644
+--- a/runtime/onert/core/src/compiler/ExecutorFactory.h
++++ b/runtime/onert/core/src/compiler/ExecutorFactory.h
+@@ -22,6 +22,7 @@
+ #include "backend/ITensor.h"
+ #include "exec/IExecutor.h"
+ #include "ir/LoweredGraph.h"
++#include "TensorBuilders.h"
+
+ namespace onert
+ {
+@@ -48,6 +49,8 @@ private:
+ static std::vector<std::shared_ptr<backend::ITensor>>
+ initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
+ const ir::OperandIndexSequence &indices);
++ static void prepareExternalTensors(ir::LoweredGraph &lowered_graph,
++ TensorBuilders &tensor_builders);
+ static exec::IExecutor *
+ createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
+ const compiler::CompilerOptions &options,
+diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h
+index f507539..d8ceca9 100644
+--- a/runtime/onert/core/src/compiler/HEScheduler.h
++++ b/runtime/onert/core/src/compiler/HEScheduler.h
+@@ -51,16 +51,12 @@ public:
+ * @param[in] backend_resolver backend resolver
+ */
+ HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options)
+- : _backend_contexts{backend_contexts}, _is_supported{}, _backends_avail_time{}, _ops_eft{},
++ : _is_supported{}, _backends_avail_time{}, _ops_eft{},
+ _op_to_rank{std::make_shared<ir::OperationIndexMap<int64_t>>()},
+ _is_profiling_mode{options.he_profiling_mode},
+ _is_linear_exec{options.executor == "Linear"},
+ _is_parallel_exec{options.executor == "Parallel"}
+ {
+- // Workaround to avoid unused-private-field warning
+- // TODO use _backend_contexts and remove workaround
+- (void)_backend_contexts;
+-
+ for (auto &entry : backend_contexts)
+ {
+ _all_backends.push_back(entry.first);
+@@ -165,7 +161,6 @@ private:
+ // whether it should assign these backends to these nodes:
+ // * It stores false for unsupported nodes
+ // * During rank calculation with enabled profiling mode it stores true for supported nodes
+- const backend::BackendContexts &_backend_contexts;
+ std::unordered_map<const backend::Backend *, std::unordered_map<std::string, bool>> _is_supported;
+ // Finishing and starting time of each backend
+ std::unordered_map<const backend::Backend *, std::map<int64_t, int64_t>> _backends_avail_time;
+@@ -175,8 +170,7 @@ private:
+ std::unique_ptr<compiler::BackendResolver> _backend_resolver;
+ std::unique_ptr<exec::ExecTime> _exec_time;
+ const ir::Graph *_graph{nullptr};
+- std::vector<const backend::Backend *>
+- _all_backends; // TODO Remove this and use _backend_contexts instead
++ std::vector<const backend::Backend *> _all_backends;
+ const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend
+ bool _is_profiling_mode;
+ bool _is_linear_exec;
+diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc
+index 5c545ae..fa5ee27 100644
+--- a/runtime/onert/core/src/compiler/OperationValidator.cc
++++ b/runtime/onert/core/src/compiler/OperationValidator.cc
+@@ -41,6 +41,21 @@ OperationValidator::OperationValidator(const ir::Graph &graph)
+ {
+ }
+
++void OperationValidator::checkUnaryOp(const ir::Operation &node)
++{
++ const auto output_index{node.getOutputs().at(0)};
++ const auto input_index{node.getInputs().at(0)};
++
++ // Check if I/O types match
++ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
++
++ if (_ctx.at(output_index).info().isDynamic())
++ return;
++
++ // Check if I/O shapes match
++ OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
++}
++
+ void OperationValidator::operator()()
+ {
+ // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when
+@@ -53,16 +68,7 @@ void OperationValidator::operator()()
+ [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
+ }
+
+-void OperationValidator::visit(const ir::operation::Abs &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+-
+- const auto input_index{node.getInputs().at(0)};
+-
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::Abs &node) { checkUnaryOp(node); }
+
+ void OperationValidator::visit(const ir::operation::AvgPool2D &node)
+ {
+@@ -292,17 +298,7 @@ void OperationValidator::visit(const ir::operation::RNN &node)
+ num_units == _ctx.at(hidden_state_out_index).shape().dim(1));
+ }
+
+-void OperationValidator::visit(const ir::operation::Round &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- const auto input_index{node.getInputs().at(ir::operation::Round::Input::INPUT)};
+-
+- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
+-
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::Round &node) { checkUnaryOp(node); }
+
+ void OperationValidator::visit(const ir::operation::SpaceToBatchND &node)
+ {
+@@ -393,17 +389,7 @@ void OperationValidator::visit(const ir::operation::EmbeddingLookup &node)
+ }
+ }
+
+-void OperationValidator::visit(const ir::operation::Exp &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
+-
+- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
+-
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::Exp &node) { checkUnaryOp(node); }
+
+ void OperationValidator::visit(const ir::operation::ExpandDims &node)
+ {
+@@ -419,17 +405,7 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node)
+ OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1);
+ }
+
+-void OperationValidator::visit(const ir::operation::Floor &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- const auto input_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
+-
+- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
+-
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::Floor &node) { checkUnaryOp(node); }
+
+ void OperationValidator::visit(const ir::operation::HashtableLookup &node)
+ {
+@@ -789,6 +765,25 @@ void OperationValidator::visit(const ir::operation::LSTM &node)
+ }
+ }
+
++void OperationValidator::visit(const ir::operation::L2Normalization &node)
++{
++ const auto ofm_index{node.getOutputs().at(0)};
++ if (_ctx.at(ofm_index).info().isDynamic())
++ return;
++
++ const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
++
++ auto ifm_shape = _ctx.at(ifm_index).shape();
++ auto ofm_shape = _ctx.at(ofm_index).shape();
++
++ OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank());
++
++ for (auto i = 0; i < ifm_shape.rank(); i++)
++ {
++ OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i));
++ }
++}
++
+ void OperationValidator::visit(const ir::operation::Unpack &node)
+ {
+ const auto num{node.param().num};
+@@ -904,35 +899,11 @@ void OperationValidator::visit(const ir::operation::Split &node)
+ OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0);
+ }
+
+-void OperationValidator::visit(const ir::operation::Cos &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+-
+- const auto input_index{node.getInputs().at(0)};
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
+-
+-void OperationValidator::visit(const ir::operation::Sin &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
++void OperationValidator::visit(const ir::operation::Cos &node) { checkUnaryOp(node); }
+
+- const auto input_index{node.getInputs().at(0)};
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::Sin &node) { checkUnaryOp(node); }
+
+-void OperationValidator::visit(const ir::operation::RSQRT &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+-
+- const auto input_index{node.getInputs().at(0)};
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::RSQRT &node) { checkUnaryOp(node); }
+
+ void OperationValidator::visit(const ir::operation::Shape &node)
+ {
+@@ -972,35 +943,11 @@ void OperationValidator::visit(const ir::operation::While &node)
+ // TODO Add to validate with subgraphs
+ }
+
+-void OperationValidator::visit(const ir::operation::Neg &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
++void OperationValidator::visit(const ir::operation::Neg &node) { checkUnaryOp(node); }
+
+- const auto input_index{node.getInputs().at(0)};
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::Log &node) { checkUnaryOp(node); }
+
+-void OperationValidator::visit(const ir::operation::Log &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+-
+- const auto input_index{node.getInputs().at(0)};
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
+-
+-void OperationValidator::visit(const ir::operation::LogicalNot &node)
+-{
+- const auto output_index{node.getOutputs().at(0)};
+- if (_ctx.at(output_index).info().isDynamic())
+- return;
+-
+- const auto input_index{node.getInputs().at(0)};
+- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+-}
++void OperationValidator::visit(const ir::operation::LogicalNot &node) { checkUnaryOp(node); }
+
+ void OperationValidator::visit(const ir::operation::SquaredDifference &node)
+ {
+@@ -1118,5 +1065,25 @@ void OperationValidator::visit(const ir::operation::LogSoftmax &node)
+
+ OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
+ }
++
++void OperationValidator::visit(const ir::operation::Quantize &node)
++{
++ VERBOSE(Quantize) << "Configure Quantize operation" << std::endl;
++
++ OP_REQUIRES(node.getInputs().size() == 1);
++ OP_REQUIRES(node.getOutputs().size() == 1);
++
++ const auto input_index{node.getInputs().at(0)};
++ const auto output_index{node.getOutputs().at(0)};
++
++ OP_REQUIRES(_ctx.at(input_index).typeInfo().type() == ir::DataType::FLOAT32);
++
++ if (_ctx.at(output_index).info().isDynamic())
++ return;
++
++ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM);
++
++ OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
++}
+ } // namespace compiler
+ } // namespace onert
+diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h
+index 6ceafe8..55a4dd5 100644
+--- a/runtime/onert/core/src/compiler/OperationValidator.h
++++ b/runtime/onert/core/src/compiler/OperationValidator.h
+@@ -70,6 +70,7 @@ public:
+ void visit(const ir::operation::DepthToSpace &node) override;
+ void visit(const ir::operation::Pack &node) override;
+ void visit(const ir::operation::LSTM &node) override;
++ void visit(const ir::operation::L2Normalization &node) override;
+ void visit(const ir::operation::Unpack &node) override;
+ void visit(const ir::operation::Pad &node) override;
+ void visit(const ir::operation::Min &node) override;
+@@ -93,9 +94,10 @@ public:
+ void visit(const ir::operation::Range &node) override;
+ void visit(const ir::operation::MatrixBandPart &node) override;
+ void visit(const ir::operation::LogSoftmax &node) override;
++ void visit(const ir::operation::Quantize &node) override;
+
+ private:
+- void checkReduceOp(const ir::OperandIndex input_index, const ir::OperandIndex output_index);
++ void checkUnaryOp(const ir::Operation &node);
+
+ private:
+ // TODO Remove _ctx field
+diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc
+index 5a58f2e..66de599 100644
+--- a/runtime/onert/core/src/compiler/StaticShapeInference.cc
++++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc
+@@ -497,6 +497,11 @@ void StaticShapeInferer::visit(const ir::operation::Logistic &op)
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::Input::INPUT));
+ }
+
++void StaticShapeInferer::visit(const ir::operation::L2Normalization &op)
++{
++ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT));
++}
++
+ void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op)
+ {
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT));
+diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h
+index 4bb7413..c0a1ebc 100644
+--- a/runtime/onert/core/src/compiler/TensorBuilders.h
++++ b/runtime/onert/core/src/compiler/TensorBuilders.h
+@@ -23,6 +23,7 @@
+ #include "backend/Backend.h"
+ #include "backend/controlflow/Config.h"
+ #include "backend/controlflow/TensorBuilder.h"
++#include "util/logging.h"
+
+ namespace onert
+ {
+@@ -66,6 +67,17 @@ public:
+ return _cf_tensor_builder;
+ }
+
++ std::shared_ptr<backend::ITensor> getITensor(ir::OperandIndex ind)
++ {
++ for (auto &tensor_builder : _tensor_builders)
++ {
++ auto tensor = tensor_builder->tensorAt(ind);
++ if (tensor)
++ return tensor;
++ }
++ return nullptr;
++ }
++
+ private:
+ std::unordered_set<std::shared_ptr<backend::ITensorBuilder>> _tensor_builders;
+ std::shared_ptr<backend::controlflow::TensorBuilder> _cf_tensor_builder;
+diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc
+index 1b82029..28e92ba 100644
+--- a/runtime/onert/core/src/exec/DynamicShapeInference.cc
++++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc
+@@ -442,6 +442,11 @@ void DynamicShapeInferer::visit(const ir::operation::Logistic &op)
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::INPUT));
+ }
+
++void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op)
++{
++ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT));
++}
++
+ void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op)
+ {
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT));
+diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc
+index a7409b9..864ccb3 100644
+--- a/runtime/onert/core/src/exec/ExecutorBase.cc
++++ b/runtime/onert/core/src/exec/ExecutorBase.cc
+@@ -46,7 +46,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr<ir::LoweredGraph> &&lowered_graph,
+ {
+ auto tensor_registry = tensor_builder->tensorRegistry();
+ assert(tensor_registry);
+- tensor = tensor_registry->getManagedITensor(ind);
++ tensor = tensor_registry->getNativeITensor(ind);
+ if (tensor != nullptr)
+ {
+ if (tensor_builder->supportDynamicTensor())
+@@ -71,7 +71,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr<ir::LoweredGraph> &&lowered_graph,
+ {
+ auto tensor_registry = tensor_builder->tensorRegistry();
+ assert(tensor_registry);
+- tensor = tensor_registry->getManagedITensor(ind);
++ tensor = tensor_registry->getNativeITensor(ind);
+ if (tensor != nullptr)
+ {
+ if (tensor_builder->supportDynamicTensor())
+diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc
+index d2e3627..c8dce69 100644
+--- a/runtime/onert/core/src/interp/operations/Pad.cc
++++ b/runtime/onert/core/src/interp/operations/Pad.cc
+@@ -69,8 +69,8 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso
+ const int32_t *pad_ptr = reinterpret_cast<const int32_t *>(pad_buffer);
+ float *output_ptr = reinterpret_cast<float *>(output_buffer);
+
+- nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, output_ptr,
+- nullptr);
++ nnfw::cker::Pad<float>(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape,
++ output_ptr, nullptr);
+ }
+
+ void invokePad(const ExecEnv *env, const ir::Operation &node)
+diff --git a/runtime/onert/core/src/ir/LoweredGraph.cc b/runtime/onert/core/src/ir/LoweredGraph.cc
+index 6e93a23..f138089 100644
+--- a/runtime/onert/core/src/ir/LoweredGraph.cc
++++ b/runtime/onert/core/src/ir/LoweredGraph.cc
+@@ -122,9 +122,6 @@ LoweredGraph::LoweredGraph(const Graph &graph, const compiler::CompilerOptions &
+
+ pass::PermutationInsertionPass pi_pass(*this);
+ pi_pass.run();
+- // Implemented code no longer works.
+- // pass::PermutationEliminationPass pe_pass(*this);
+- // pe_pass.run();
+
+ _op_seqs.dump("merged and sorted operations with permutation", _graph.operations());
+ }
+diff --git a/runtime/onert/core/src/ir/operation/Quantize.cc b/runtime/onert/core/src/ir/operation/Quantize.cc
+new file mode 100644
+index 0000000..0e3d5b6
+--- /dev/null
++++ b/runtime/onert/core/src/ir/operation/Quantize.cc
+@@ -0,0 +1,37 @@
++/*
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++#include "ir/operation/Quantize.h"
++
++#include "ir/OperationVisitor.h"
++
++namespace onert
++{
++namespace ir
++{
++namespace operation
++{
++
++void Quantize::accept(OperationVisitor &v) const { v.visit(*this); }
++
++Quantize::Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
++ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
++{
++}
++
++} // namespace operation
++} // namespace ir
++} // namespace onert
+diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc
+deleted file mode 100644
+index 9e0291e..0000000
+--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc
++++ /dev/null
+@@ -1,195 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-#include "PermutationEliminationPass.h"
+-
+-#include "ir/Operand.h"
+-#include "ir/operand/LowerInfo.h"
+-#include "ir/Graph.h"
+-#include "backend/IConfig.h"
+-#include "util/logging.h"
+-
+-namespace onert
+-{
+-namespace ir
+-{
+-namespace pass
+-{
+-void PermutationEliminationPass::callback(const OperandIndex &inp_index, Operand &object)
+-{
+- if (_graph.getInputs().contains(inp_index))
+- {
+- eliminateInput(inp_index, object);
+- }
+- else if (_graph.getOutputs().contains(inp_index))
+- {
+- eliminateOutput(inp_index, object);
+- }
+-}
+-
+-void PermutationEliminationPass::eliminateInput(const OperandIndex &inp_index, Operand &object)
+-{
+- auto &model_inputs = _graph.getInputs();
+-
+- // get uses of the model's given input
+- auto uses = object.getUses();
+-
+- // input must be used just by permutation
+- if (uses.size() != 1)
+- {
+- return;
+- }
+-
+- for (auto input_use : uses)
+- {
+- auto &perm_operation = _graph.operations().at(input_use);
+- auto perm_inputs = perm_operation.getInputs();
+-
+- auto perm_outputs = perm_operation.getOutputs();
+-
+- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, true))
+- {
+- return;
+- }
+-
+- assert(perm_inputs.at(0) == inp_index);
+-
+- VERBOSE(PermutationEliminationPass::EliminateInput) << "remove NHWC_TO_NCHW permutation\n";
+-
+- // set model's new input, which was output of permutation
+- model_inputs.replace(inp_index, perm_outputs.at(0));
+-
+- // remove model's input, which is also input of permutation
+- _graph.removeOperand(inp_index);
+-
+- // remove permutation operation
+- assert(_lowered_graph.op_seqs().containsOperation(input_use));
+- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(input_use);
+- _lowered_graph.op_seqs().remove(op_seq_idx);
+- _graph.operations().remove(input_use);
+-
+- VERBOSE(PermutationEliminationPass::EliminateInput)
+- << inp_index.value() << " is model's input and is removed. New input is "
+- << perm_outputs.at(0).value() << "\n"
+- << input_use.value() << " is removed permutation operation\n";
+- }
+-}
+-
+-void PermutationEliminationPass::eliminateOutput(const OperandIndex &out_index, Operand &object)
+-{
+- auto &model_outputs = _graph.getOutputs();
+-
+- // get defs of the model's given output
+- auto defs = object.getDef();
+-
+- // output must use just permutation
+- if (defs.size() != 1)
+- {
+- return;
+- }
+-
+- for (auto output_def : defs)
+- {
+- auto &perm_operation = _graph.operations().at(output_def);
+- auto perm_outputs = perm_operation.getOutputs();
+-
+- auto perm_inputs = perm_operation.getInputs();
+- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, false))
+- {
+- return;
+- }
+-
+- assert(perm_outputs.at(0) == out_index);
+-
+- VERBOSE(PermutationEliminationPass::EliminateOutput) << "remove NCHW_TO_NHWC permutation\n";
+-
+- // Update operations' output that is used by permute operand
+- for (auto perm_input_index : perm_inputs)
+- {
+- auto &perm_input_operand = _graph.operands().at(perm_input_index);
+- perm_input_operand.removeUse(output_def);
+- }
+-
+- // set model's new output, which was input of permutation
+- model_outputs.replace(out_index, perm_inputs.at(0));
+-
+- // remove model's output, which is also output of permutation
+- _graph.removeOperand(out_index);
+-
+- // remove permutation operation
+- assert(_lowered_graph.op_seqs().containsOperation(output_def));
+- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(output_def);
+- _lowered_graph.op_seqs().remove(op_seq_idx);
+- _graph.operations().remove(output_def);
+-
+- VERBOSE(PermutationEliminationPass::EliminateOutput)
+- << out_index.value() << " is model's output and is removed. New output is "
+- << perm_inputs.at(0).value() << "\n"
+- << output_def.value() << " is removed permutation operation\n";
+- }
+-}
+-
+-bool PermutationEliminationPass::isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes,
+- const OperandIndexSequence &out_indexes,
+- bool is_for_model_input)
+-{
+- auto input_def_factors = _lowered_graph.getLowerInfo(inp_indexes.at(0))->def_factors();
+- auto output_def_factors = _lowered_graph.getLowerInfo(out_indexes.at(0))->def_factors();
+-
+- auto input_layout = input_def_factors.getOnlyElement().layout();
+- auto output_layout = output_def_factors.getOnlyElement().layout();
+-
+- if (input_def_factors.size() != 1 || output_def_factors.size() != 1)
+- {
+- return false;
+- }
+-
+- // all operands' factor must be the same
+- for (auto index : inp_indexes)
+- {
+- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors();
+- if (op_factor_set.size() != 1 ||
+- input_layout != _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout())
+- {
+- return false;
+- }
+- }
+- // all operands' factor must be the same
+- for (auto index : out_indexes)
+- {
+- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors();
+- if (op_factor_set.size() != 1 ||
+- output_layout !=
+- _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout())
+- {
+- return false;
+- }
+- }
+-
+- if (is_for_model_input)
+- {
+- // check if this is NHWC_TO_NCHW permutation: must have single input, which is model's input
+- return (inp_indexes.size() == 1 && input_layout == Layout::NHWC &&
+- output_layout == Layout::NCHW);
+- }
+-
+- // check if this is NCHW_TO_NHWC permutation: must have single output, which is model's output
+- return (out_indexes.size() == 1 && input_layout == Layout::NCHW && output_layout == Layout::NHWC);
+-}
+-
+-} // namespace pass
+-} // namespace ir
+-} // namespace onert
+diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h
+deleted file mode 100644
+index 1c84300..0000000
+--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h
++++ /dev/null
+@@ -1,86 +0,0 @@
+-/*
+- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+- *
+- * Licensed under the Apache License, Version 2.0 (the "License");
+- * you may not use this file except in compliance with the License.
+- * You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-#ifndef __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
+-#define __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
+-
+-#include "LoweredOperandPass.h"
+-#include "ir/Operand.h"
+-#include "ir/OperandIndexSequence.h"
+-
+-namespace onert
+-{
+-namespace ir
+-{
+-namespace pass
+-{
+-
+-class PermutationEliminationPass : public LoweredOperandPass
+-{
+-public:
+- using LoweredOperandPass::LoweredOperandPass;
+-
+-public:
+- std::string id() override { return "PermutationEliminationPass"; }
+-
+- void callback(const OperandIndex &index, Operand &object) override;
+-
+-private:
+- /**
+- * @brief Remove Permute operation that permutates input
+- *
+- * Note: This function aslo removes model's input and
+- * sets output of permutation as model's new input
+- *
+- * @param inp_index is the target operand index for the elimination
+- * @param object is the target operand object for the elimination
+- *
+- * @return
+- */
+- void eliminateInput(const OperandIndex &inp_index, Operand &object);
+-
+- /**
+- * @brief Remove Permute operation that permutates output of a model
+- *
+- * Note: This function aslo removes model's output and
+- * sets input of permutation as model's new output
+- *
+- * @param out_index is the target operand index for the elimination
+- * @param object is the target operand object for the elimination
+- *
+- * @return
+- */
+- void eliminateOutput(const OperandIndex &out_index, Operand &object);
+-
+- /**
+- * @brief Determine if passed operands are permute layer's input and output, that must be
+- * eliminated
+- *
+- * @param inp_index indexes of the input operand to operation
+- * @param out_index indexes of the output operand to operation
+- * @param is_for_model_input checking for model's input or output
+- *
+- * @return if it is permutation layer
+- */
+- bool isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes,
+- const OperandIndexSequence &out_indexes, bool is_for_model_input);
+-};
+-
+-} // namespace pass
+-} // namespace ir
+-} // namespace onert
+-
+-#endif // __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
+diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
+index 7c3da52..75efdd8 100644
+--- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
++++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
+@@ -62,27 +62,26 @@ void PermutationInsertionPass::callback(const OperandIndex &index, Operand &obje
+ auto insert_set = operand_li->use_factors() - operand_li->def_factors();
+ auto def_factor = operand_li->def_factors().getOnlyElement();
+
+- auto compatible_backends = [](auto /* backend1 */, auto /* backend2 */) {
+- // TODO If other issues for Permute elimination are resolved, enable this
+- return false;
+- /*
++ auto compatible_backends = [](auto backend1, auto backend2) {
+ // TODO This is a workaround for not inserting Permute between cpu and controlflow.
+ // To be general, we need another way of checking they are compatible.
+ const auto cf = backend::controlflow::Config::ID;
+ const auto cpu = "cpu";
+ const auto id1 = backend1->config()->id();
+ const auto id2 = backend2->config()->id();
+- return (id1 == cpu && id2 == cf) // Allows no-Permute for Model inputs
+- || (id1 == cf && id2 == cpu); // Allows no-Permute for Model outputs
+- */
++    // NOTE This is to skip Permute insertion for model inputs (controlflow -> cpu), but not for
++    // outputs. This function currently assumes that backend1 is the Def backend and backend2 is
++    // the Use backend; however, this is going to be fixed soon.
++ // TODO make both ways work
++ return (id1 == cpu && id2 == cf);
+ };
+
+ for (auto factor : insert_set)
+ {
++    // Check for exceptional cases in which Permute ops are not inserted
+ if (factor.layout() == def_factor.layout() &&
+ compatible_backends(factor.backend(), def_factor.backend()))
+ {
+- // For this factor we can just reuse existing operand - Permute is not added.
+ VERBOSE(PermutationInsertionPass) << "Permutation Insertion is skipped for operand "
+ << index << " / as the tensor is compatible with backend "
+ << factor.backend()->config()->id() << std::endl;
+diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h
+index f5687ad..f763346 100644
+--- a/runtime/onert/frontend/base_loader/include/base_loader.h
++++ b/runtime/onert/frontend/base_loader/include/base_loader.h
+@@ -171,6 +171,8 @@ protected:
+ void loadBroadcastTo(const Operator *op, ir::Graph &subg);
+ void loadFusedBatchNorm(const Operator *op, ir::Graph &subg);
+ void loadLogSoftmax(const Operator *op, ir::Graph &subg);
++ void loadQuantize(const Operator *op, ir::Graph &subg);
++ void loadSpaceToDepth(const Operator *op, ir::Graph &subg);
+
+ protected:
+ // Base address for mapped region for loading (if needed)
+@@ -1123,6 +1125,22 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBroadcastTo(const Operator *o
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::BroadcastTo(inputs, outputs));
+ subg.addOperation(std::move(new_op));
+ }
++template <typename LoaderDomain, typename SpecificLoader>
++void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToDepth(const Operator *op, ir::Graph &subg)
++{
++ ir::OperandIndexSequence inputs;
++ ir::OperandIndexSequence outputs;
++ ir::operation::SpaceToDepth::Param param;
++
++ const auto *options = op->builtin_options_as_SpaceToDepthOptions();
++
++ param.block_size = options->block_size();
++
++ loadOperationIO(op, inputs, outputs);
++
++ std::unique_ptr<ir::Operation> new_op(new ir::operation::SpaceToDepth(inputs, outputs, param));
++ subg.addOperation(std::move(new_op));
++}
+
+ template <typename LoaderDomain, typename SpecificLoader>
+ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir::Graph &subg)
+@@ -1743,6 +1761,18 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadLogSoftmax(const Operator *op
+ }
+
+ template <typename LoaderDomain, typename SpecificLoader>
++void BaseLoader<LoaderDomain, SpecificLoader>::loadQuantize(const Operator *op, ir::Graph &subg)
++{
++ ir::OperandIndexSequence inputs;
++ ir::OperandIndexSequence outputs;
++
++ loadOperationIO(op, inputs, outputs);
++
++ std::unique_ptr<ir::Operation> new_op(new ir::operation::Quantize(inputs, outputs));
++ subg.addOperation(std::move(new_op));
++}
++
++template <typename LoaderDomain, typename SpecificLoader>
+ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, ir::Graph &subg)
+ {
+ const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code();
+@@ -1959,6 +1989,12 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
+ case BuiltinOperator::BuiltinOperator_LOG_SOFTMAX:
+ loadLogSoftmax(op, subg);
+ return;
++ case BuiltinOperator::BuiltinOperator_QUANTIZE:
++ loadQuantize(op, subg);
++ return;
++ case BuiltinOperator::BuiltinOperator_SPACE_TO_DEPTH:
++ loadSpaceToDepth(op, subg);
++ return;
+ default:
+ throw std::runtime_error(
+ std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op)));
+diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
+index 94791f8..00ffcb6 100644
+--- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
++++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
+@@ -106,6 +106,33 @@ getReduceGenerator(const onert::ir::operation::Reduce::ReduceType reduce_type)
+ };
+ }
+
++template <typename T>
++Operation *CreateSimpleUnaryOp(const OperationFactory::Param &init_param, Operands &)
++{
++ assert(init_param.input_count == 1 && init_param.output_count == 1);
++
++ OperandIndexSequence outputs{init_param.outputs[0]};
++
++ // Each input should be interpreted as follows:
++ //
++ // 0 -> Input Tensor Index
++ OperandIndexSequence inputs{init_param.inputs[0]};
++
++ return new T{inputs, outputs};
++}
++
++// A generator function for binary ops with no params
++template <typename T>
++Operation *createSimpleBinaryOp(const OperationFactory::Param &init_param, Operands &)
++{
++ assert(init_param.input_count == 2 && init_param.output_count == 1);
++
++ OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
++ OperandIndexSequence outputs{init_param.outputs[0]};
++
++ return new T{inputs, outputs};
++}
++
+ } // namespace
+
+ OperationFactory &OperationFactory::get()
+@@ -116,20 +143,10 @@ OperationFactory &OperationFactory::get()
+
+ OperationFactory::OperationFactory()
+ {
+- _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = [](const OperationFactory::Param &init_param,
+- Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- // 1 -> Block size Index
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+-
+- return new operation::BatchToSpaceND{inputs, outputs};
+- };
++ // Each input should be interpreted as follows:
++ // 0 -> Input Tensor Index
++ // 1 -> Block size Index
++ _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = createSimpleBinaryOp<operation::BatchToSpaceND>;
+
+ _map[ANEURALNETWORKS_DEPTHWISE_CONV_2D] = [](const OperationFactory::Param &init_param,
+ Operands &operands) {
+@@ -724,44 +741,11 @@ OperationFactory::OperationFactory()
+ return new operation::Squeeze{inputs, outputs, param};
+ };
+
+- _map[ANEURALNETWORKS_TANH] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::Tanh{inputs, outputs};
+- };
+-
+- _map[ANEURALNETWORKS_LOG] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
++ _map[ANEURALNETWORKS_TANH] = CreateSimpleUnaryOp<operation::Tanh>;
+
+- OperandIndexSequence outputs{init_param.outputs[0]};
++ _map[ANEURALNETWORKS_LOG] = CreateSimpleUnaryOp<operation::Log>;
+
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::Log{inputs, outputs};
+- };
+-
+- _map[ANEURALNETWORKS_LOGISTIC] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::Logistic{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_LOGISTIC] = CreateSimpleUnaryOp<operation::Logistic>;
+
+ _map[ANEURALNETWORKS_DIV] = [](const OperationFactory::Param &init_param, Operands &operands) {
+ assert(init_param.input_count == 3 && init_param.output_count == 1);
+@@ -784,36 +768,16 @@ OperationFactory::OperationFactory()
+ return new operation::Div{inputs, outputs, param};
+ };
+
+- _map[ANEURALNETWORKS_EXP] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::Exp{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_EXP] = CreateSimpleUnaryOp<operation::Exp>;
+
+ // ANEURALNETWORKS_EXP_EX is deprecated
+ // TODO Remove ANEURALNETWORKS_EXP_EX
+ _map[ANEURALNETWORKS_EXP_EX] = _map[ANEURALNETWORKS_EXP];
+
+- _map[ANEURALNETWORKS_EXPAND_DIMS] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- // 1 -> Axis Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+-
+- return new operation::ExpandDims{inputs, outputs};
+- };
++ // Each input should be interpreted as follows:
++ // 0 -> Input Tensor Index
++ // 1 -> Axis Tensor Index
++ _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp<operation::ExpandDims>;
+
+ _map[ANEURALNETWORKS_GREATER] = [](const OperationFactory::Param &init_param, Operands &) {
+ assert(init_param.input_count == 2 && init_param.output_count == 1);
+@@ -982,19 +946,7 @@ OperationFactory::OperationFactory()
+ return new operation::Comparison{inputs, outputs, param};
+ };
+
+- _map[ANEURALNETWORKS_LOGICAL_AND] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> input0 Tensor Index
+- // 1 -> input1 Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+-
+- return new operation::LogicalAnd{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_LOGICAL_AND] = createSimpleBinaryOp<operation::LogicalAnd>;
+
+ // ANEURALNETWORKS_LOGICAL_AND_EX is deprecated
+ // TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX
+@@ -1018,18 +970,7 @@ OperationFactory::OperationFactory()
+ return new operation::LogicalAnd{inputs, outputs};
+ };
+
+- _map[ANEURALNETWORKS_RSQRT] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::RSQRT{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_RSQRT] = CreateSimpleUnaryOp<operation::RSQRT>;
+
+ _map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) {
+ assert(init_param.input_count == 3 && init_param.output_count == 1);
+@@ -1065,18 +1006,7 @@ OperationFactory::OperationFactory()
+ // TODO Remove ANEURALNETWORKS_RSQRT_EX
+ _map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT];
+
+- _map[ANEURALNETWORKS_RELU] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::ReLU{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_RELU] = CreateSimpleUnaryOp<operation::ReLU>;
+
+ _map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param,
+ Operands &operands) {
+@@ -1098,31 +1028,9 @@ OperationFactory::OperationFactory()
+ return new operation::ResizeBilinear{inputs, outputs, param};
+ };
+
+- _map[ANEURALNETWORKS_RELU1] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
++ _map[ANEURALNETWORKS_RELU1] = CreateSimpleUnaryOp<operation::ReLU1>;
+
+- return new operation::ReLU1{inputs, outputs};
+- };
+-
+- _map[ANEURALNETWORKS_RELU6] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::ReLU6{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_RELU6] = CreateSimpleUnaryOp<operation::ReLU6>;
+
+ _map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) {
+ assert(init_param.input_count == 2 && init_param.output_count == 1);
+@@ -1438,18 +1346,7 @@ OperationFactory::OperationFactory()
+ return new operation::LogicalOr{inputs, outputs};
+ };
+
+- _map[ANEURALNETWORKS_LOGICAL_NOT] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::LogicalNot{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_LOGICAL_NOT] = CreateSimpleUnaryOp<operation::LogicalNot>;
+
+ // ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated
+ // TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX
+@@ -1649,35 +1546,13 @@ OperationFactory::OperationFactory()
+ // TODO Remove ANEURALNETWORKS_GATHER_EX
+ _map[ANEURALNETWORKS_GATHER_EX] = _map[ANEURALNETWORKS_GATHER];
+
+- _map[ANEURALNETWORKS_NEG] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::Neg{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_NEG] = CreateSimpleUnaryOp<operation::Neg>;
+
+ // ANEURALNETWORKS_NEG_EX is deprecated
+ // TODO Remove ANEURALNETWORKS_NEG_EX
+ _map[ANEURALNETWORKS_NEG_EX] = _map[ANEURALNETWORKS_NEG];
+
+- _map[ANEURALNETWORKS_ABS] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::Abs{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_ABS] = CreateSimpleUnaryOp<operation::Abs>;
+
+ // ANEURALNETWORKS_ABS_EX is deprecated
+ // TODO Remove ANEURALNETWORKS_ABS_EX
+@@ -1704,18 +1579,7 @@ OperationFactory::OperationFactory()
+ // TODO Remove ANEURALNETWORKS_ARGMAX_EX
+ _map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX];
+
+- _map[ANEURALNETWORKS_DEQUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 1 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- OperandIndexSequence inputs{init_param.inputs[0]};
+-
+- return new operation::Dequantize{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_DEQUANTIZE] = CreateSimpleUnaryOp<operation::Dequantize>;
+
+ _map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) {
+ assert(init_param.input_count == 3 && init_param.output_count == 1);
+@@ -1841,31 +1705,24 @@ OperationFactory::OperationFactory()
+ };
+
+ _map[ANEURALNETWORKS_PAD] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count >= 1);
++ assert(init_param.input_count >= 2 && init_param.input_count <= 3 &&
++ init_param.output_count >= 1);
+
+ OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
++ if (init_param.input_count == 3)
++ {
++ inputs.append(OperandIndex{init_param.inputs[2]});
++ }
+ OperandIndexSequence outputs{init_param.outputs[0]};
+
+ return new operation::Pad{inputs, outputs};
+ };
+
+- _map[ANEURALNETWORKS_MINIMUM] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
++ _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD];
+
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+- OperandIndexSequence outputs{init_param.outputs[0]};
++ _map[ANEURALNETWORKS_MINIMUM] = createSimpleBinaryOp<operation::Min>;
+
+- return new operation::Min{inputs, outputs};
+- };
+-
+- _map[ANEURALNETWORKS_MAXIMUM] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
+-
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- return new operation::Max{inputs, outputs};
+- };
++ _map[ANEURALNETWORKS_MAXIMUM] = createSimpleBinaryOp<operation::Max>;
+
+ _map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param,
+ Operands &operands) {
+@@ -1948,34 +1805,15 @@ OperationFactory::OperationFactory()
+ return new operation::Range{inputs, outputs};
+ };
+
+- _map[ANEURALNETWORKS_POW] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
++ // Each input should be interpreted as follows:
++ // 0 -> LHS Tensor Index
++ // 1 -> RHS Tensor Index
++ _map[ANEURALNETWORKS_POW] = createSimpleBinaryOp<operation::Pow>;
+
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> LHS Tensor Index
+- // 1 -> RHS Tensor Index
+-
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+-
+- return new operation::Pow{inputs, outputs};
+- };
+-
+- _map[ANEURALNETWORKS_FILL_EX] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> A tensor, specifying the input.
+- // 1 -> A 1-D tensor, specifying the value
+-
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- return new operation::Fill{inputs, outputs};
+- };
++ // Each input should be interpreted as follows:
++ // 0 -> A tensor, specifying the input.
++ // 1 -> A 1-D tensor, specifying the value
++ _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp<operation::Fill>;
+
+ _map[ANEURALNETWORKS_ZEROS_LIKE_EX] = [](const OperationFactory::Param &init_param, Operands &) {
+ assert(init_param.input_count == 1 && init_param.output_count == 1);
+@@ -1989,20 +1827,10 @@ OperationFactory::OperationFactory()
+ return new operation::ZerosLike{inputs, outputs};
+ };
+
+- _map[ANEURALNETWORKS_TILE] = [](const OperationFactory::Param &init_param, Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- // 1 -> Multiple Tensor Index
+-
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+-
+- return new operation::Tile{inputs, outputs};
+- };
++ // Each input should be interpreted as follows:
++ // 0 -> Input Tensor Index
++ // 1 -> Multiple Tensor Index
++ _map[ANEURALNETWORKS_TILE] = createSimpleBinaryOp<operation::Tile>;
+
+ _map[ANEURALNETWORKS_MATRIX_BAND_PART_EX] = [](const OperationFactory::Param &init_param,
+ Operands &) {
+@@ -2064,21 +1892,9 @@ OperationFactory::OperationFactory()
+ return new operation::Einsum{inputs, outputs, param};
+ };
+
+- _map[ANEURALNETWORKS_BROADCAST_TO_EX] = [](const OperationFactory::Param &init_param,
+- Operands &) {
+- assert(init_param.input_count == 2 && init_param.output_count == 1);
+-
+- OperandIndexSequence outputs{init_param.outputs[0]};
+-
+- // Each input should be interpreted as follows:
+- //
+- // 0 -> Input Tensor Index
+- // 1 -> int32, int64, An 1-D int tensor Index
+-
+- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+-
+- return new operation::BroadcastTo{inputs, outputs};
+- };
++ // 0 -> Input Tensor Index
++ // 1 -> int32, int64, A 1-D int tensor Index
++ _map[ANEURALNETWORKS_BROADCAST_TO_EX] = createSimpleBinaryOp<operation::BroadcastTo>;
+
+ _map[ANEURALNETWORKS_FUSED_BATCH_NORM_V3_EX] = [](const OperationFactory::Param &init_param,
+ Operands &operands) {
+@@ -2133,6 +1949,15 @@ OperationFactory::OperationFactory()
+
+ return new operation::LogSoftmax{inputs, outputs, param};
+ };
++
++ _map[ANEURALNETWORKS_QUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) {
++ assert(init_param.input_count == 1 && init_param.output_count == 1);
++
++ OperandIndexSequence inputs{init_param.inputs[0]};
++ OperandIndexSequence outputs{init_param.outputs[0]};
++
++ return new operation::Quantize{inputs, outputs};
++ };
+ }
+
+ Operation *OperationFactory::create(ANeuralNetworksOperationType type,
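
Editor's note on the hunk above: the refactor replaces per-op lambdas with shared CreateSimpleUnaryOp / createSimpleBinaryOp helpers, so registering a one- or two-input operation becomes a single assignment. Below is a minimal sketch of that registration pattern, written in Python for brevity; the helper names and toy classes are invented for illustration and are not the onert API.

class Op:
    """Toy stand-in for an IR operation holding operand indexes."""
    def __init__(self, inputs, outputs):
        self.inputs, self.outputs = list(inputs), list(outputs)

class Neg(Op): pass
class Abs(Op): pass
class Pow(Op): pass

def make_unary_factory(op_cls):
    # One input index and one output index are expected.
    def factory(input_indexes, output_indexes):
        assert len(input_indexes) == 1 and len(output_indexes) == 1
        return op_cls(input_indexes, output_indexes)
    return factory

def make_binary_factory(op_cls):
    # Two input indexes (LHS, RHS) and one output index are expected.
    def factory(input_indexes, output_indexes):
        assert len(input_indexes) == 2 and len(output_indexes) == 1
        return op_cls(input_indexes, output_indexes)
    return factory

# Registration shrinks to one line per operation, mirroring the C++ change.
op_map = {
    "NEG": make_unary_factory(Neg),
    "ABS": make_unary_factory(Abs),
    "POW": make_binary_factory(Pow),
}
print(type(op_map["POW"]([0, 1], [2])).__name__)  # Pow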
+diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc
+index cc04347..0fcf372 100644
+--- a/runtime/onert/test/core/exec/ExecInstance.cc
++++ b/runtime/onert/test/core/exec/ExecInstance.cc
+@@ -73,9 +73,8 @@ public:
+ // Compile
+ auto subgs = std::make_shared<onert::ir::Subgraphs>();
+ subgs->push(onert::ir::SubgraphIndex{0}, graph);
+- auto compiler = new onert::compiler::Compiler{subgs};
+- executors = compiler->compile();
+- delete compiler;
++ onert::compiler::Compiler compiler{subgs};
++ executors = compiler.compile();
+ }
+
+ public:
+@@ -98,19 +97,17 @@ TEST(ExecInstance, simple)
+ float output_buffer[4] = {};
+ const float output_expected[4] = {5, -2, 0, -1};
+
+- auto execution = new onert::exec::Execution(executors);
++ onert::exec::Execution execution{executors};
+
+- execution->setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
+- execution->setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
+- execution->setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
+- execution->execute();
++ execution.setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
++ execution.setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
++ execution.setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
++ execution.execute();
+
+ for (auto i = 0; i < 4; i++)
+ {
+ EXPECT_EQ(output_buffer[i], output_expected[i]);
+ }
+-
+- delete execution;
+ }
+
+ TEST(ExecInstance, twoCompile)
+@@ -118,7 +115,7 @@ TEST(ExecInstance, twoCompile)
+ auto mockup = CompiledMockUpModel();
+ auto graph = mockup.graph;
+ auto executors1 = mockup.executors;
+- auto execution1 = new onert::exec::Execution(executors1);
++ onert::exec::Execution execution1{executors1};
+
+ auto input1 = IOIndex{0};
+ auto input2 = IOIndex{1};
+@@ -129,38 +126,34 @@ TEST(ExecInstance, twoCompile)
+ float exe1_output_buffer[4] = {};
+ const float exe1_output_expected[4] = {5, -2, 0, -1};
+
+- execution1->setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
+- execution1->setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
+- execution1->setOutput(output, reinterpret_cast<void *>(exe1_output_buffer), 16);
++ execution1.setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
++ execution1.setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
++ execution1.setOutput(output, reinterpret_cast<void *>(exe1_output_buffer), 16);
+
+ // Make new executor: compile again
+ auto subgs = std::make_shared<onert::ir::Subgraphs>();
+ subgs->push(onert::ir::SubgraphIndex{0}, graph);
+- auto compiler = new onert::compiler::Compiler{subgs};
+- std::shared_ptr<onert::exec::ExecutorMap> executors2 = compiler->compile();
+- auto execution2 = new onert::exec::Execution(executors2);
++ onert::compiler::Compiler compiler{subgs};
++ std::shared_ptr<onert::exec::ExecutorMap> executors2 = compiler.compile();
++ onert::exec::Execution execution2{executors2};
+
+ const float exe2_input1_buffer[4] = {2, 1, -2, 0};
+ const float exe2_input2_buffer[4] = {-3, 3, 1, 2};
+ float exe2_output_buffer[4] = {};
+ const float exe2_output_expected[4] = {2, 5, -2, 7};
+
+- execution2->setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
+- execution2->setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
+- execution2->setOutput(output, reinterpret_cast<void *>(exe2_output_buffer), 16);
++ execution2.setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
++ execution2.setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
++ execution2.setOutput(output, reinterpret_cast<void *>(exe2_output_buffer), 16);
+
+- execution1->execute();
+- execution2->execute();
++ execution1.execute();
++ execution2.execute();
+
+ for (auto i = 0; i < 4; i++)
+ {
+ EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]);
+ EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]);
+ }
+-
+- delete compiler;
+- delete execution1;
+- delete execution2;
+ }
+
+ // Support two initialized execution instance then ordered execution
+@@ -178,32 +171,29 @@ TEST(ExecInstance, twoExecution)
+ const float exe1_output_expected[4] = {5, -2, 0, -1};
+ const float exe2_output_expected[4] = {2, 5, -2, 7};
+
+- auto execution1 = new onert::exec::Execution(executors);
+- execution1->setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
+- execution1->setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
+- execution1->setOutput(output1, reinterpret_cast<void *>(exe1_output_buffer), 16);
++ onert::exec::Execution execution1{executors};
++ execution1.setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
++ execution1.setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
++ execution1.setOutput(output1, reinterpret_cast<void *>(exe1_output_buffer), 16);
+
+ const float exe2_input1_buffer[4] = {2, 1, -2, 0};
+ const float exe2_input2_buffer[4] = {-3, 3, 1, 2};
+ float exe2_output_buffer[4] = {};
+
+ // Make new execution
+- auto execution2 = new onert::exec::Execution(executors);
+- execution2->setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
+- execution2->setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
+- execution2->setOutput(output1, reinterpret_cast<void *>(exe2_output_buffer), 16);
++ onert::exec::Execution execution2{executors};
++ execution2.setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
++ execution2.setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
++ execution2.setOutput(output1, reinterpret_cast<void *>(exe2_output_buffer), 16);
+
+- execution1->execute();
+- execution2->execute();
++ execution1.execute();
++ execution2.execute();
+
+ for (auto i = 0; i < 4; i++)
+ {
+ EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]);
+ EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]);
+ }
+-
+- delete execution1;
+- delete execution2;
+ }
+
+ class Inference
+@@ -222,14 +212,12 @@ public:
+ auto input2 = IOIndex{1};
+ auto output1 = IOIndex{0};
+
+- auto execution = new onert::exec::Execution(_executors);
+- execution->setInput(input1, reinterpret_cast<const void *>(_input1), 16);
+- execution->setInput(input2, reinterpret_cast<const void *>(_input2), 16);
+- execution->setOutput(output1, reinterpret_cast<void *>(_output), 16);
++ onert::exec::Execution execution{_executors};
++ execution.setInput(input1, reinterpret_cast<const void *>(_input1), 16);
++ execution.setInput(input2, reinterpret_cast<const void *>(_input2), 16);
++ execution.setOutput(output1, reinterpret_cast<void *>(_output), 16);
+
+- execution->execute();
+-
+- delete execution;
++ execution.execute();
+ }
+
+ private:
+@@ -288,20 +276,18 @@ TEST(ExecInstance, async)
+ float output_buffer[4] = {};
+ const float output_expected[4] = {5, -2, 0, -1};
+
+- auto execution = new onert::exec::Execution(executors);
++ onert::exec::Execution execution{executors};
+
+- execution->setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
+- execution->setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
+- execution->setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
+- execution->startExecute();
+- execution->waitFinish();
++ execution.setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
++ execution.setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
++ execution.setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
++ execution.startExecute();
++ execution.waitFinish();
+
+ for (auto i = 0; i < 4; i++)
+ {
+ EXPECT_EQ(output_buffer[i], output_expected[i]);
+ }
+-
+- delete execution;
+ }
+
+ } // namespace
+diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
+index e50b941..005f61c 100644
+--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
+@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8
+ GeneratedTests.cast_float16_to_quant8_overflow
+ GeneratedTests.cast_float32_to_float16
+ GeneratedTests.cast_float32_to_float16_relaxed
++GeneratedTests.cast_float32_to_int32_nnfw
+ GeneratedTests.cast_int32_to_float16
+-GeneratedTests.cast_int32_to_quant8_overflow
+ GeneratedTests.cast_quant8_to_float16
+ GeneratedTests.concat_dynamic_nnfw
+ GeneratedTests.conv_dynamic_nnfw
+@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7
+ GeneratedTests.gather_float16_8
+ GeneratedTests.greater_dynamic_float_nnfw
+ GeneratedTests.greater_equal_dynamic_float_nnfw
++GeneratedTests.l2_normalization_quant8_nnfw
+ GeneratedTests.less_dynamic_float_nnfw
+ GeneratedTests.less_equal_dynamic_float_nnfw
+ GeneratedTests.log_4D_float_nnfw
+@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
+ GeneratedTests.one_hot_ex_dynamic_nnfw
+ GeneratedTests.pack_ex_dynamic_nnfw
+ GeneratedTests.pad_dynamic_nnfw
++GeneratedTests.pad_v2_1_float
++GeneratedTests.pad_v2_1_quant8
++GeneratedTests.pad_v2_all_dims
++GeneratedTests.pad_v2_all_dims_quant8
++GeneratedTests.pad_v2_low_rank
++GeneratedTests.pad_v2_low_rank_quant8
+ GeneratedTests.pow_2D_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw_2
+ GeneratedTests.pow_broadcast_float_nnfw_3
+ GeneratedTests.pow_dynamic_nnfw
++GeneratedTests.quantize_quant8
++GeneratedTests.quantize_quant8_2
++GeneratedTests.quantize_quant8_3
++GeneratedTests.quantize_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.range_ex_float_1
+ GeneratedTests.range_ex_float_1_all_constant_inputs
+ GeneratedTests.range_ex_float_1_dynamic_nnfw
+diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
+index c9edee5..d987bf1 100644
+--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
+@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8
+ GeneratedTests.cast_float16_to_quant8_overflow
+ GeneratedTests.cast_float32_to_float16
+ GeneratedTests.cast_float32_to_float16_relaxed
+-GeneratedTests.cast_float32_to_quant8_overflow
+-GeneratedTests.cast_float32_to_quant8_overflow_relaxed
+ GeneratedTests.cast_int32_to_float16
+-GeneratedTests.cast_int32_to_quant8_overflow
+ GeneratedTests.cast_quant8_to_float16
+ GeneratedTests.concat_dynamic_nnfw
+ GeneratedTests.conv_dynamic_nnfw
+@@ -73,6 +70,7 @@ GeneratedTests.gather_float16_8
+ GeneratedTests.greater_dynamic_float_nnfw
+ GeneratedTests.greater_equal_boolean
+ GeneratedTests.greater_equal_dynamic_float_nnfw
++GeneratedTests.l2_normalization_quant8_nnfw
+ GeneratedTests.less_boolean
+ GeneratedTests.less_dynamic_float_nnfw
+ GeneratedTests.less_equal_dynamic_float_nnfw
+@@ -112,11 +110,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
+ GeneratedTests.one_hot_ex_dynamic_nnfw
+ GeneratedTests.pack_ex_dynamic_nnfw
+ GeneratedTests.pad_dynamic_nnfw
++GeneratedTests.pad_v2_1_float
++GeneratedTests.pad_v2_1_quant8
++GeneratedTests.pad_v2_all_dims
++GeneratedTests.pad_v2_all_dims_quant8
++GeneratedTests.pad_v2_low_rank
++GeneratedTests.pad_v2_low_rank_quant8
+ GeneratedTests.pow_2D_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw_2
+ GeneratedTests.pow_broadcast_float_nnfw_3
+ GeneratedTests.pow_dynamic_nnfw
++GeneratedTests.quantize_quant8
++GeneratedTests.quantize_quant8_2
++GeneratedTests.quantize_quant8_3
++GeneratedTests.quantize_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.range_ex_float_1
+ GeneratedTests.range_ex_float_1_all_constant_inputs
+ GeneratedTests.range_ex_float_1_dynamic_nnfw
+diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
+index 3cce4f3..bc0ae0f 100644
+--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
+@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8
+ GeneratedTests.hashtable_lookup_float
+ GeneratedTests.hashtable_lookup_float_4D_nnfw
+ GeneratedTests.hashtable_lookup_quant8
+-GeneratedTests.l2_normalization
+-GeneratedTests.l2_normalization_2
+-GeneratedTests.l2_normalization_large
+ GeneratedTests.l2_pool_float
+ GeneratedTests.l2_pool_float_2
+ GeneratedTests.l2_pool_float_large
+@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8
+ GeneratedTests.neg
+ GeneratedTests.neg_3D_int_nnfw
+ GeneratedTests.neg_4D_int_nnfw
+-GeneratedTests.pad_quant8_nnfw
+ GeneratedTests.prelu
+ GeneratedTests.prelu_broadcast_float_1_nnfw
+ GeneratedTests.prelu_broadcast_quant8_1_nnfw
+@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8
+ GeneratedTests.prelu_weight_as_input_quant8_2
+ GeneratedTests.prelu_weight_as_input_quant8_3
+ GeneratedTests.prelu_weight_as_input_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.reduce_max_quant8
+ GeneratedTests.reduce_max_quant8_1_nnfw
+ GeneratedTests.reduce_max_quant8_2
+@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
+ GeneratedTests.select_v1_2_two_dim_quant8
+ GeneratedTests.slice_5
+ GeneratedTests.slice_6
+-GeneratedTests.slice_7
+ GeneratedTests.slice_8
+ GeneratedTests.slice_zero_sized
+ GeneratedTests.slice_zero_sized_quant8
+-GeneratedTests.space_to_depth_float_1
+-GeneratedTests.space_to_depth_float_2
+-GeneratedTests.space_to_depth_float_3
+ GeneratedTests.space_to_depth_quant8_1
+ GeneratedTests.space_to_depth_quant8_2
+ GeneratedTests.sqrt_
+diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
+index e50b941..005f61c 100644
+--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
++++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
+@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8
+ GeneratedTests.cast_float16_to_quant8_overflow
+ GeneratedTests.cast_float32_to_float16
+ GeneratedTests.cast_float32_to_float16_relaxed
++GeneratedTests.cast_float32_to_int32_nnfw
+ GeneratedTests.cast_int32_to_float16
+-GeneratedTests.cast_int32_to_quant8_overflow
+ GeneratedTests.cast_quant8_to_float16
+ GeneratedTests.concat_dynamic_nnfw
+ GeneratedTests.conv_dynamic_nnfw
+@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7
+ GeneratedTests.gather_float16_8
+ GeneratedTests.greater_dynamic_float_nnfw
+ GeneratedTests.greater_equal_dynamic_float_nnfw
++GeneratedTests.l2_normalization_quant8_nnfw
+ GeneratedTests.less_dynamic_float_nnfw
+ GeneratedTests.less_equal_dynamic_float_nnfw
+ GeneratedTests.log_4D_float_nnfw
+@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
+ GeneratedTests.one_hot_ex_dynamic_nnfw
+ GeneratedTests.pack_ex_dynamic_nnfw
+ GeneratedTests.pad_dynamic_nnfw
++GeneratedTests.pad_v2_1_float
++GeneratedTests.pad_v2_1_quant8
++GeneratedTests.pad_v2_all_dims
++GeneratedTests.pad_v2_all_dims_quant8
++GeneratedTests.pad_v2_low_rank
++GeneratedTests.pad_v2_low_rank_quant8
+ GeneratedTests.pow_2D_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw_2
+ GeneratedTests.pow_broadcast_float_nnfw_3
+ GeneratedTests.pow_dynamic_nnfw
++GeneratedTests.quantize_quant8
++GeneratedTests.quantize_quant8_2
++GeneratedTests.quantize_quant8_3
++GeneratedTests.quantize_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.range_ex_float_1
+ GeneratedTests.range_ex_float_1_all_constant_inputs
+ GeneratedTests.range_ex_float_1_dynamic_nnfw
+diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
+index 55cfe39..051fbc7 100644
+--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
++++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
+@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8
+ GeneratedTests.cast_float16_to_quant8_overflow
+ GeneratedTests.cast_float32_to_float16
+ GeneratedTests.cast_float32_to_float16_relaxed
+-GeneratedTests.cast_float32_to_quant8_overflow
+-GeneratedTests.cast_float32_to_quant8_overflow_relaxed
+ GeneratedTests.cast_int32_to_float16
+-GeneratedTests.cast_int32_to_quant8_overflow
+ GeneratedTests.cast_quant8_to_float16
+ GeneratedTests.concat_dynamic_nnfw
+ GeneratedTests.conv_dynamic_nnfw
+@@ -73,6 +70,7 @@ GeneratedTests.greater_dynamic_float_nnfw
+ GeneratedTests.greater_equal_boolean
+ GeneratedTests.greater_equal_dynamic_float_nnfw
+ GeneratedTests.less_boolean
++GeneratedTests.l2_normalization_quant8_nnfw
+ GeneratedTests.less_dynamic_float_nnfw
+ GeneratedTests.less_equal_dynamic_float_nnfw
+ GeneratedTests.log_4D_float_nnfw
+@@ -111,11 +109,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
+ GeneratedTests.one_hot_ex_dynamic_nnfw
+ GeneratedTests.pack_ex_dynamic_nnfw
+ GeneratedTests.pad_dynamic_nnfw
++GeneratedTests.pad_v2_1_float
++GeneratedTests.pad_v2_1_quant8
++GeneratedTests.pad_v2_all_dims
++GeneratedTests.pad_v2_all_dims_quant8
++GeneratedTests.pad_v2_low_rank
++GeneratedTests.pad_v2_low_rank_quant8
+ GeneratedTests.pow_2D_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw_2
+ GeneratedTests.pow_broadcast_float_nnfw_3
+ GeneratedTests.pow_dynamic_nnfw
++GeneratedTests.quantize_quant8
++GeneratedTests.quantize_quant8_2
++GeneratedTests.quantize_quant8_3
++GeneratedTests.quantize_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.range_ex_float_1
+ GeneratedTests.range_ex_float_1_all_constant_inputs
+ GeneratedTests.range_ex_float_1_dynamic_nnfw
+diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
+index 3cce4f3..bc0ae0f 100644
+--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
++++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
+@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8
+ GeneratedTests.hashtable_lookup_float
+ GeneratedTests.hashtable_lookup_float_4D_nnfw
+ GeneratedTests.hashtable_lookup_quant8
+-GeneratedTests.l2_normalization
+-GeneratedTests.l2_normalization_2
+-GeneratedTests.l2_normalization_large
+ GeneratedTests.l2_pool_float
+ GeneratedTests.l2_pool_float_2
+ GeneratedTests.l2_pool_float_large
+@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8
+ GeneratedTests.neg
+ GeneratedTests.neg_3D_int_nnfw
+ GeneratedTests.neg_4D_int_nnfw
+-GeneratedTests.pad_quant8_nnfw
+ GeneratedTests.prelu
+ GeneratedTests.prelu_broadcast_float_1_nnfw
+ GeneratedTests.prelu_broadcast_quant8_1_nnfw
+@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8
+ GeneratedTests.prelu_weight_as_input_quant8_2
+ GeneratedTests.prelu_weight_as_input_quant8_3
+ GeneratedTests.prelu_weight_as_input_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.reduce_max_quant8
+ GeneratedTests.reduce_max_quant8_1_nnfw
+ GeneratedTests.reduce_max_quant8_2
+@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
+ GeneratedTests.select_v1_2_two_dim_quant8
+ GeneratedTests.slice_5
+ GeneratedTests.slice_6
+-GeneratedTests.slice_7
+ GeneratedTests.slice_8
+ GeneratedTests.slice_zero_sized
+ GeneratedTests.slice_zero_sized_quant8
+-GeneratedTests.space_to_depth_float_1
+-GeneratedTests.space_to_depth_float_2
+-GeneratedTests.space_to_depth_float_3
+ GeneratedTests.space_to_depth_quant8_1
+ GeneratedTests.space_to_depth_quant8_2
+ GeneratedTests.sqrt_
+diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp
+index 08118ca..069d367 100644
+--- a/tests/nnapi/nnapi_gtest.skip.noarch.interp
++++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp
+@@ -188,6 +188,7 @@ GeneratedTests.hashtable_lookup_quant8
+ GeneratedTests.l2_normalization
+ GeneratedTests.l2_normalization_2
+ GeneratedTests.l2_normalization_large
++GeneratedTests.l2_normalization_quant8_nnfw
+ GeneratedTests.l2_pool_float
+ GeneratedTests.l2_pool_float_2
+ GeneratedTests.l2_pool_float_large
+@@ -312,6 +313,12 @@ GeneratedTests.pack_ex_2D_int_2
+ GeneratedTests.pack_ex_dynamic_nnfw
+ GeneratedTests.pad_dynamic_nnfw
+ GeneratedTests.pad_quant8_nnfw
++GeneratedTests.pad_v2_1_float
++GeneratedTests.pad_v2_1_quant8
++GeneratedTests.pad_v2_all_dims
++GeneratedTests.pad_v2_all_dims_quant8
++GeneratedTests.pad_v2_low_rank
++GeneratedTests.pad_v2_low_rank_quant8
+ GeneratedTests.pow_2D_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw
+ GeneratedTests.pow_broadcast_float_nnfw_2
+@@ -331,6 +338,15 @@ GeneratedTests.prelu_weight_as_input_quant8
+ GeneratedTests.prelu_weight_as_input_quant8_2
+ GeneratedTests.prelu_weight_as_input_quant8_3
+ GeneratedTests.prelu_weight_as_input_quant8_4
++GeneratedTests.quantize_quant8
++GeneratedTests.quantize_quant8_2
++GeneratedTests.quantize_quant8_3
++GeneratedTests.quantize_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.range_ex_float_1
+ GeneratedTests.range_ex_float_1_all_constant_inputs
+ GeneratedTests.range_ex_float_1_dynamic_nnfw
+diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
+index 3cce4f3..bc0ae0f 100644
+--- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
++++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
+@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8
+ GeneratedTests.hashtable_lookup_float
+ GeneratedTests.hashtable_lookup_float_4D_nnfw
+ GeneratedTests.hashtable_lookup_quant8
+-GeneratedTests.l2_normalization
+-GeneratedTests.l2_normalization_2
+-GeneratedTests.l2_normalization_large
+ GeneratedTests.l2_pool_float
+ GeneratedTests.l2_pool_float_2
+ GeneratedTests.l2_pool_float_large
+@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8
+ GeneratedTests.neg
+ GeneratedTests.neg_3D_int_nnfw
+ GeneratedTests.neg_4D_int_nnfw
+-GeneratedTests.pad_quant8_nnfw
+ GeneratedTests.prelu
+ GeneratedTests.prelu_broadcast_float_1_nnfw
+ GeneratedTests.prelu_broadcast_quant8_1_nnfw
+@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8
+ GeneratedTests.prelu_weight_as_input_quant8_2
+ GeneratedTests.prelu_weight_as_input_quant8_3
+ GeneratedTests.prelu_weight_as_input_quant8_4
++GeneratedTests.quantize_quant8_5
++GeneratedTests.quantize_quant8_6
++GeneratedTests.quantize_quant8_7
++GeneratedTests.quantize_quant8_8
++GeneratedTests.quantize_zero_sized
+ GeneratedTests.reduce_max_quant8
+ GeneratedTests.reduce_max_quant8_1_nnfw
+ GeneratedTests.reduce_max_quant8_2
+@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
+ GeneratedTests.select_v1_2_two_dim_quant8
+ GeneratedTests.slice_5
+ GeneratedTests.slice_6
+-GeneratedTests.slice_7
+ GeneratedTests.slice_8
+ GeneratedTests.slice_zero_sized
+ GeneratedTests.slice_zero_sized_quant8
+-GeneratedTests.space_to_depth_float_1
+-GeneratedTests.space_to_depth_float_2
+-GeneratedTests.space_to_depth_float_3
+ GeneratedTests.space_to_depth_quant8_1
+ GeneratedTests.space_to_depth_quant8_2
+ GeneratedTests.sqrt_
+diff --git a/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py
+new file mode 100644
+index 0000000..ca3770c
+--- /dev/null
++++ b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py
+@@ -0,0 +1,30 @@
++#
++# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
++# Copyright (C) 2017 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++model = Model()
++in0 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128")
++out0 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128")
++model = model.Operation("L2_NORMALIZATION", in0).To(out0)
++
++# Example 1. Input in operand 0,
++input0 = {in0: # input 0
++ [0, 5, 12]}
++output0 = {out0: # output 0
++ [51, 54, 58]}
++
++# Instantiate an example
++Example((input0, output0))
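
As a cross-check of the expected values in the new spec above: dequantize the input with the declared scale and zero point, L2-normalize, then re-quantize. Assuming the output uses the canonical NNAPI quant8 encoding for L2_NORMALIZATION (scale 1/128, zero point 128) rather than the declared 2e-7, the numbers come out to [51, 54, 58]. A short NumPy sketch:

import numpy as np

q_in = np.array([0, 5, 12], dtype=np.int32)
x = (q_in - 128) * 2e-7                  # dequantize with the declared scale/zero point
y = x / np.linalg.norm(x)                # L2 normalization along the only non-unit axis
# Assumption: output encoded with scale 1/128, zero point 128.
q_out = np.clip(np.round(y * 128) + 128, 0, 255).astype(np.uint8)
print(q_out.tolist())                    # [51, 54, 58]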
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py
+new file mode 100644
+index 0000000..c500741
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py
+@@ -0,0 +1,35 @@
++#
++# Copyright (C) 2018 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++input0 = Input("input0", "TENSOR_FLOAT32", "{1, 2, 3, 1}")
++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0,
++ 0, 2,
++ 1, 3,
++ 0, 0])
++pad_value = Float32Scalar("pad_value", 9.3)
++output0 = Output("output0", "TENSOR_FLOAT32", "{1, 4, 7, 1}")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example(({
++ input0: [1.0, 2.0, 3.0,
++ 4.0, 5.0, 6.0],
++}, {
++ output0: [9.3, 1.0, 2.0, 3.0, 9.3, 9.3, 9.3,
++ 9.3, 4.0, 5.0, 6.0, 9.3, 9.3, 9.3,
++ 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3,
++ 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3],
++})).AddVariations("float16", "relaxed")
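
The expected output of pad_v2_1_float above can be reproduced independently with NumPy's pad, the same approach the pad_v2_all_dims specs below already use. This is only a cross-check, not part of the generated test:

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(1, 2, 3, 1)
padded = np.pad(x, [[0, 0], [0, 2], [1, 3], [0, 0]],
                "constant", constant_values=9.3)   # -> shape (1, 4, 7, 1)
print(padded.flatten().tolist())                   # matches output0 in the spec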
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py
+new file mode 100644
+index 0000000..3dfaff6
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py
+@@ -0,0 +1,35 @@
++#
++# Copyright (C) 2018 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 2, 3, 1}, 2.3, 4")
++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0,
++ 0, 2,
++ 1, 3,
++ 0, 0])
++pad_value = Int32Scalar("pad_value", 9)
++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{1, 4, 7, 1}, 2.3, 4")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example(({
++ input0: [1, 2, 3,
++ 4, 5, 6],
++}, {
++ output0: [9, 1, 2, 3, 9, 9, 9,
++ 9, 4, 5, 6, 9, 9, 9,
++ 9, 9, 9, 9, 9, 9, 9,
++ 9, 9, 9, 9, 9, 9, 9],
++}))
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py
+new file mode 100644
+index 0000000..5b27f49
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py
+@@ -0,0 +1,40 @@
++#
++# Copyright (C) 2019 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++import numpy as np
++
++input0 = Input("input0", "TENSOR_FLOAT32", "{1, 1, 2, 3}")
++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2,
++ 3, 4,
++ 3, 3,
++ 2, 1])
++pad_value = Float32Scalar("pad_value", 3.9)
++output0 = Output("output0", "TENSOR_FLOAT32", "{4, 8, 8, 6}")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++ input0: [1.0, 2.0, 3.0,
++ 4.0, 5.0, 6.0],
++ output0: np.pad([[[[1.0, 2.0, 3.0],
++ [4.0, 5.0, 6.0]]]],
++ [[1, 2],
++ [3, 4],
++ [3, 3],
++ [2, 1]],
++ "constant",
++ constant_values=3.9).flatten().tolist(),
++}).AddVariations("float16", "relaxed")
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py
+new file mode 100644
+index 0000000..5ee4b06
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py
+@@ -0,0 +1,40 @@
++#
++# Copyright (C) 2019 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++import numpy as np
++
++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 1, 2, 3}, 2.3, 4")
++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2,
++ 3, 4,
++ 3, 3,
++ 2, 1])
++pad_value = Int32Scalar("pad_value", 3)
++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{4, 8, 8, 6}, 2.3, 4")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++ input0: [1, 2, 3,
++ 4, 5, 6],
++ output0: np.pad([[[[1, 2, 3],
++ [4, 5, 6]]]],
++ [[1, 2],
++ [3, 4],
++ [3, 3],
++ [2, 1]],
++ "constant",
++ constant_values=3).flatten().tolist(),
++})
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
+new file mode 100644
+index 0000000..391d5cf
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
+@@ -0,0 +1,27 @@
++#
++# Copyright (C) 2019 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++input0 = Input("input0", "TENSOR_FLOAT32", "{3}")
++paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1])
++pad_value = Float32Scalar("pad_value", 9.9)
++output0 = Output("output0", "TENSOR_FLOAT32", "{7}")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++ input0: [1.0, 2.0, 3.0],
++ output0: [9.9, 9.9, 9.9, 1.0, 2.0, 3.0, 9.9],
++}).AddVariations("float16")
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py
+new file mode 100644
+index 0000000..b67c2b8
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py
+@@ -0,0 +1,27 @@
++#
++# Copyright (C) 2019 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{3}, 2.3, 4")
++paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1])
++pad_value = Int32Scalar("pad_value", 9)
++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{7}, 2.3, 4")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++ input0: [1, 2, 3],
++ output0: [9, 9, 9, 1, 2, 3, 9],
++})
+diff --git a/tests/nnapi/specs/V1_2/quantize.mod.py b/tests/nnapi/specs/V1_2/quantize.mod.py
+new file mode 100644
+index 0000000..a42624d
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/quantize.mod.py
+@@ -0,0 +1,69 @@
++#
++# Copyright (C) 2018 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++import numpy as np
++
++num_values = 300
++values = list(np.linspace(-10, 10, num_values))
++
++for input_type in ["TENSOR_FLOAT32", "TENSOR_FLOAT16"]:
++ for scale, offset in [(1.0, 0),
++ (1.0, 1),
++ (0.01, 120),
++ (10.0, 120)]:
++ input0 = Input("input0", input_type, "{%d}" % num_values)
++ output0 = Output("output0", input_type, "{%d}" % num_values)
++
++ model = Model().Operation("QUANTIZE", input0).To(output0)
++
++ quantizeOutput = DataTypeConverter().Identify({
++ output0: ["TENSOR_QUANT8_ASYMM", scale, offset],
++ })
++
++ Example({
++ input0: values,
++ output0: values,
++ }).AddVariations(quantizeOutput, includeDefault=False)
++
++
++# Zero-sized input
++
++# Use BOX_WITH_NMS_LIMIT op to generate a zero-sized internal tensor for box coordinates.
++p1 = Parameter("scores", "TENSOR_FLOAT32", "{1, 2}", [0.90, 0.10]) # scores
++p2 = Parameter("roi", "TENSOR_FLOAT32", "{1, 8}", [1, 1, 10, 10, 0, 0, 10, 10]) # roi
++o1 = Output("scoresOut", "TENSOR_FLOAT32", "{0}") # scores out
++o2 = Output("classesOut", "TENSOR_INT32", "{0}") # classes out
++tmp1 = Internal("roiOut", "TENSOR_FLOAT32", "{0, 4}") # roi out
++tmp2 = Internal("batchSplitOut", "TENSOR_INT32", "{0}") # batch split out
++model = Model("zero_sized").Operation("BOX_WITH_NMS_LIMIT", p1, p2, [0], 0.3, -1, 0, 0.4, 1.0, 0.3).To(o1, tmp1, o2, tmp2)
++
++# Use ROI_ALIGN op to convert into zero-sized feature map.
++layout = BoolScalar("layout", False) # NHWC
++i1 = Input("in", "TENSOR_FLOAT32", "{1, 1, 1, 1}")
++zero_sized = Internal("featureMap", "TENSOR_FLOAT32", "{0, 2, 2, 1}")
++model = model.Operation("ROI_ALIGN", i1, tmp1, tmp2, 2, 2, 2.0, 2.0, 4, 4, layout).To(zero_sized)
++
++# QUANTIZE op with numBatches = 0.
++o3 = Output("out", "TENSOR_QUANT8_ASYMM", "{0, 2, 2, 1}, 0.1f, 128") # out
++model = model.Operation("QUANTIZE", zero_sized).To(o3)
++
++# Create test case with dummy values.
++Example({
++ i1: [1],
++ o1: [0],
++ o2: [0],
++ o3: [0],
++}).AddVariations("relaxed", "float16")
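
For reference, the quantized outputs that the DataTypeConverter variation above checks follow the standard NNAPI affine quantization for TENSOR_QUANT8_ASYMM, q = clamp(round(x / scale) + zeroPoint, 0, 255). A minimal sketch (tie-breaking on exact .5 values may differ slightly from the reference generator):

import numpy as np

def quantize(x, scale, zero_point):
    # q = clamp(round(x / scale) + zeroPoint, 0, 255)
    return np.clip(np.round(np.asarray(x, dtype=np.float64) / scale) + zero_point,
                   0, 255).astype(np.uint8)

print(quantize([-10.0, 0.0, 10.0], scale=0.01, zero_point=120).tolist())  # [0, 120, 255]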
+diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py
+deleted file mode 100644
+index c500741..0000000
+--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py
++++ /dev/null
+@@ -1,35 +0,0 @@
+-#
+-# Copyright (C) 2018 The Android Open Source Project
+-#
+-# Licensed under the Apache License, Version 2.0 (the "License");
+-# you may not use this file except in compliance with the License.
+-# You may obtain a copy of the License at
+-#
+-# http://www.apache.org/licenses/LICENSE-2.0
+-#
+-# Unless required by applicable law or agreed to in writing, software
+-# distributed under the License is distributed on an "AS IS" BASIS,
+-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-# See the License for the specific language governing permissions and
+-# limitations under the License.
+-#
+-
+-input0 = Input("input0", "TENSOR_FLOAT32", "{1, 2, 3, 1}")
+-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0,
+- 0, 2,
+- 1, 3,
+- 0, 0])
+-pad_value = Float32Scalar("pad_value", 9.3)
+-output0 = Output("output0", "TENSOR_FLOAT32", "{1, 4, 7, 1}")
+-
+-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
+-
+-Example(({
+- input0: [1.0, 2.0, 3.0,
+- 4.0, 5.0, 6.0],
+-}, {
+- output0: [9.3, 1.0, 2.0, 3.0, 9.3, 9.3, 9.3,
+- 9.3, 4.0, 5.0, 6.0, 9.3, 9.3, 9.3,
+- 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3,
+- 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3],
+-})).AddVariations("float16", "relaxed")
+diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py
+deleted file mode 100644
+index 3dfaff6..0000000
+--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py
++++ /dev/null
+@@ -1,35 +0,0 @@
+-#
+-# Copyright (C) 2018 The Android Open Source Project
+-#
+-# Licensed under the Apache License, Version 2.0 (the "License");
+-# you may not use this file except in compliance with the License.
+-# You may obtain a copy of the License at
+-#
+-# http://www.apache.org/licenses/LICENSE-2.0
+-#
+-# Unless required by applicable law or agreed to in writing, software
+-# distributed under the License is distributed on an "AS IS" BASIS,
+-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-# See the License for the specific language governing permissions and
+-# limitations under the License.
+-#
+-
+-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 2, 3, 1}, 2.3, 4")
+-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0,
+- 0, 2,
+- 1, 3,
+- 0, 0])
+-pad_value = Int32Scalar("pad_value", 9)
+-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{1, 4, 7, 1}, 2.3, 4")
+-
+-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
+-
+-Example(({
+- input0: [1, 2, 3,
+- 4, 5, 6],
+-}, {
+- output0: [9, 1, 2, 3, 9, 9, 9,
+- 9, 4, 5, 6, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9],
+-}))
+diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py
+deleted file mode 100644
+index 5b27f49..0000000
+--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py
++++ /dev/null
+@@ -1,40 +0,0 @@
+-#
+-# Copyright (C) 2019 The Android Open Source Project
+-#
+-# Licensed under the Apache License, Version 2.0 (the "License");
+-# you may not use this file except in compliance with the License.
+-# You may obtain a copy of the License at
+-#
+-# http://www.apache.org/licenses/LICENSE-2.0
+-#
+-# Unless required by applicable law or agreed to in writing, software
+-# distributed under the License is distributed on an "AS IS" BASIS,
+-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-# See the License for the specific language governing permissions and
+-# limitations under the License.
+-#
+-
+-import numpy as np
+-
+-input0 = Input("input0", "TENSOR_FLOAT32", "{1, 1, 2, 3}")
+-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2,
+- 3, 4,
+- 3, 3,
+- 2, 1])
+-pad_value = Float32Scalar("pad_value", 3.9)
+-output0 = Output("output0", "TENSOR_FLOAT32", "{4, 8, 8, 6}")
+-
+-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
+-
+-Example({
+- input0: [1.0, 2.0, 3.0,
+- 4.0, 5.0, 6.0],
+- output0: np.pad([[[[1.0, 2.0, 3.0],
+- [4.0, 5.0, 6.0]]]],
+- [[1, 2],
+- [3, 4],
+- [3, 3],
+- [2, 1]],
+- "constant",
+- constant_values=3.9).flatten().tolist(),
+-}).AddVariations("float16", "relaxed")
+diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py
+deleted file mode 100644
+index 5ee4b06..0000000
+--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py
++++ /dev/null
+@@ -1,40 +0,0 @@
+-#
+-# Copyright (C) 2019 The Android Open Source Project
+-#
+-# Licensed under the Apache License, Version 2.0 (the "License");
+-# you may not use this file except in compliance with the License.
+-# You may obtain a copy of the License at
+-#
+-# http://www.apache.org/licenses/LICENSE-2.0
+-#
+-# Unless required by applicable law or agreed to in writing, software
+-# distributed under the License is distributed on an "AS IS" BASIS,
+-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-# See the License for the specific language governing permissions and
+-# limitations under the License.
+-#
+-
+-import numpy
+-
+-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 1, 2, 3}, 2.3, 4")
+-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2,
+- 3, 4,
+- 3, 3,
+- 2, 1])
+-pad_value = Int32Scalar("pad_value", 3)
+-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{4, 8, 8, 6}, 2.3, 4")
+-
+-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
+-
+-Example({
+- input0: [1, 2, 3,
+- 4, 5, 6],
+- output0: np.pad([[[[1, 2, 3],
+- [4, 5, 6]]]],
+- [[1, 2],
+- [3, 4],
+- [3, 3],
+- [2, 1]],
+- "constant",
+- constant_values=3).flatten().tolist(),
+-})
+diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py
+deleted file mode 100644
+index 391d5cf..0000000
+--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py
++++ /dev/null
+@@ -1,27 +0,0 @@
+-#
+-# Copyright (C) 2019 The Android Open Source Project
+-#
+-# Licensed under the Apache License, Version 2.0 (the "License");
+-# you may not use this file except in compliance with the License.
+-# You may obtain a copy of the License at
+-#
+-# http://www.apache.org/licenses/LICENSE-2.0
+-#
+-# Unless required by applicable law or agreed to in writing, software
+-# distributed under the License is distributed on an "AS IS" BASIS,
+-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-# See the License for the specific language governing permissions and
+-# limitations under the License.
+-#
+-
+-input0 = Input("input0", "TENSOR_FLOAT32", "{3}")
+-paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1])
+-pad_value = Float32Scalar("pad_value", 9.9)
+-output0 = Output("output0", "TENSOR_FLOAT32", "{7}")
+-
+-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
+-
+-Example({
+- input0: [1.0, 2.0, 3.0],
+- output0: [9.9, 9.9, 9.9, 1.0, 2.0, 3.0, 9.9],
+-}).AddVariations("float16")
+diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py
+deleted file mode 100644
+index b67c2b8..0000000
+--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py
++++ /dev/null
+@@ -1,27 +0,0 @@
+-#
+-# Copyright (C) 2019 The Android Open Source Project
+-#
+-# Licensed under the Apache License, Version 2.0 (the "License");
+-# you may not use this file except in compliance with the License.
+-# You may obtain a copy of the License at
+-#
+-# http://www.apache.org/licenses/LICENSE-2.0
+-#
+-# Unless required by applicable law or agreed to in writing, software
+-# distributed under the License is distributed on an "AS IS" BASIS,
+-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-# See the License for the specific language governing permissions and
+-# limitations under the License.
+-#
+-
+-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{3}, 2.3, 4")
+-paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1])
+-pad_value = Int32Scalar("pad_value", 9)
+-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{7}, 2.3, 4")
+-
+-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
+-
+-Example({
+- input0: [1, 2, 3],
+- output0: [9, 9, 9, 1, 2, 3, 9],
+-})
+diff --git a/tests/nnapi/specs/skip/V1_2/quantize.mod.py b/tests/nnapi/specs/skip/V1_2/quantize.mod.py
+deleted file mode 100644
+index a42624d..0000000
+--- a/tests/nnapi/specs/skip/V1_2/quantize.mod.py
++++ /dev/null
+@@ -1,69 +0,0 @@
+-#
+-# Copyright (C) 2018 The Android Open Source Project
+-#
+-# Licensed under the Apache License, Version 2.0 (the "License");
+-# you may not use this file except in compliance with the License.
+-# You may obtain a copy of the License at
+-#
+-# http://www.apache.org/licenses/LICENSE-2.0
+-#
+-# Unless required by applicable law or agreed to in writing, software
+-# distributed under the License is distributed on an "AS IS" BASIS,
+-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-# See the License for the specific language governing permissions and
+-# limitations under the License.
+-#
+-
+-import numpy as np
+-
+-num_values = 300
+-values = list(np.linspace(-10, 10, num_values))
+-
+-for input_type in ["TENSOR_FLOAT32", "TENSOR_FLOAT16"]:
+- for scale, offset in [(1.0, 0),
+- (1.0, 1),
+- (0.01, 120),
+- (10.0, 120)]:
+- input0 = Input("input0", input_type, "{%d}" % num_values)
+- output0 = Output("output0", input_type, "{%d}" % num_values)
+-
+- model = Model().Operation("QUANTIZE", input0).To(output0)
+-
+- quantizeOutput = DataTypeConverter().Identify({
+- output0: ["TENSOR_QUANT8_ASYMM", scale, offset],
+- })
+-
+- Example({
+- input0: values,
+- output0: values,
+- }).AddVariations(quantizeOutput, includeDefault=False)
+-
+-
+-# Zero-sized input
+-
+-# Use BOX_WITH_NMS_LIMIT op to generate a zero-sized internal tensor for box cooridnates.
+-p1 = Parameter("scores", "TENSOR_FLOAT32", "{1, 2}", [0.90, 0.10]) # scores
+-p2 = Parameter("roi", "TENSOR_FLOAT32", "{1, 8}", [1, 1, 10, 10, 0, 0, 10, 10]) # roi
+-o1 = Output("scoresOut", "TENSOR_FLOAT32", "{0}") # scores out
+-o2 = Output("classesOut", "TENSOR_INT32", "{0}") # classes out
+-tmp1 = Internal("roiOut", "TENSOR_FLOAT32", "{0, 4}") # roi out
+-tmp2 = Internal("batchSplitOut", "TENSOR_INT32", "{0}") # batch split out
+-model = Model("zero_sized").Operation("BOX_WITH_NMS_LIMIT", p1, p2, [0], 0.3, -1, 0, 0.4, 1.0, 0.3).To(o1, tmp1, o2, tmp2)
+-
+-# Use ROI_ALIGN op to convert into zero-sized feature map.
+-layout = BoolScalar("layout", False) # NHWC
+-i1 = Input("in", "TENSOR_FLOAT32", "{1, 1, 1, 1}")
+-zero_sized = Internal("featureMap", "TENSOR_FLOAT32", "{0, 2, 2, 1}")
+-model = model.Operation("ROI_ALIGN", i1, tmp1, tmp2, 2, 2, 2.0, 2.0, 4, 4, layout).To(zero_sized)
+-
+-# QUANTIZE op with numBatches = 0.
+-o3 = Output("out", "TENSOR_QUANT8_ASYMM", "{0, 2, 2, 1}, 0.1f, 128") # out
+-model = model.Operation("QUANTIZE", zero_sized).To(o3)
+-
+-# Create test case with dummy values.
+-Example({
+- i1: [1],
+- o1: [0],
+- o2: [0],
+- o3: [0],
+-}).AddVariations("relaxed", "float16")
+diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
+index 67f2467..c6c6355 100644
+--- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
++++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
+@@ -51,19 +51,24 @@ TEST_F(ValidationTestAddModelLoaded, output_tensorinfo)
+ ASSERT_EQ(tensor_info.dims[0], 1);
+ }
+
+-TEST_F(ValidationTestAddModelLoaded, neg_run_001)
++TEST_F(ValidationTestAddModelLoaded, neg_run)
+ {
+- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
++ // nnfw_prepare is not called
++ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
+ }
+
+-TEST_F(ValidationTestAddModelLoaded, neg_set_input_001)
++TEST_F(ValidationTestAddModelLoaded, neg_set_input)
+ {
+- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
++ // nnfw_prepare is not called
++ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
++ NNFW_STATUS_INVALID_STATE);
+ }
+
+-TEST_F(ValidationTestAddModelLoaded, neg_set_output_001)
++TEST_F(ValidationTestAddModelLoaded, neg_set_output)
+ {
+- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
++ // nnfw_prepare is not called
++ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
++ NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestAddModelLoaded, neg_get_input_size)
+@@ -81,7 +86,7 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model)
+ // load model twice
+ ASSERT_EQ(nnfw_load_model_from_file(
+ _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()),
+- NNFW_STATUS_ERROR);
++ NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestAddModelLoaded, neg_output_tensorinfo)
+diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
+index 1bb4182..0f4a4af 100644
+--- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
++++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
+@@ -102,7 +102,7 @@ TEST_F(ValidationTestAddSessionPrepared, neg_run_during_async_run)
+ {
+ SetInOutBuffers();
+ ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR);
+- EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
++ EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
+ ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR);
+ }
+
+@@ -152,13 +152,13 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model)
+ // Load model twice
+ ASSERT_EQ(nnfw_load_model_from_file(
+ _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()),
+- NNFW_STATUS_ERROR);
++ NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestAddSessionPrepared, neg_prepare)
+ {
+ // Call Prepare twice
+- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
+ }
+
+ // TODO Validation check when "nnfw_run" is called without input & output tensor setting
+diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc
+index 2675aa7..01832db 100644
+--- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc
++++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc
+@@ -58,7 +58,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1)
+ nnfw_load_model_from_file(
+ _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()),
+ NNFW_STATUS_ERROR);
+- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2)
+@@ -67,52 +67,52 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2)
+ _session,
+ NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()),
+ NNFW_STATUS_ERROR);
+- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_prepare_001)
+ {
+ // nnfw_load_model_from_file was not called
+- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_run_001)
+ {
+ // nnfw_load_model_from_file and nnfw_prepare was not called
+- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_set_input_001)
+ {
+- // Invalid state
+- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
++ NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_set_output_001)
+ {
+- // Invalid state
+- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
++ NNFW_STATUS_INVALID_STATE);
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_get_input_size)
+ {
+ uint32_t size = 10000;
+- ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_ERROR);
+- ASSERT_EQ(size, 10000);
++ ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_INVALID_STATE);
++ ASSERT_EQ(size, 10000); // Remain unchanged
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_get_output_size)
+ {
+ uint32_t size = 10000;
+- ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_ERROR);
+- ASSERT_EQ(size, 10000);
++ ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_INVALID_STATE);
++ ASSERT_EQ(size, 10000); // Remain unchanged
+ }
+
+ TEST_F(ValidationTestSessionCreated, neg_output_tensorinfo)
+ {
+ nnfw_tensorinfo tensor_info;
+ // model is not loaded
+- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_INVALID_STATE);
+ // model is not loaded and tensor_info is null
+- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR);
++ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_INVALID_STATE);
+ }
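The validation-test updates above track the new `NNFW_STATUS_INVALID_STATE` return code: API calls made in the wrong session state (running before prepare, loading a model twice, and so on) are now distinguished from generic `NNFW_STATUS_ERROR` failures. A minimal client-side sketch of how a caller might use that distinction is shown below; the package path is a placeholder and error handling is reduced to logging.

```cpp
#include <cstdio>
#include "nnfw.h"

// Hypothetical nnpackage directory; replace with a real one.
static const char *kPackageDir = "/path/to/nnpackage";

int main()
{
  nnfw_session *session = nullptr;
  if (nnfw_create_session(&session) != NNFW_STATUS_NO_ERROR)
    return 1;

  // Running before load/prepare is a state error, not a generic failure.
  if (nnfw_run(session) == NNFW_STATUS_INVALID_STATE)
    std::printf("run() rejected: session is not prepared yet\n");

  if (nnfw_load_model_from_file(session, kPackageDir) != NNFW_STATUS_NO_ERROR)
  {
    std::printf("loading failed (bad package or wrong state)\n");
    nnfw_close_session(session);
    return 1;
  }

  if (nnfw_prepare(session) != NNFW_STATUS_NO_ERROR)
  {
    nnfw_close_session(session);
    return 1;
  }

  // Loading the same model twice now reports NNFW_STATUS_INVALID_STATE.
  if (nnfw_load_model_from_file(session, kPackageDir) == NNFW_STATUS_INVALID_STATE)
    std::printf("second load rejected: model already loaded\n");

  nnfw_close_session(session);
  return 0;
}
```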
+diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh
+index c7f44c5..af79728 100755
+--- a/tests/scripts/benchmark_nnapi.sh
++++ b/tests/scripts/benchmark_nnapi.sh
+@@ -18,7 +18,6 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+ source $MY_PATH/common.sh
+
+-BENCHMARK_RUN_TEST_SH=
+ BENCHMARK_DRIVER_BIN=
+ BENCHMARK_REPORT_DIR=
+ BENCHMARK_MODELS_FILE=
+@@ -30,7 +29,7 @@ EXECUTORS="Linear Parallel" #TODO: accept this list as argument
+
+ function Usage()
+ {
+- echo "Usage: ./$0 --reportdir=. --runtestsh=tests/scripts/framework/run_test.sh --driverbin=Product/out/bin/tflite_run"
++ echo "Usage: ./$0 --reportdir=. --driverbin=Product/out/bin/tflite_run"
+ }
+
+ for i in "$@"
+@@ -43,9 +42,6 @@ do
+ --test_op)
+ TEST_OP="true"
+ ;;
+- --runtestsh=*)
+- BENCHMARK_RUN_TEST_SH=${i#*=}
+- ;;
+ --driverbin=*)
+ BENCHMARK_DRIVER_BIN=${i#*=}
+ ;;
+@@ -147,9 +143,8 @@ function run_onert_with_all_config()
+ local REPORT_MODEL_DIR=$2
+ local PAUSE_TIME_IN_SEC=$3
+ local BENCHMARK_DRIVER_BIN=$4
+- local BENCHMARK_RUN_TEST_SH=$5
+- local EXECUTORS=$6
+- local BACKEND_LIST=$7
++ local EXECUTORS=$5
++ local BACKEND_LIST=$6
+
+ export USE_NNAPI=1
+
+@@ -163,18 +158,18 @@ function run_onert_with_all_config()
+ done
+ export BACKENDS=$BACKENDS_TO_USE
+ if [ "$TEST_OP" == "false" ]; then
+- profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT
++ profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT
+ fi
+
+ for executor in $EXECUTORS; do
+ export EXECUTOR=$executor
+ if [ "$TEST_OP" == "false" ]; then
+- run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $executor
++ run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $executor
+ fi
+ for backend in $BACKEND_LIST; do
+ export OP_BACKEND_ALLOPS=$backend
+ run_benchmark_and_print "tflite_onert_"$executor"_executor_$backend" "TFLite onert $executor Executor $backend"\
+- $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH
++ $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN
+ done
+ done
+ unset USE_NNAPI EXECUTOR OP_BACKEND_ALLOPS BACKENDS
+@@ -215,14 +210,14 @@ function run_benchmark_test()
+
+ # TFLite+CPU
+ unset USE_NNAPI
+- run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH
++ run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN
+
+ # run onert
+ if [ "$TEST_OP" == "true" ]; then
+ # Operation test don't need to test each scheduler
+- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "Linear" "$BACKEND_LIST"
++ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "Linear" "$BACKEND_LIST"
+ else
+- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "$EXECUTORS" "$BACKEND_LIST"
++ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "$EXECUTORS" "$BACKEND_LIST"
+ fi
+
+ if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then
+diff --git a/tests/scripts/common.sh b/tests/scripts/common.sh
+index 8800290..b2799c2 100755
+--- a/tests/scripts/common.sh
++++ b/tests/scripts/common.sh
+@@ -18,13 +18,12 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+ function get_result_of_benchmark_test()
+ {
+- local RUN_TEST_SH=$1
+- local DRIVER_BIN=$2
+- local MODEL=$3
+- local LOG_FILE=$4
++ local DRIVER_BIN=$1
++ local MODEL=$2
++ local LOG_FILE=$3
+
+ local RET=0
+- $RUN_TEST_SH --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
++ $MY_PATH/framework/run_test.sh --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
+ RET=$?
+ if [[ $RET -ne 0 ]]; then
+ echo "Testing $MODEL aborted... exit code: $RET"
+@@ -68,7 +67,7 @@ function run_benchmark_and_print()
+ LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt
+ RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result
+ print_with_dots $MSG
+- RESULT=$(get_result_of_benchmark_test $BENCHMARK_RUN_TEST_SH $DRIVER_BIN $MODEL $LOG_FILE)
++ RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE)
+ echo "$RESULT ms"
+ print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE
+ sleep $PAUSE_TIME_IN_SEC
+diff --git a/tests/scripts/framework/run_test.sh b/tests/scripts/framework/run_test.sh
+index 44b7149..9440c52 100755
+--- a/tests/scripts/framework/run_test.sh
++++ b/tests/scripts/framework/run_test.sh
+@@ -28,10 +28,12 @@ function Usage()
+ echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}"
+ echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2"
+ echo ""
+- echo "--download - (default=off) Download model files. Other options is ignored"
+- echo "--driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
+- echo "--reportdir - (default=report) directory to place tap files"
+- echo "--tapname - (default=framework_test.tap) file name to be written for tap"
++ echo "--download - (default=on) Download model files"
++ echo "--run - (default=on) Test model files"
++ echo "--driverbin - (default=../../Product/out/bin/tflite_run) Runner for runnning model tests"
++ echo "--reportdir - (default=report) Directory to place tap files"
++ echo "--tapname - (default=framework_test.tap) File name to be written for tap"
++ echo "--md5 - (default=on) MD5 check when download model files"
+ echo ""
+ }
+
+@@ -43,9 +45,13 @@ function need_download()
+ return 0;
+ fi
+ # Ignore checking md5 in cache
++ # TODO Use "--md5" option only and remove IGNORE_MD5 environment variable
+ if [ ! -z $IGNORE_MD5 ] && [ "$IGNORE_MD5" == "1" ]; then
+ return 1
+ fi
++ if [ "$MD5_CHECK" = "off" ]; then
++ return 1
++ fi
+
+ LOCAL_HASH=$(md5sum $LOCAL_PATH | awk '{ print $1 }')
+ REMOTE_HASH=$(curl -ss $REMOTE_URL | md5sum | awk '{ print $1 }')
+@@ -60,7 +66,9 @@ function need_download()
+ DRIVER_BIN=""
+ TAP_NAME="framework_test.tap"
+ TEST_LIST=()
+-DOWNLOAD_MODE="off"
++DOWNLOAD_MODEL="on"
++RUN_TEST="on"
++MD5_CHECK="on"
+
+ # Support environment variable setting for mirror server
+ FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}"
+@@ -84,6 +92,12 @@ do
+ --download=*)
+ DOWNLOAD_MODE=${i#*=}
+ ;;
++ --md5=*)
++ MD5_CHECK=${i#*=}
++ ;;
++ --run=*)
++ RUN_TEST=${i#*=}
++ ;;
+ *)
+ TEST_LIST+=( $i )
+ ;;
+@@ -100,7 +114,7 @@ if [ ! -n "$DRIVER_BIN" ]; then
+ fi
+
+ # Check test driver setting
+-if [ ! -e $DRIVER_BIN ] && [ "$DOWNLOAD_MODE" != "on" ]; then
++if [ ! -e $DRIVER_BIN ] && [ "$RUN_TEST" = "on" ]; then
+ echo "Cannot find test driver" $DRIVER_BIN ": please set proper DRIVER_BIN"
+ exit 1
+ fi
+@@ -139,33 +153,9 @@ run_tests()
+
+ TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME
+ MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME
+- MODELFILE_URL="$MODELFILE_SERVER_PATH/$MODELFILE_NAME"
+- if [ -n "$FIXED_MODELFILE_SERVER" ]; then
+- MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME"
+- fi
+-
+- # Download model file
+- if [ ! -e $TEST_CACHE_PATH ]; then
+- mkdir -p $TEST_CACHE_PATH
+- fi
+-
+- # Download unless we have it in cache (Also check md5sum)
+- if need_download "$MODELFILE" "$MODELFILE_URL"; then
+- echo ""
+- echo "Download test file for $TEST_NAME"
+- echo "======================"
+-
+- rm -f $MODELFILE # Remove invalid file if exists
+- pushd $TEST_CACHE_PATH
+- wget -nv $MODELFILE_URL
+- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then
+- unzip -o $MODELFILE_NAME
+- fi
+- popd
+- fi
+
+ # Find model file for downloaded by zip
+- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then
++ if [ "${MODELFILE_NAME##*.}" = "zip" ]; then
+ pushd $TEST_CACHE_PATH
+ MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite)
+ popd
+@@ -178,7 +168,6 @@ run_tests()
+ # Run driver to test framework
+ $DRIVER_BIN $MODELFILE
+
+- #$DRIVER_BIN $MODELFILE
+ if [[ $? -eq 0 ]]; then
+ echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME
+ else
+@@ -268,10 +257,11 @@ find_tests()
+ mkdir -p $REPORT_DIR
+ TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]})
+
+-if [[ "$DOWNLOAD_MODE" == "on" ]]; then
++if [ "$DOWNLOAD_MODEL" = "on" ]; then
+ download_tests $TESTS_TO_RUN
+- exit 0;
+ fi
+
+-run_tests $TESTS_TO_RUN
++if [ "$RUN_TEST" = "on" ]; then
++ run_tests $TESTS_TO_RUN
++fi
+ exit $?
+diff --git a/tests/scripts/test-driver.sh b/tests/scripts/test-driver.sh
+index 615fc2c..a720b15 100755
+--- a/tests/scripts/test-driver.sh
++++ b/tests/scripts/test-driver.sh
+@@ -38,7 +38,6 @@ function Usage()
+ echo "etc."
+ echo "--framework_driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
+ echo "--verification_driverbin - (default=../../Product/out/bin/nnapi_test) runner for runnning verification tests"
+- echo "--runtestsh - (default=\$ARTIFACT_PATH/tests/scripts/framework/run_test.sh) run_test.sh with path where it is for framework test and verification"
+ echo "--unittestdir - (default=\$ARTIFACT_PATH/Product/out/unittest) directory that has unittest binaries for unit test"
+ echo ""
+ echo "--reportdir - (default=\$ARTIFACT_PATH/report) directory to save report"
+@@ -49,7 +48,6 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )"
+ ARTIFACT_PATH="$TEST_DRIVER_DIR/../../"
+ FRAMEWORK_DRIVER_BIN=""
+ VERIFICATION_DRIVER_BIN=""
+-RUN_TEST_SH=""
+ UNIT_TEST_DIR=""
+ ALLTEST_ON="true"
+ UNITTEST_ON="false"
+@@ -74,9 +72,6 @@ do
+ --verification_driverbin=*)
+ VERIFICATION_DRIVER_BIN=${i#*=}
+ ;;
+- --runtestsh=*)
+- RUN_TEST_SH=${i#*=}
+- ;;
+ --unittestdir=*)
+ UNIT_TEST_DIR=${i#*=}
+ ;;
+@@ -116,15 +111,6 @@ done
+
+ ARTIFACT_PATH="$(readlink -f $ARTIFACT_PATH)"
+
+-if [ -z "$RUN_TEST_SH" ]; then
+- RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh
+-fi
+-
+-if [ ! -e "$RUN_TEST_SH" ]; then
+- echo "Cannot find $RUN_TEST_SH"
+- exit 1
+-fi
+-
+ if [ -z "$UNIT_TEST_DIR" ]; then
+ UNIT_TEST_DIR=$ARTIFACT_PATH/Product/out/unittest
+ fi
+@@ -149,7 +135,6 @@ if [ "$FRAMEWORKTEST_ON" == "true" ]; then
+ fi
+
+ $TEST_DRIVER_DIR/test_framework.sh \
+- --runtestsh=$RUN_TEST_SH \
+ --driverbin=$FRAMEWORK_DRIVER_BIN \
+ --reportdir=$REPORT_DIR \
+ --tapname=framework_test.tap \
+@@ -166,7 +151,6 @@ if [ "$ALLTEST_ON" == "true" ] || [ "$VERIFICATION_ON" == "true" ]; then
+
+ # verification uses the same script as frameworktest does
+ $TEST_DRIVER_DIR/test_framework.sh \
+- --runtestsh=$RUN_TEST_SH \
+ --driverbin=$VERIFICATION_DRIVER_BIN \
+ --reportdir=$REPORT_DIR \
+ --tapname=verification_test.tap \
+@@ -180,7 +164,6 @@ if [ "$BENCHMARK_ONERT_OP_ON" == "true" ]; then
+
+ $TEST_DRIVER_DIR/benchmark_nnapi.sh \
+ --test_op \
+- --runtestsh=$RUN_TEST_SH \
+ --driverbin=$DRIVER_BIN \
+ --reportdir=$REPORT_DIR/benchmark_op \
+ --modelfilepath=$ARTIFACT_PATH/tests/scripts/framework
+diff --git a/tests/scripts/test_framework.sh b/tests/scripts/test_framework.sh
+index 1d97515..bd86cd3 100755
+--- a/tests/scripts/test_framework.sh
++++ b/tests/scripts/test_framework.sh
+@@ -14,7 +14,8 @@
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+-FWTEST_RUN_TEST_SH=
++MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
++
+ FWTEST_DRIVER_BIN=
+ FWTEST_REPORT_DIR=
+ FWTEST_TAP_NAME=
+@@ -25,7 +26,6 @@ function Usage()
+ {
+ echo "Usage Example:"
+ echo "./$0 \\"
+- echo " --runtestsh=tests/scripts/framework/run_test.sh \\ # Test runner script path"
+ echo " --driverbin=Product/out/bin/tflite_run \\ # Test driver path"
+ echo " --frameworktest_list_file=tests/scripts/list/frameworktest_list.armv7l.cpu.txt \\"
+ echo " --reportdir=report \\ # Directory for the report files will be saved"
+@@ -42,9 +42,6 @@ do
+ -h|--help|help)
+ Usage
+ ;;
+- --runtestsh=*)
+- FWTEST_RUN_TEST_SH=${i#*=}
+- ;;
+ --driverbin=*)
+ FWTEST_DRIVER_BIN=${i#*=}
+ ;;
+@@ -67,7 +64,6 @@ do
+ shift
+ done
+
+-[ ! -z "$FWTEST_RUN_TEST_SH" ] || Usage
+ [ ! -z "$FWTEST_DRIVER_BIN" ] || Usage
+ [ ! -z "$FWTEST_REPORT_DIR" ] || Usage
+ [ ! -z "$FWTEST_TAP_NAME" ] || Usage
+@@ -86,7 +82,7 @@ if [ ! -z "$FRAMEWORKTEST_LIST_FILE" ]; then
+ MODELLIST=$(cat "${FRAMEWORKTEST_LIST_FILE}")
+ fi
+
+-$FWTEST_RUN_TEST_SH --driverbin=$FWTEST_DRIVER_BIN \
++$MY_PATH/framework/run_test.sh --driverbin=$FWTEST_DRIVER_BIN \
+ --reportdir=$FWTEST_REPORT_DIR \
+ --tapname=$FWTEST_TAP_NAME \
+ ${MODELLIST:-} \
+diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt
+index 0e333a0..ec45db4 100644
+--- a/tests/tools/nnpackage_run/CMakeLists.txt
++++ b/tests/tools/nnpackage_run/CMakeLists.txt
+@@ -33,7 +33,7 @@ target_include_directories(nnpackage_run PRIVATE src)
+ target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS})
+
+ target_link_libraries(nnpackage_run onert_core onert tflite_loader)
+-target_link_libraries(nnpackage_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite jsoncpp)
++target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp)
+ target_link_libraries(nnpackage_run nnfw-dev)
+ target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
+ target_link_libraries(nnpackage_run nnfw_lib_benchmark)
+diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc
+index 0dbcafc..cb4a7db 100644
+--- a/tests/tools/nnpackage_run/src/args.cc
++++ b/tests/tools/nnpackage_run/src/args.cc
+@@ -16,6 +16,7 @@
+
+ #include "args.h"
+
++#include <functional>
+ #include <iostream>
+ #include <json/json.h>
+
+@@ -105,6 +106,75 @@ Args::Args(const int argc, char **argv)
+
+ void Args::Initialize(void)
+ {
++ auto process_nnpackage = [&](const std::string &package_filename) {
++ _package_filename = package_filename;
++
++ std::cerr << "Package Filename " << _package_filename << std::endl;
++ if (_package_filename.empty())
++ {
++ // TODO Print usage instead of the below message
++ std::cerr << "Please specify nnpackage file. Run with `--help` for usage."
++ << "\n";
++
++ exit(1);
++ }
++ else
++ {
++ if (access(_package_filename.c_str(), F_OK) == -1)
++ {
++ std::cerr << "nnpackage not found: " << _package_filename << "\n";
++ }
++ }
++ };
++
++ auto process_output_sizes = [&](const std::string &output_sizes_json_str) {
++ Json::Value root;
++ Json::Reader reader;
++ if (!reader.parse(output_sizes_json_str, root, false))
++ {
++ std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n";
++ exit(1);
++ }
++
++ auto arg_map = argArrayToMap(root);
++ for (auto &pair : arg_map)
++ {
++ uint32_t key = pair.first;
++ Json::Value &val_json = pair.second;
++ if (!val_json.isUInt())
++ {
++ std::cerr << "All the values in `output_sizes` must be unsigned integers\n";
++ exit(1);
++ }
++ uint32_t val = val_json.asUInt();
++ _output_sizes[key] = val;
++ }
++ };
++
++ auto process_shape_prepare = [&](const std::string &shape_str) {
++ try
++ {
++ handleShapeParam(_shape_prepare, shape_str);
++ }
++ catch (const std::exception &e)
++ {
++ std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl;
++ exit(1);
++ }
++ };
++
++ auto process_shape_run = [&](const std::string &shape_str) {
++ try
++ {
++ handleShapeParam(_shape_run, shape_str);
++ }
++ catch (const std::exception &e)
++ {
++ std::cerr << "error with '--shape_run' option: " << shape_str << std::endl;
++ exit(1);
++ }
++ };
++
+ // General options
+ po::options_description general("General options", 100);
+
+@@ -112,32 +182,33 @@ void Args::Initialize(void)
+ general.add_options()
+ ("help,h", "Print available options")
+ ("version", "Print version and exit immediately")
+- ("nnpackage", po::value<std::string>()->required())
++ ("nnpackage", po::value<std::string>()->required()->notifier(process_nnpackage))
+ #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
+- ("dump,d", po::value<std::string>()->default_value(""), "Output filename")
+- ("load,l", po::value<std::string>()->default_value(""), "Input filename")
++ ("dump,d", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename")
++ ("load,l", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _load_filename = v; }), "Input filename")
+ #endif
+- ("output_sizes", po::value<std::string>(),
++ ("output_sizes", po::value<std::string>()->notifier(process_output_sizes),
+ "The output buffer size in JSON 1D array\n"
+ "If not given, the model's output sizes are used\n"
+ "e.g. '[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd tensor to 80.\n")
+- ("num_runs,r", po::value<int>()->default_value(1), "The number of runs")
+- ("warmup_runs,w", po::value<int>()->default_value(0), "The number of warmup runs")
+- ("run_delay,t", po::value<int>()->default_value(-1), "Delay time(ms) between runs (as default no delay")
+- ("gpumem_poll,g", po::value<bool>()->default_value(false), "Check gpu memory polling separately")
+- ("mem_poll,m", po::value<bool>()->default_value(false), "Check memory polling")
+- ("write_report,p", po::value<bool>()->default_value(false),
++ ("num_runs,r", po::value<int>()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs")
++ ("warmup_runs,w", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs")
++ ("run_delay,t", po::value<int>()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay")
++ ("gpumem_poll,g", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately")
++ ("mem_poll,m", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _mem_poll = v; }), "Check memory polling")
++ ("write_report,p", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }),
+ "Write report\n"
+ "{exec}-{nnpkg}-{backend}.csv will be generated.\n"
+ "e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n"
+ "{nnpkg} name may be changed to realpath if you use symbolic-link.")
+- ("shape_prepare", po::value<std::string>()->default_value("[]"),
++ ("shape_prepare", po::value<std::string>()->default_value("[]")->notifier(process_shape_prepare),
+ "set shape of specified tensor before compilation\n"
+ "e.g. '[0, [1, 2], 2, []]' to set 0th tensor to [1, 2] and 2nd tensor to [].\n")
+- ("shape_run", po::value<std::string>()->default_value("[]"),
++ ("shape_run", po::value<std::string>()->default_value("[]")->notifier(process_shape_run),
+ "set shape of specified tensor right before running\n"
+ "e.g. '[1, [1, 2]]` to set 1st tensor to [1, 2].\n")
+- ("verbose_level,v", po::value<int>()->default_value(0), "Verbose level\n"
++ ("verbose_level,v", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }),
++ "Verbose level\n"
+ "0: prints the only result. Messages btw run don't print\n"
+ "1: prints result and message btw run\n"
+ "2: prints all of messages to print\n")
+@@ -180,158 +251,23 @@ void Args::Parse(const int argc, char **argv)
+ return;
+ }
+
+- po::notify(vm);
+ try
+ {
+-#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
+- if (vm.count("dump"))
+- {
+- _dump_filename = vm["dump"].as<std::string>();
+- }
+-
+- if (vm.count("load"))
+- {
+- _load_filename = vm["load"].as<std::string>();
+- }
+-#endif
+-
+- if (vm.count("nnpackage"))
+- {
+- _package_filename = vm["nnpackage"].as<std::string>();
+-
+- if (_package_filename.empty())
+- {
+- // TODO Print usage instead of the below message
+- std::cerr << "Please specify nnpackage file. Run with `--help` for usage."
+- << "\n";
+-
+- exit(1);
+- }
+- else
+- {
+- if (access(_package_filename.c_str(), F_OK) == -1)
+- {
+- std::cerr << "nnpackage not found: " << _package_filename << "\n";
+- }
+- }
+- }
+-
+- if (vm.count("output_sizes"))
+- {
+- auto output_sizes_json_str = vm["output_sizes"].as<std::string>();
+-
+- Json::Value root;
+- Json::Reader reader;
+- if (!reader.parse(output_sizes_json_str, root, false))
+- {
+- std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n";
+- exit(1);
+- }
+-
+- auto arg_map = argArrayToMap(root);
+- for (auto &pair : arg_map)
+- {
+- uint32_t key = pair.first;
+- Json::Value &val_json = pair.second;
+- if (!val_json.isUInt())
+- {
+- std::cerr << "All the values in `output_sizes` must be unsigned integers\n";
+- exit(1);
+- }
+- uint32_t val = val_json.asUInt();
+- _output_sizes[key] = val;
+- }
+- }
+-
+- if (vm.count("num_runs"))
+- {
+- _num_runs = vm["num_runs"].as<int>();
+- }
+-
+- if (vm.count("warmup_runs"))
+- {
+- _warmup_runs = vm["warmup_runs"].as<int>();
+- }
+-
+- if (vm.count("run_delay"))
+- {
+- _run_delay = vm["run_delay"].as<int>();
+- }
+-
+- if (vm.count("gpumem_poll"))
+- {
+- _gpumem_poll = vm["gpumem_poll"].as<bool>();
+- }
+-
+- if (vm.count("mem_poll"))
+- {
+- _mem_poll = vm["mem_poll"].as<bool>();
+- // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP
+- if (_mem_poll && _warmup_runs == 0)
+- {
+- _warmup_runs = 1;
+- }
+- }
+-
+- if (vm.count("write_report"))
+- {
+- _write_report = vm["write_report"].as<bool>();
+- }
+-
+- if (vm.count("verbose_level"))
+- {
+- _verbose_level = vm["verbose_level"].as<int>();
+- }
++ po::notify(vm);
+ }
+ catch (const std::bad_cast &e)
+ {
+- std::cerr << "error by bad cast" << e.what() << '\n';
++ std::cerr << "Bad cast error - " << e.what() << '\n';
+ exit(1);
+ }
+
+- if (vm.count("shape_prepare"))
+- {
+- std::string shape_str;
+- try
+- {
+- shape_str = vm["shape_prepare"].as<std::string>();
+- }
+- catch (const std::bad_cast &e)
+- {
+- std::cerr << "error by bad cast with '--shape_prepare' option" << e.what() << '\n';
+- exit(1);
+- }
+- try
+- {
+- handleShapeParam(_shape_prepare, shape_str);
+- }
+- catch (const std::exception &e)
+- {
+- std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl;
+- exit(1);
+- }
+- }
+-
+- if (vm.count("shape_run"))
++ // This must be run after `notify` as `_warmup_runs` must have been processed before.
++ if (vm.count("mem_poll"))
+ {
+- std::string shape_str;
+- try
+- {
+- shape_str = vm["shape_run"].as<std::string>();
+- }
+- catch (const std::bad_cast &e)
++ // To avoid overhead, memory polling runs on WARMUP instead of EXECUTE
++ if (_mem_poll && _warmup_runs == 0)
+ {
+- std::cerr << "error by bad cast with '--shape_run' option" << e.what() << '\n';
+- exit(1);
+- }
+- try
+- {
+- handleShapeParam(_shape_run, shape_str);
+- }
+- catch (const std::exception &e)
+- {
+- std::cerr << "error with '--shape_run' option: " << shape_str << std::endl;
+- exit(1);
++ _warmup_runs = 1;
+ }
+ }
+ }
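The `args.cc` rewrite above moves per-option handling into `notifier` callbacks, so a single `po::notify(vm)` both validates `required()` options and assigns the parsed values, replacing the long chain of `vm.count()` / `vm["..."].as<T>()` blocks. A reduced, self-contained sketch of that boost::program_options pattern (only two of the real options are reproduced here) might look like this:

```cpp
#include <iostream>
#include <string>
#include <boost/program_options.hpp>

namespace po = boost::program_options;

int main(int argc, char **argv)
{
  int num_runs = 1;
  std::string package;

  po::options_description desc("Options");
  desc.add_options()
    ("help,h", "Print available options")
    // The notifier runs inside po::notify(), so the value lands in the target
    // variable without a separate vm.count()/vm["..."].as<T>() block.
    ("nnpackage", po::value<std::string>()->required()->notifier(
                      [&](const std::string &v) { package = v; }))
    ("num_runs,r", po::value<int>()->default_value(1)->notifier(
                       [&](int v) { num_runs = v; }));

  po::variables_map vm;
  try
  {
    po::store(po::parse_command_line(argc, argv, desc), vm);
    if (vm.count("help"))
    {
      std::cout << desc << "\n";
      return 0;
    }
    po::notify(vm); // required() checks and all notifiers fire here
  }
  catch (const std::exception &e)
  {
    std::cerr << e.what() << "\n";
    return 1;
  }

  std::cout << "package=" << package << " num_runs=" << num_runs << "\n";
  return 0;
}
```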
+diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc
+index 34c075c..09ace47 100644
+--- a/tests/tools/nnpackage_run/src/h5formatter.cc
++++ b/tests/tools/nnpackage_run/src/h5formatter.cc
+@@ -145,6 +145,7 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector<Allocatio
+ data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT64);
+ break;
+ }
++ case NNFW_TYPE_TENSOR_UINT8:
+ case NNFW_TYPE_TENSOR_QUANT8_ASYMM:
+ {
+ H5::DataSet data_set =
+@@ -159,13 +160,6 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector<Allocatio
+ data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT8);
+ break;
+ }
+- case NNFW_TYPE_TENSOR_UINT8:
+- {
+- H5::DataSet data_set =
+- value_group.createDataSet(std::to_string(i), H5::PredType::STD_U8BE, data_space);
+- data_set.write(outputs[i].data(), H5::PredType::NATIVE_UINT8);
+- break;
+- }
+ default:
+ throw std::runtime_error("nnpkg_run can dump f32, i32, qasymm8, bool and uint8.");
+ }
+diff --git a/tests/tools/tflite_loader/CMakeLists.txt b/tests/tools/tflite_loader/CMakeLists.txt
+index 5a9e3a8..0fe1c69 100644
+--- a/tests/tools/tflite_loader/CMakeLists.txt
++++ b/tests/tools/tflite_loader/CMakeLists.txt
+@@ -17,7 +17,7 @@ add_executable(tflite_loader_test_tool ${SOURCES})
+ target_include_directories(tflite_loader_test_tool PRIVATE ${Boost_INCLUDE_DIRS})
+
+ target_link_libraries(tflite_loader_test_tool onert_core onert tflite_loader)
+-target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_misc)
++target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite nnfw_lib_misc)
+ target_link_libraries(tflite_loader_test_tool ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY})
+
+ install(TARGETS tflite_loader_test_tool DESTINATION bin)
+diff --git a/tests/tools/tflite_run/CMakeLists.txt b/tests/tools/tflite_run/CMakeLists.txt
+index 19e7126..3f30d3e 100644
+--- a/tests/tools/tflite_run/CMakeLists.txt
++++ b/tests/tools/tflite_run/CMakeLists.txt
+@@ -13,7 +13,7 @@ add_executable(tflite_run ${TFLITE_RUN_SRCS})
+ target_include_directories(tflite_run PRIVATE src)
+ target_include_directories(tflite_run PRIVATE ${Boost_INCLUDE_DIRS})
+
+-target_link_libraries(tflite_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite)
++target_link_libraries(tflite_run nnfw_lib_tflite)
+ target_link_libraries(tflite_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
+
+ target_link_libraries(tflite_run nnfw_lib_benchmark)
+diff --git a/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh b/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
+index cf3e544..bbc5b3e 100755
+--- a/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
++++ b/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
+@@ -62,6 +62,7 @@ tflite
+ "
+
+ model_type=""
++tf_intf_version=""
+
+ for ext in $supported_model_types; do
+ [ -e "$indir/$tcname"."$ext" ] && model_type=$ext
+@@ -73,7 +74,9 @@ if [[ "$model_type" == "" ]]; then
+ fi
+
+ if [[ "$model_type" == "pb" ]]; then
+- $tf2nnpkg --info "$indir/$tcname".info --graphdef "$indir/$tcname"."$model_type" -o "$outdir"
++ [ -f "$indir/$tcname"."v2" ] && tf_intf_version="--v2"
++ $tf2nnpkg --info "$indir/$tcname".info --graphdef "$indir/$tcname"."$model_type" \
++ "$tf_intf_version" -o "$outdir"
+ else
+ $model2nnpkg -o "$outdir" "$indir/$tcname"."$model_type"
+ fi
+diff --git a/tools/tflitefile_tool/select_operator.py b/tools/tflitefile_tool/select_operator.py
+index 1ad44a3..333ca32 100755
+--- a/tools/tflitefile_tool/select_operator.py
++++ b/tools/tflitefile_tool/select_operator.py
+@@ -1,4 +1,4 @@
+-#!/usr/bin/python
++#!/usr/bin/env python
+
+ # Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+@@ -1180,23 +1180,6 @@ def GenerateModel(args, new_builder, sample_model, operator_list, new_input_tens
+ return tflite.Model.ModelEnd(new_builder)
+
+
+-def Finish(new_builder, new_model):
+- # Cusrom implementation: identifier
+- # Python API don't support identifier input yet
+- # Reference: Finish(self, rootTable)) in builder.py, Finish(uoffset_t root, const char *file_identifier, bool size_prefix) in flatbuffers.h
+- new_builder.Prep(new_builder.minalign,
+- flatbuffers.number_types.UOffsetTFlags.bytewidth)
+-
+- new_builder.PrependByte(0x33)
+- new_builder.PrependByte(0x4c)
+- new_builder.PrependByte(0x46)
+- new_builder.PrependByte(0x54)
+-
+- new_builder.PrependUOffsetTRelative(new_model)
+- new_builder.finished = True
+- return new_builder.Head()
+-
+-
+ def main(args):
+ input_model_file = args.input_model
+ oplist_file = args.opcode_list
+@@ -1343,7 +1326,7 @@ def main(args):
+ new_input_tensors, new_output_tensors, used_tensors_dic,
+ used_buffers_dic, used_opcodes_dic, used_subgraphs_dic)
+
+- Finish(new_builder, new_model)
++ new_builder.Finish(new_model, file_identifier=b'TFL3')
+ new_buf = new_builder.Output()
+
+ output_model_file.write(new_buf)
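The Python change above drops the hand-rolled `Finish()` helper, which prepended the `TFL3` file identifier byte by byte, in favor of the flatbuffers builder's built-in `Finish(root, file_identifier=...)`. For reference, a small C++ sketch of the same idea, assuming a flatc-generated TFLite schema header is available (the include name is a placeholder):

```cpp
#include <flatbuffers/flatbuffers.h>
#include "schema_generated.h" // placeholder: flatc output for the TFLite schema

// Finish() writes the root offset and the 4-byte "TFL3" file identifier,
// so no manual byte prepending is needed.
void finish_tflite_model(flatbuffers::FlatBufferBuilder &builder,
                         flatbuffers::Offset<tflite::Model> model)
{
  builder.Finish(model, "TFL3");
  // builder.GetBufferPointer() / builder.GetSize() now describe the buffer.
}

// A reader can verify the identifier before parsing.
bool looks_like_tflite(const void *buf)
{
  return flatbuffers::BufferHasIdentifier(buf, "TFL3");
}
```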
+diff --git a/tools/tflkit/README.md b/tools/tflkit/README.md
+index a0c40c6..9e18834 100644
+--- a/tools/tflkit/README.md
++++ b/tools/tflkit/README.md
+@@ -1,4 +1,4 @@
+-# tflkit
++# tflkit
+
+ ## Purpose
+
+@@ -114,11 +114,11 @@ Number of all operators : 126 (total instrs: 11,484,469
+
+ ### TensorFlow
+
+-TensorFlow provides some kinds of converting guideline. In Python, the [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/contrib/lite/TFLiteConverter) class will help you to convert a TensorFlow GraphDef or SavedModel into `output_format` using TOCO. The `output_format` can be `TFLITE` or `GRAPHVIZ_DOT` format. The default `output_format` is `TFLITE`. And there is a Python command line interface for running TOCO, and its name is [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py). This converts a TensorFlow GraphDef or SavedModel into `TFLITE` or `GRAPHVIZ_DOT` format like [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/contrib/lite/TFLiteConverter). These two way also supports to convert a TensorFlow Keras model into `output_format`. Both functions are implemented using a tool called TOCO.
++TensorFlow provides several conversion guides. In Python, the [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/lite/TFLiteConverter) class helps you convert a TensorFlow GraphDef or SavedModel into `output_format` using TOCO. The `output_format` can be `TFLITE` or `GRAPHVIZ_DOT`; the default is `TFLITE`. There is also a Python command line interface for running TOCO, [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py), which converts a TensorFlow GraphDef or SavedModel into `TFLITE` or `GRAPHVIZ_DOT` format like [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/lite/TFLiteConverter). Both ways also support converting a TensorFlow Keras model into `output_format`, and both are implemented using a tool called TOCO.
+
+ ### with tflkit
+
+-The tflkit uses the [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py) python command line interface to convert a TensorFlow model into TfLite model. It only supports to convert a TensorFlow GraphDef file into `TFLITE` format file. This tool supports the creation of individual `TFLITE` files for different input shapes. When converting to multiple `TFLITE` files, it needs to put a string called `NAME` in `TFLITE_PATH`. The string `NAME` will be replaced by what is listed in teh `NAME` environment. This tool requires an information file as a parameter. There is an [example file](info/convert.template) for a convert information. The `--tensorflow_path` and `--tensorflow_version` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
++The tflkit uses the [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py) Python command line interface to convert a TensorFlow model into a TFLite model. It only supports converting a TensorFlow GraphDef file into a `TFLITE` format file. This tool supports the creation of individual `TFLITE` files for different input shapes. When converting to multiple `TFLITE` files, a string called `NAME` needs to be put in `TFLITE_PATH`; the string `NAME` will be replaced by what is listed in the `NAME` environment. This tool requires an information file as a parameter. There is an [example file](convert.template) for the convert information. The `--tensorflow_path` and `--tensorflow_version` options can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
+
+ Convert information:
+ * GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
+@@ -176,7 +176,7 @@ The input and output file of this tool is a TensorFlow GraphDef file.
+
+ ### with tflkit
+
+-The [optimize_for_inference.sh](optimize_for_inference.sh) file invokes the TensorFlow [optimize tool](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference.py). This tool requires a optimize information file as a parameter. Here is an [example file](info/optimize.template) for this tool. The information file needs `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
++The [optimize_for_inference.sh](optimize_for_inference.sh) file invokes the TensorFlow [optimize tool](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference.py). This tool requires an optimize information file as a parameter. Here is an [example file](optimize.template) for this tool. The information file needs the `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
+
+ Optimize information:
+ * GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
+@@ -207,7 +207,7 @@ The trained TensorFlow model can be trasformed by some variants to deploy it in
+
+ ### with tflkit
+
+-The [transform_graph.sh](transform_graph.sh) file supports to transform a TensorFlow GraphDef using various transform options. This tool requires a transform information file as a parameter and the transform options are described in the information file. There is an [example file](info/transform.template) for this tool. The information file needs `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
++The [transform_graph.sh](transform_graph.sh) file supports transforming a TensorFlow GraphDef using various transform options. This tool requires a transform information file as a parameter, and the transform options are described in that information file. There is an [example file](transform.template) for this tool. The information file needs the `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
+
+ Transform information:
+ * GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
+@@ -270,7 +270,7 @@ The [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorfl
+
+ ### with tflkit
+
+-The tflkit provides the simple way to create a frozen graph using [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py) tool. This tool requires an information file as a parameter. There is an [example file](info/freeze.info) for a freeze tool. Either `SAVED_MODEL` or `META_GRAPH` must be declared. And `META_GRAPH` is always used with `CKPT_PATH`. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
++The tflkit provides a simple way to create a frozen graph using the [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py) tool. This tool requires an information file as a parameter. There is an [example file](freeze.template) for the freeze tool. Either `SAVED_MODEL` or `META_GRAPH` must be declared, and `META_GRAPH` is always used with `CKPT_PATH`. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
+
+ Freeze information:
+ * SAVED_MODEL : Full directory path with TensorFlow `SavedModel` file and variables.
+diff --git a/tools/update_version/update-version b/tools/update_version/update-version
+index 4169327..1b77c10 100644
+--- a/tools/update_version/update-version
++++ b/tools/update_version/update-version
+@@ -40,11 +40,12 @@ fi
+
+ version=$1
+
+-sed -i "s/^release = .*/release = \'$version\'/" ${nnfw_root}/docs/conf.py
+-sed -i "s/^Version: .*/Version: $version/" ${nnfw_root}/packaging/nnfw.spec
++perl -pi -e "s/^release = .*/release = \'$version\'/" ${nnfw_root}/docs/conf.py
+
+-IFS=. read M m p <<< $version
++perl -pi -e "s/^Version: .*/Version: $version/" ${nnfw_root}/packaging/nnfw.spec
++
++IFS=. read M m p <<< "$version"
+ hex=$(printf '0x%08x' $(( (($M << 24)) | (($m << 8)) | $p )))
+-sed -i "s/^#define NNFW_VERSION.*/#define NNFW_VERSION $hex/" ${nnfw_root}/runtime/onert/api/include/nnfw_version.h
++perl -pi -e "s/^#define NNFW_VERSION.*/#define NNFW_VERSION $hex/" ${nnfw_root}/runtime/onert/api/include/nnfw_version.h
+
+-sed -i "s/versionName .*$/versionName \"$version\"/" ${nnfw_root}/runtime/contrib/android/api/build.gradle
++perl -pi -e "s/versionName .*$/versionName \"$version\"/" ${nnfw_root}/runtime/contrib/android/api/build.gradle
diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec
index ce1cd0b92..e26ffcb9b 100644
--- a/packaging/nnfw.spec
+++ b/packaging/nnfw.spec
@@ -30,7 +30,7 @@ BuildRequires: flatbuffers-devel
%ifarch %{arm} aarch64
# Require python for acl-ex library build pre-process
BuildRequires: python
-BuildRequires: libarmcl-devel
+BuildRequires: libarmcl-devel >= v20.05
%endif
Requires(post): /sbin/ldconfig
diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe
new file mode 100644
index 000000000..7322e90a4
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe
@@ -0,0 +1,26 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 1 dim: 8 dim: 8 dim: 1 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 7 dim: 7 dim: 1 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operation {
+ type: "AveragePool2D"
+ averagepool2d_options {
+ padding: VALID
+ stride_w: 1
+ stride_h: 1
+ filter_width: 2
+ filter_height: 2
+ }
+ input: "ifm"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe
new file mode 100644
index 000000000..a09afc1de
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe
@@ -0,0 +1,44 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 4 dim: 5 dim: 5 }
+}
+operand {
+ name: "ker"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 2 dim: 25 }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 25 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 2 dim: 25 }
+}
+operation {
+ type: "DepthwiseConv2D"
+ version: 2
+ depthwiseconv2d_options {
+ padding: VALID
+ stride_w: 2
+ stride_h: 2
+ dilation_w_factor: 2
+ dilation_h_factor: 1
+ depth_multiplier: 5
+ activation : RELU6
+ }
+ input: "ifm"
+ input: "ker"
+ input: "bias"
+ output: "ofm"
+}
+input: "ifm"
+input: "ker"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule
new file mode 100644
index 000000000..edfabc64e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule
@@ -0,0 +1,3 @@
+# To check if DEPTHWISE_CONV_2D version is 2
+
+RULE "OP_VERSION_CHECK" $(op_version DEPTHWISE_CONV_2D) '=' 2
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe
new file mode 100644
index 000000000..5e0b6b543
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe
@@ -0,0 +1,61 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 1 dim: 112 dim: 112 dim: 4 }
+ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 }
+}
+operand {
+ name: "ker"
+ type: UINT8
+ shape { dim: 1 dim: 3 dim: 3 dim: 4 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+ quant {
+ min: -30.3175 min: -0.779597 min: -10.2751 min: -10.8594
+ max: 4.35049 max: 2.70807 max: 11.0269 max: 20.97
+ scale:0.135953 scale: 0.0136771 scale: 0.0835375 scale: 0.124821
+ zero_point:223 zero_point: 57 zero_point: 123 zero_point: 87
+ quantized_dimension: 3
+ }
+}
+operand {
+ name: "bias"
+ type: INT32
+ shape { dim: 4 }
+ filler {
+ tag: "gaussian"
+ arg: "0"
+ arg: "1.0"
+ }
+ quant {
+ scale: 1.4758e-16 scale: 3.15185e-05 scale: 2.20685e-05 scale: 1.72205e-16
+ zero_point: 0 zero_point: 0 zero_point: 0 zero_point: 0
+ }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 112 dim: 112 dim: 4 }
+ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 }
+
+}
+operation {
+ type: "DepthwiseConv2D"
+ depthwiseconv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ depth_multiplier: 1
+ activation : RELU6
+ }
+ input: "ifm"
+ input: "ker"
+ input: "bias"
+ output: "ofm"
+}
+input: "ifm"
+input: "ker"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe
new file mode 100644
index 000000000..3fff5cd6d
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe
@@ -0,0 +1,22 @@
+operand {
+ name: "ifm1"
+ type: UINT8
+ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128}
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128}
+}
+operation {
+ type: "L2Normalize"
+ l2norm_options {
+ activation: NONE
+ }
+ input: "ifm1"
+ output: "ofm"
+}
+input: "ifm1"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe
new file mode 100644
index 000000000..7b2a84de7
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe
@@ -0,0 +1,19 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 }
+}
+operation {
+ type: "Logistic"
+ input: "ifm"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
index 79271a45f..1313e2683 100644
--- a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
+++ b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
@@ -10,7 +10,7 @@ operand {
operand {
name: "ker"
type: FLOAT32
- shape { dim: 1 dim: 3 dim: 3 dim: 1 }
+ shape { dim: 3 dim: 1 dim: 1 dim: 3 }
filler {
tag: "gaussian"
arg: "0.0"
diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
new file mode 100644
index 000000000..887380c48
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 4 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT32
+ shape { dim: 4 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT32
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
new file mode 100644
index 000000000..9beb51690
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 4 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT64
+ shape { dim: 4 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT64
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
new file mode 100644
index 000000000..67b947ff8
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: INT32
+ shape { dim: 5 }
+}
+operand {
+ name: "ofm"
+ type: INT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT32
+ shape { dim: 5 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT32
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.reverse b/res/TensorFlowLiteRecipes/Unique_002/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_002/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
new file mode 100644
index 000000000..375db66e8
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: INT32
+ shape { dim: 5 }
+}
+operand {
+ name: "ofm"
+ type: INT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT64
+ shape { dim: 5 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT64
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.reverse b/res/TensorFlowLiteRecipes/Unique_003/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_003/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
new file mode 100644
index 000000000..d3985e401
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
@@ -0,0 +1,28 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 4 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT32
+ shape { dim: 4 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT32
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
new file mode 100644
index 000000000..b08dd85cc
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
@@ -0,0 +1,28 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 5 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT64
+ shape { dim: 5 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT64
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse
diff --git a/runtime/libs/benchmark/CMakeLists.txt b/runtime/libs/benchmark/CMakeLists.txt
index 2af0ffaa3..748b2d13f 100644
--- a/runtime/libs/benchmark/CMakeLists.txt
+++ b/runtime/libs/benchmark/CMakeLists.txt
@@ -1,6 +1,5 @@
file(GLOB_RECURSE SOURCES "src/*.cpp")
-add_library(nnfw_lib_benchmark SHARED ${SOURCES})
+add_library(nnfw_lib_benchmark STATIC ${SOURCES})
target_include_directories(nnfw_lib_benchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(nnfw_lib_benchmark PRIVATE ${LIB_PTHREAD})
-install(TARGETS nnfw_lib_benchmark DESTINATION lib)
diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp
index 7a3f9a572..df573da92 100644
--- a/runtime/libs/benchmark/src/Result.cpp
+++ b/runtime/libs/benchmark/src/Result.cpp
@@ -166,7 +166,7 @@ Result::Result(const Phases &phases)
if (option.memory)
{
print_memory = true;
- for (int i = PhaseEnum::MODEL_LOAD; i <= PhaseEnum::EXECUTE; ++i)
+ for (int i = PhaseEnum::MODEL_LOAD; i < PhaseEnum::EXECUTE; ++i)
{
auto phase = phases.at(gPhaseStrings[i]);
for (int j = MemoryType::RSS; j <= MemoryType::PSS; ++j)
diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h
index 031aabd51..03a3aed07 100644
--- a/runtime/onert/api/include/nnfw.h
+++ b/runtime/onert/api/include/nnfw.h
@@ -99,6 +99,8 @@ typedef enum {
NNFW_STATUS_ERROR = 1,
/** Unexpected null argument is given. */
NNFW_STATUS_UNEXPECTED_NULL = 2,
+  /** A function was called in a session state for which it is not valid. */
+ NNFW_STATUS_INVALID_STATE = 3,
} NNFW_STATUS;
/**
@@ -432,10 +434,10 @@ NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index,
*
* <p>Supported backends differs on each platforms.
* For example, `x86_64` supports "cpu" only.
- * Can set multiple backends by semicolon (ex: "acl_cl;cpu").
- * Among the multiple backends, the 1st element is used as default backend.</p>
- *
- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon", "srcn"
+ * Multiple backends can be set and they must be separated by a semicolon (ex: "acl_cl;cpu").
+ * For each backend string, `libbackend_{backend}.so` will be dynamically loaded during
+ * {@link nnfw_prepare}.
+ * Among the multiple backends, the 1st element is used as the default backend.</p>
*
* @param[in] session session to which avilable backends are set
* @param[in] backends available backends on which nnfw uses
@@ -449,12 +451,10 @@ NNFW_STATUS nnfw_set_available_backends(nnfw_session *session, const char *backe
*
* This function should be called before {@link nnfw_prepare} is invoked.
*
- * <p>Supported backends differs on each platforms.
- * For example, `x86_64` supports "cpu" only.
- * The backend for op has higher priority than available backends specified by
- * nnfw_set_available_backends.</p>
+ * <p>The backend set for an op has higher priority than the available backends specified by
+ * {@link nnfw_set_available_backends}.</p>
*
- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon"
+ * @deprecated Deprecated since 1.8.0.
*
* @param[in] session session to be modified
* @param[in] op operation to be set
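Per the revised documentation above, several backends can be passed to `nnfw_set_available_backends` as one semicolon-separated string: the first entry is used as the default backend, and each entry causes `libbackend_{backend}.so` to be loaded dynamically during `nnfw_prepare`. A minimal sketch (the backend names here are illustrative and platform-dependent):

```cpp
#include "nnfw.h"

// Call after the model is loaded and before nnfw_prepare();
// otherwise NNFW_STATUS_INVALID_STATE is returned.
NNFW_STATUS select_backends(nnfw_session *session)
{
  // "acl_cl" becomes the default backend; "cpu" stays available as well.
  // Each name resolves to libbackend_<name>.so at prepare time.
  return nnfw_set_available_backends(session, "acl_cl;cpu");
}
```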
diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc
index 074758374..34a46ed7e 100644
--- a/runtime/onert/api/src/nnfw_api.cc
+++ b/runtime/onert/api/src/nnfw_api.cc
@@ -31,6 +31,7 @@ STATIC_ASSERT_ENUM_CHECK(NNFW_TYPE_TENSOR_INT64, 5);
STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_NO_ERROR, 0);
STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_ERROR, 1);
STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_UNEXPECTED_NULL, 2);
+STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_INVALID_STATE, 3);
STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_NONE, 0);
STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_CHANNELS_LAST, 1);
diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc
index d03ddd427..b3390fa64 100644
--- a/runtime/onert/api/src/nnfw_api_internal.cc
+++ b/runtime/onert/api/src/nnfw_api_internal.cc
@@ -76,7 +76,7 @@ nnfw_session::~nnfw_session() = default;
NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir)
{
if (!isStateInitialized())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
if (!package_dir)
{
@@ -156,7 +156,7 @@ NNFW_STATUS nnfw_session::prepare()
std::cerr << "invalid state";
}
std::cerr << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase())
@@ -188,7 +188,7 @@ NNFW_STATUS nnfw_session::run()
{
std::cerr << "Error during nnfw_session::run : "
<< "run should be run after prepare" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
try
@@ -211,7 +211,7 @@ NNFW_STATUS nnfw_session::run_async()
{
std::cerr << "Error during nnfw_session::run_async : "
<< "run_async should be run after prepare" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
_execution->startExecute();
@@ -241,7 +241,7 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo
if (!isStatePreparedOrFinishedRun())
{
std::cerr << "Error during nnfw_session::set_input : invalid state" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (!buffer && length != 0)
@@ -270,7 +270,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b
if (!isStatePreparedOrFinishedRun())
{
std::cerr << "Error during nnfw_session::set_output : invalid state" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (!buffer && length != 0)
@@ -296,7 +296,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b
NNFW_STATUS nnfw_session::input_size(uint32_t *number)
{
if (isStateInitialized()) // Model is not loaded
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
@@ -318,7 +318,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number)
NNFW_STATUS nnfw_session::output_size(uint32_t *number)
{
if (isStateInitialized()) // Model is not loaded
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
@@ -410,7 +410,7 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti)
{
std::cerr << "Error during set_input_tensorinfo : should be run after load_model"
<< std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (ti.rank <= 0 || ti.rank > NNFW_MAX_RANK)
@@ -463,6 +463,9 @@ NNFW_STATUS nnfw_session::set_input_tensorinfo(uint32_t index, const nnfw_tensor
NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
{
+ if (isStateInitialized())
+ return NNFW_STATUS_INVALID_STATE;
+
try
{
if (ti == nullptr)
@@ -499,7 +502,7 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
{
if (isStateInitialized())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
if (ti == nullptr)
{
@@ -570,7 +573,7 @@ static std::string get_op_backend_string(std::string op)
NNFW_STATUS nnfw_session::set_available_backends(const char *backends)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
@@ -596,7 +599,7 @@ NNFW_STATUS nnfw_session::set_available_backends(const char *backends)
NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
@@ -627,7 +630,7 @@ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend)
NNFW_STATUS nnfw_session::set_config(const char *key, const char *value)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
auto &options = _compiler->options();
@@ -693,7 +696,7 @@ onert::ir::Graph *nnfw_session::primary_subgraph()
NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
auto &options = _compiler->options();
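Note: the hunks above replace the blanket NNFW_STATUS_ERROR with NNFW_STATUS_INVALID_STATE whenever an API is called in the wrong session state, so callers can separate call-order bugs from genuine runtime failures. A sketch assuming the C wrappers return the nnfw_session status codes unchanged:

// Sketch: telling a call-order mistake apart from a runtime failure.
#include <nnfw.h>
#include <cstdio>

void run_without_prepare(nnfw_session *session)
{
  NNFW_STATUS status = nnfw_run(session); // invoked before nnfw_prepare()
  switch (status)
  {
    case NNFW_STATUS_NO_ERROR:
      break;
    case NNFW_STATUS_INVALID_STATE:
      // API misuse: run() requires a successful prepare() first.
      std::fprintf(stderr, "call order bug: prepare the session before run\n");
      break;
    default:
      // Genuine failure (model, backend, allocation, ...).
      std::fprintf(stderr, "nnfw_run failed with status %d\n", static_cast<int>(status));
      break;
  }
}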
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc
index 3ca405899..4ab2d4ce8 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc
@@ -31,6 +31,7 @@
#include "exec/FunctionSequence.h"
#include "util/logging.h"
#include "util/Utils.h"
+#include "AclKernelGen.h"
namespace onert
{
@@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
const auto block_size_index{
node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
assert(_ctx.at(block_size_index).data());
auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>();
- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
- ? arm_compute::SubDataType::BOOL
- : arm_compute::SubDataType::NONE;
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::CLCast>();
+ std::unique_ptr<::arm_compute::IFunction> fn;
+ if (ifm_tensor->data_type() == ofm_tensor->data_type())
+ {
+ auto l = std::make_unique<::arm_compute::CLCopy>();
+
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);
+ fn = std::move(l);
+ }
+ else
+ {
+ auto l = std::make_unique<::arm_compute::CLCast>();
+
+ // TODO Support converting float to int32 as round down
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
+
+ fn = std::move(l);
+ }
auto acl_fn = asAclClFunction(std::move(fn));
@@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
ker_width, ker_height);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(),
+ ::arm_compute::Size2D(1U, 1U), act_info);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
- ofm_alloc->handle(), conv_info, multiplier, act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, multiplier, act_info);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -217,19 +231,20 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
- ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride)};
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh},
+ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)};
auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
@@ -260,19 +275,21 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
::arm_compute::PoolingLayerInfo info{
::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
+ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride),
+ true /* exclude_padding */};
auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Concat &node)
@@ -296,7 +313,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
return;
}
- auto output_alloc = _tensor_builder->at(ofm_index).get();
+ auto output_tensor = _tensor_builder->at(ofm_index).get();
std::vector<::arm_compute::ICLTensor *> input_tensors;
for (auto &ifm_ind : input_indexes)
input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());
@@ -305,7 +322,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
if (input_indexes.size() < 2)
{
auto l = std::make_unique<::arm_compute::CLCopy>();
- l->configure(input_tensors.at(0), output_alloc->handle());
+ l->configure(input_tensors.at(0), output_tensor->handle());
fn = std::move(l);
}
else
@@ -313,10 +330,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
auto l = std::make_unique<::arm_compute::CLConcatenateLayer>();
const auto rank = _ctx.at(ofm_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
const auto fixed_axis =
acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
- l->configure(input_tensors, output_alloc->handle(), fixed_axis);
+ l->configure(input_tensors, output_tensor->handle(), fixed_axis);
fn = std::move(l);
}
@@ -327,75 +344,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
- using ir::operation::FullyConnected;
-
const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
-
- const auto input_rank = _ctx.at(input_index).shape().rank();
-
- const auto output_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
- UNUSED_RELEASE(output_size);
- assert(_ctx.at(bias_index).shape().dim(0) == output_size);
- assert(_ctx.at(weight_index).shape().dim(0) == output_size);
- const auto batch_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
- const auto input_size =
- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);
-
- // Check for reshaping input's shape into rank-2
- bool needs_reshape = false;
- ir::Shape reshape(2);
- if (input_rank == 3 || input_rank == 4)
- {
- const auto &ifm_shape = _ctx.at(input_index).shape();
- auto feature_size = 1;
- for (int i = 0; i < ifm_shape.rank(); ++i)
- {
- feature_size *= ifm_shape.dim(i);
- }
-
- UNUSED_RELEASE(feature_size);
- assert(feature_size == batch_size * input_size);
-
- // for reshaping
- needs_reshape = true;
- reshape.dim(0) = batch_size; /* H */
- reshape.dim(1) = input_size; /* W */
- }
-
+ auto output_tensor = _tensor_builder->at(output_index).get();
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- const auto input_alloc = _tensor_builder->at(input_index).get();
- const auto weight_alloc = _tensor_builder->at(weight_index).get();
- const auto bias_alloc = _tensor_builder->at(bias_index).get();
- const auto frontend_layout = _current_op_seq_layout;
- const auto acl_layout = output_alloc->handle()->info()->data_layout();
-
- auto fn = std::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
-
- arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type =
- arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL;
- if (_ctx.at(weight_index).isConstant())
- {
- kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
- assert(_ctx.at(weight_index).data());
- }
- fn->configure(
- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
- needs_reshape,
- ::onert::backend::acl_common::asTensorShape(
- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
- kernel_type);
-
+ auto fn = acl_common::kernelGenFullyConnected<acl_common::AclClFunction, ::arm_compute::ICLTensor,
+ ::arm_compute::CLFullyConnectedReshapingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)),
- ActivationBuilder::generate(activation, output_alloc->handle()));
+ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Mul &node)
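Note: FullyConnected here (and LSTM further below) now delegate to templated helpers from AclKernelGen.h so acl_cl and acl_neon can share one kernel-generation body. The general shape of such a helper, sketched with placeholder names and template parameters rather than the actual AclKernelGen.h contents:

// Illustrative compile-only sketch, not the real AclKernelGen.h: the shared
// helper is a template over the wrapper function type, the tensor interface
// and the concrete ACL layer, so one body serves both CL and NEON backends.
#include <memory>
#include <utility>

template <typename WrappedFunction, typename TensorHandle, typename AclLayer, typename... Args>
std::unique_ptr<WrappedFunction> makeConfiguredLayer(TensorHandle *input, TensorHandle *output,
                                                     Args &&... args)
{
  auto layer = std::make_unique<AclLayer>();
  // Both backends expose the same configure() call shape on their layers.
  layer->configure(input, output, std::forward<Args>(args)...);
  return std::make_unique<WrappedFunction>(std::move(layer));
}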
@@ -406,17 +363,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Reduce &node)
@@ -427,14 +385,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto keep_dims{node.param().keep_dims};
const auto reduce_type = node.param().reduce_type;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// Convert to ACL axes taking into account negative values and possible duplicates.
const auto &axes = _ctx.at(axes_index);
const auto input_rank = _ctx.at(input_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = input_alloc->layout();
+ const auto backend_layout = input_tensor->layout();
std::unique_ptr<arm_compute::IFunction> fn;
if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
@@ -443,7 +401,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto acl_axes =
acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
- l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle());
+ l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle());
fn = std::move(l);
}
@@ -453,7 +411,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
- l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims,
+ l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims,
acl_common::convertReduceType(reduce_type));
fn = std::move(l);
@@ -469,13 +427,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// NOTE This operation must not be changed the layout from frontend to backend
// So, PermutationOperationPass makes layouts of frontend and backend the same.
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
frontend_layout == backend_layout);
UNUSED_RELEASE(frontend_layout);
@@ -483,7 +441,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -503,10 +461,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
(void)dims;
(void)ndim;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::CLReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
_return_fn = std::move(acl_fn);
}
@@ -516,15 +474,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::CLActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -538,13 +496,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
const auto beta = node.param().beta;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), beta);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -558,10 +516,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -613,7 +571,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
auto fn = std::make_unique<::arm_compute::CLSlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -628,10 +586,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -704,7 +662,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
auto fn = std::make_unique<::arm_compute::CLStridedSlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set,
strides_set, begin_mask, end_mask, shrink_axis_mask);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -720,10 +678,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
const auto rank = _ctx.at(ifm_idx).shape().rank();
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
// Reversed
@@ -732,7 +690,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
auto fn = std::make_unique<::arm_compute::CLPermute>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -747,17 +705,18 @@ void KernelGenerator::visit(const ir::operation::Add &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Sub &node)
@@ -768,17 +727,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Div &node)
@@ -789,16 +749,17 @@ void KernelGenerator::visit(const ir::operation::Div &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Exp &node)
@@ -806,12 +767,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLExpLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -823,12 +784,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -842,20 +803,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto gamma_alloc = _tensor_builder->at(gamma_index).get();
- auto beta_alloc = _tensor_builder->at(beta_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto gamma_tensor = _tensor_builder->at(gamma_index).get();
+ auto beta_tensor = _tensor_builder->at(beta_index).get();
auto epsilon = node.param().epsilon;
auto activation = node.param().activation;
auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
- beta_alloc->handle(), epsilon);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(),
+ beta_tensor->handle(), epsilon);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Logistic &node)
@@ -863,15 +825,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -884,13 +846,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
::arm_compute::BinaryLogicalOperation::AND);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -900,159 +862,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
- // TODO Support dynamic rnn
- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
- const auto scratch_buffer_index{
- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
- const auto output_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
- const auto cell_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
-
- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
- const auto input_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
- const auto input_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
- const auto input_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
- const auto input_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
- const auto recurrent_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
- const auto recurrent_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
- const auto recurrent_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
- const auto recurrent_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
- const auto cell_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
- const auto cell_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
- const auto cell_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
- const auto input_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
- const auto forget_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
- const auto output_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
- const auto projection_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
- const auto projection_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
- const auto output_state_in_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
- const auto cell_threshold = node.param().cell_threshold;
- const auto projection_threshold = node.param().projection_threshold;
-
- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
- bool has_recurrent_to_input_weights =
- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
- _ctx.at(projection_weights_index).shape().dim(1) != 0;
- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
-
- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
- // true: no CIFG
- // false: CIFG
- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
-
- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
- // But the cell_to_input_weights does not exist in regular CIFG although peephole.
- // true: peephole
- // false: no peephole
- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
-
- // NOTE Although the projection weights has data the projection bias may not have data.
- bool has_projection_param = has_projection_weights;
-
- const auto activation = node.param().activation;
- const auto cell_clip = cell_threshold;
- const auto projection_clip = projection_threshold;
- assert(cell_clip >= 0.f && projection_clip >= 0.f);
-
- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
- auto output_alloc = _tensor_builder->at(output_index).get();
-
- auto input_alloc = _tensor_builder->at(input_index).get();
-
- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
- auto recurrent_to_forget_weights_alloc =
- _tensor_builder->at(recurrent_to_forget_weights_index).get();
- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
- auto recurrent_to_output_weights_alloc =
- _tensor_builder->at(recurrent_to_output_weights_index).get();
-
- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
-
- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
-
- auto fn = std::make_unique<::arm_compute::CLLSTMLayer>();
-
- ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
- if (has_cifg_param)
- {
- auto input_to_input_weights_alloc =
- _tensor_builder->at(input_to_input_weights_index).get(); // optional
- auto recurrent_to_input_weights_alloc =
- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
- auto cell_to_input_weights_handle =
- has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
- : nullptr; // optional (non-cifg && peephole)
- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
- recurrent_to_input_weights_alloc->handle(),
- cell_to_input_weights_handle, input_gate_bias_alloc->handle());
- }
- if (has_peephole_param)
- {
- auto cell_to_forget_weights_alloc =
- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
- auto cell_to_output_weights_alloc =
- _tensor_builder->at(cell_to_output_weights_index).get(); // optional
- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
- cell_to_output_weights_alloc->handle());
- }
- if (has_projection_param)
- {
- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
- auto projection_bias_handle = has_projection_bias
- ? _tensor_builder->at(projection_bias_index).get()->handle()
- : nullptr; // optional
- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
- }
-
- fn->configure(
- input_alloc->handle(), input_to_forget_weights_alloc->handle(),
- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
- lstm_params, act_info, cell_clip, projection_clip);
-
- auto acl_fn = asAclClFunction(std::move(fn));
-
- _return_fn = std::move(acl_fn);
+ _return_fn = acl_common::kernelGenLSTM<acl_common::AclClFunction, ::arm_compute::ICLTensor,
+ ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_builder);
}
void KernelGenerator::visit(const ir::operation::Comparison &node)
@@ -1063,13 +874,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
const auto comparison_type = node.param().comparison_type;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::CLComparison>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
(arm_compute::ComparisonOperation)comparison_type);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1107,13 +918,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
for (const auto &input_index : input_indexes)
{
size_t input_rank = _ctx.at(input_index).shape().rank();
- const auto &input_alloc = _tensor_builder->at(input_index);
- orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape());
- assert(input_rank == input_alloc->num_dimensions());
- if (input_rank != input_alloc->info()->num_dimensions())
+ const auto &input_tensor = _tensor_builder->at(input_index);
+ orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape());
+ assert(input_rank == input_tensor->num_dimensions());
+ if (input_rank != input_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1135,8 +946,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
const auto ofm_idx{node.getOutputs().at(0)};
const auto ifm_idx{node.getInputs().at(0)};
const auto permute_type = node.getPermuteType();
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto rank = _ctx.at(ofm_idx).shape().rank();
assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
@@ -1149,7 +960,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::CLPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1160,7 +971,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::CLPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1168,7 +979,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
{
auto l = std::make_unique<::arm_compute::CLCopy>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
}
@@ -1183,12 +994,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclClFunction(std::move(fn));
}
@@ -1198,15 +1009,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::CLActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1219,12 +1030,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLScale>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(),
::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
@@ -1238,15 +1049,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1258,15 +1069,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1288,25 +1099,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
- auto weights_alloc = _tensor_builder->at(weights_index).get();
- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
+ auto weights_tensor = _tensor_builder->at(weights_index).get();
+ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
+ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get();
auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
auto copy_layer = std::make_unique<::arm_compute::CLCopy>();
- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
+ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
_return_fn = asAclClFunction(std::move(copy_layer));
- auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>(
+ auto fn = std::make_unique<::arm_compute::CLRNNLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
- act_info);
+ fn->configure(input_tensor->handle(), weights_tensor->handle(),
+ recurrent_weights_tensor->handle(), bias_tensor->handle(),
+ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -1315,12 +1126,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLFloor>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1335,10 +1146,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
- auto paddings_alloc = _tensor_builder->at(paddings_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
+ auto paddings_tensor = _tensor_builder->at(paddings_index).get();
assert(_ctx.at(block_size_index).data());
assert(_ctx.at(paddings_index).data());
@@ -1346,8 +1157,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
std::unique_ptr<::arm_compute::IFunction> fn;
auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>();
- l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
- ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
+ ofm_tensor->handle());
fn = std::move(l);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1362,12 +1173,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
auto block_size = node.param().block_size;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>();
+ auto fn = std::make_unique<::arm_compute::CLSpaceToDepthLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1389,19 +1200,21 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
::arm_compute::PoolingLayerInfo info{
::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
+ ifm_tensor->info()->data_layout(),
::onert::backend::acl_common::asPadStrideInfo(padding, stride)};
auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
@@ -1410,13 +1223,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>();
- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
+ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1442,15 +1255,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
float bias = 0.0f; // Don't offset the reduction.
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
radius, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1466,17 +1279,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hits_alloc = _tensor_builder->at(hits_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hits_tensor = _tensor_builder->at(hits_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto keys_alloc = _tensor_builder->at(keys_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto keys_tensor = _tensor_builder->at(keys_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::CLHashtableLookup>();
- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
- output_alloc->handle(), hits_alloc->handle());
+ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
+ output_tensor->handle(), hits_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1489,13 +1302,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto alpha_alloc = _tensor_builder->at(alpha_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto alpha_tensor = _tensor_builder->at(alpha_index).get();
- auto fn = std::make_unique<::arm_compute::CLPReLU>();
+ auto fn = std::make_unique<::arm_compute::CLPReluLayer>();
- fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1518,7 +1331,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
(node.param().padding.type == ir::PaddingType::VALID));
auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
ker_shape.W, ker_shape.H);
-
uint32_t invalid_horizontal = 0;
uint32_t invalid_vertical = 0;
if (node.param().padding.type == ir::PaddingType::VALID)
@@ -1528,17 +1340,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
}
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
- invalid_horizontal, invalid_vertical);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(),
+ tconv_info, invalid_horizontal, invalid_vertical);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1550,15 +1362,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1571,13 +1383,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::CLBitwiseOr>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1589,12 +1401,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLBitwiseNot>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1607,13 +1419,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1634,13 +1446,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node)
const auto k = node.param().k;
- auto values_alloc = _tensor_builder->at(outputValues_index).get();
- auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
- auto input_alloc = _tensor_builder->at(inputData_index).get();
+ auto values_tensor = _tensor_builder->at(outputValues_index).get();
+ auto indices_tensor = _tensor_builder->at(outputIndices_index).get();
+ auto input_tensor = _tensor_builder->at(inputData_index).get();
auto fn = std::make_unique<::arm_compute::CLTopKV2>();
- fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());
+ fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1659,9 +1471,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto indices_alloc = _tensor_builder->at(indices_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto indices_tensor = _tensor_builder->at(indices_index).get();
// NOTE The frontend layout and backend layout must be the same for this operation.
// If not the same, we have to add a stage(?) to perform permutation of output tensor. It
@@ -1671,43 +1483,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// a model. For example, if a model in NHWC has this operation as output rank == 4, indices
// rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
// and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
- const auto backend_layout = ofm_alloc->layout();
+ const auto backend_layout = ofm_tensor->layout();
UNUSED_RELEASE(backend_layout);
- assert(backend_layout == ifm_alloc->layout());
- assert(backend_layout == indices_alloc->layout());
+ assert(backend_layout == ifm_tensor->layout());
+ assert(backend_layout == indices_tensor->layout());
assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
auto fn = std::make_unique<::arm_compute::CLGatherEx>();
// input is n-D, indices k-D, output is (n + k - 1)-D
size_t n = ifm_rank;
- assert(n == ifm_alloc->num_dimensions());
+ assert(n == ifm_tensor->num_dimensions());
size_t k = _ctx.at(indices_index).shape().rank();
- assert(k == indices_alloc->num_dimensions());
+ assert(k == indices_tensor->num_dimensions());
// Disable applied dim_correction
- const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape();
- if (n != ifm_alloc->info()->num_dimensions())
+ const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape();
+ if (n != ifm_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
const auto ifm = _ctx.at(ifm_index);
- ifm_alloc->info()->set_tensor_shape(
+ ifm_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
}
- const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape();
- if (k != indices_alloc->info()->num_dimensions())
+ const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape();
+ if (k != indices_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and indices tensor is applied dim_correction
const auto indices = _ctx.at(indices_index);
- indices_alloc->info()->set_tensor_shape(
+ indices_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
}
- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
+ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
// Revert disabling applied dim_correction
- ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
- indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
+ ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
+ indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1719,12 +1531,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLNeg>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1736,15 +1548,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1761,11 +1573,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
assert((ifm_shape.rank() - 1) == ofm_shape.rank());
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
auto frontend_layout = _current_op_seq_layout;
- auto backend_layout = ifm_alloc->layout();
+ auto backend_layout = ifm_tensor->layout();
int axis_value = node.param().axis;
if (axis_value < 0)
@@ -1776,10 +1588,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
auto acl_axis =
acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
- auto fn = std::make_unique<::arm_compute::CLArgOperation>();
+ auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
- ::arm_compute::ArgOperation::MAX);
+ fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(),
+ ::arm_compute::ReductionOperation::ARG_IDX_MAX);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1791,12 +1603,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
- auto fn = std::make_unique<::arm_compute::CLCast>();
+ auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE);
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1814,15 +1626,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
auto beta = node.param().beta;
auto bias = node.param().bias;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(
::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1837,12 +1649,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
auto block_size = node.param().block_size;
assert(block_size > 0);
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
- auto fn = std::make_unique<::arm_compute::CLDepthToSpace>();
+ auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1860,13 +1672,13 @@ void KernelGenerator::visit(const ir::operation::Split &node)
for (const auto &output : node.getOutputs())
output_indexes.emplace_back(output);
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- std::vector<arm_compute::ICLTensor *> output_allocs;
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ std::vector<arm_compute::ICLTensor *> output_tensors;
for (const auto &ofm_ind : output_indexes)
- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
+ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
auto axis = node.param().axis;
if (axis < 0)
axis += ifm_rank;
@@ -1874,7 +1686,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
auto fn = std::make_unique<::arm_compute::CLSplit>();
- fn->configure(ifm_alloc->handle(), output_allocs, axis);
+ fn->configure(ifm_tensor->handle(), output_tensors, axis);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -1906,13 +1718,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
for (const auto &output_index : output_indexes)
{
size_t output_rank = _ctx.at(output_index).shape().rank();
- const auto &output_alloc = _tensor_builder->at(output_index);
- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
- assert(output_rank == output_alloc->num_dimensions());
- if (output_rank != output_alloc->info()->num_dimensions())
+ const auto &output_tensor = _tensor_builder->at(output_index);
+ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
+ assert(output_rank == output_tensor->num_dimensions());
+ if (output_rank != output_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1959,12 +1771,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
// Disable applied dim_correction
size_t input_rank = _ctx.at(input_index).shape().rank();
- const auto &input_alloc = _tensor_builder->at(input_index);
- assert(input_rank == input_alloc->num_dimensions());
- if (input_rank != input_alloc->info()->num_dimensions())
+ const auto &input_tensor = _tensor_builder->at(input_index);
+ assert(input_rank == input_tensor->num_dimensions());
+ if (input_rank != input_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
}
@@ -1982,13 +1794,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLElementwiseMin>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -2001,13 +1813,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLElementwiseMax>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -2019,12 +1831,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE,
0);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -2037,12 +1849,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE,
0);
auto acl_fn = asAclClFunction(std::move(fn));
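
The acl_cl hunks above all apply the same rename (alloc -> tensor) to one recurring pattern: fetch the backend tensors from the tensor builder, configure an arm_compute CL function on their handles, and wrap the result for the executor. A minimal sketch of that pattern, assembled only from names that already appear in the diff (the SQRT activation case is used as the representative operation; it is illustrative, not an addition to the patch):

// Look up the backend tensors registered for the operand indexes.
auto output_tensor = _tensor_builder->at(output_index).get();
auto input_tensor = _tensor_builder->at(input_index).get();

// Configure the ACL layer directly on the tensor handles.
const ::arm_compute::ActivationLayerInfo act_info{
    ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);

// Wrap the configured layer so the executor can run it.
_return_fn = asAclClFunction(std::move(fn));
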
diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h
new file mode 100644
index 000000000..625343411
--- /dev/null
+++ b/runtime/onert/backend/acl_common/AclKernelGen.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
+#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
+
+#include <exec/IFunction.h>
+#include <ir/Operands.h>
+
+#include <ir/operation/LSTM.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace acl_common
+{
+
+template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
+ typename T_TensorBuilder>
+std::unique_ptr<exec::IFunction>
+kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands,
+ const std::shared_ptr<T_TensorBuilder> &tensor_builder)
+{
+ // TODO Support dynamic RNN
+ // TODO Fix the subtle error in the non-CIFG, non-peephole, no-projection case.
+ const auto scratch_buffer_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+ const auto output_state_out_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+ const auto cell_state_out_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+ const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+
+ const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+ const auto input_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
+ const auto input_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+ const auto input_to_cell_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+ const auto input_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ const auto recurrent_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
+ const auto recurrent_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+ const auto recurrent_to_cell_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+ const auto recurrent_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ const auto cell_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
+ const auto cell_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
+ const auto cell_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
+ const auto input_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+ const auto forget_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+ const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
+ const auto output_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+ const auto projection_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
+ const auto projection_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
+ const auto output_state_in_index{
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+ const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
+ const auto cell_threshold = node.param().cell_threshold;
+ const auto projection_threshold = node.param().projection_threshold;
+
+ bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+ operands.at(input_to_input_weights_index).shape().dim(1) != 0;
+ bool has_recurrent_to_input_weights =
+ operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+ bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0;
+ bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0;
+ bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 &&
+ operands.at(projection_weights_index).shape().dim(1) != 0;
+ bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0);
+
+ // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
+ // true: no CIFG
+ // false: CIFG
+ // NOTE The cell_to_input_weights does not exist in the non-peephole case, even for a regular (non-CIFG) LSTM.
+ bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+
+ // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in the peephole case.
+ // But the cell_to_input_weights does not exist in the CIFG case, even with peephole.
+ // true: peephole
+ // false: no peephole
+ bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
+
+ // NOTE Even if the projection weights have data, the projection bias may not.
+ bool has_projection_param = has_projection_weights;
+
+ const auto activation = node.param().activation;
+ const auto cell_clip = cell_threshold;
+ const auto projection_clip = projection_threshold;
+ assert(cell_clip >= 0.f && projection_clip >= 0.f);
+
+ auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get();
+ auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get();
+ auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get();
+ auto output_tensor = tensor_builder->at(output_index).get();
+
+ auto input_tensor = tensor_builder->at(input_index).get();
+
+ auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get();
+ auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get();
+ auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get();
+ auto recurrent_to_forget_weights_tensor =
+ tensor_builder->at(recurrent_to_forget_weights_index).get();
+ auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get();
+ auto recurrent_to_output_weights_tensor =
+ tensor_builder->at(recurrent_to_output_weights_index).get();
+
+ auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get();
+ auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get();
+ auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get();
+ auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get();
+ auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get();
+
+ auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
+
+ auto fn = std::make_unique<T_ACLLayer>();
+
+ ::arm_compute::LSTMParams<T_Tensor> lstm_params{};
+ if (has_cifg_param)
+ {
+ auto input_to_input_weights_tensor =
+ tensor_builder->at(input_to_input_weights_index).get(); // optional
+ auto recurrent_to_input_weights_tensor =
+ tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
+ auto cell_to_input_weights_handle =
+ has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle()
+ : nullptr; // optional (non-cifg && peephole)
+ auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional
+ lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(),
+ recurrent_to_input_weights_tensor->handle(),
+ cell_to_input_weights_handle, input_gate_bias_tensor->handle());
+ }
+ if (has_peephole_param)
+ {
+ auto cell_to_forget_weights_tensor =
+ tensor_builder->at(cell_to_forget_weights_index).get(); // optional
+ auto cell_to_output_weights_tensor =
+ tensor_builder->at(cell_to_output_weights_index).get(); // optional
+ lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(),
+ cell_to_output_weights_tensor->handle());
+ }
+ if (has_projection_param)
+ {
+ auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional
+ auto projection_bias_handle = has_projection_bias
+ ? tensor_builder->at(projection_bias_index).get()->handle()
+ : nullptr; // optional
+ lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle);
+ }
+
+ fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(),
+ input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(),
+ recurrent_to_forget_weights_tensor->handle(),
+ recurrent_to_cell_weights_tensor->handle(),
+ recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(),
+ cell_bias_tensor->handle(), output_gate_bias_tensor->handle(),
+ output_state_in_tensor->handle(), cell_state_in_tensor->handle(),
+ scratch_buffer_tensor->handle(), output_state_out_tensor->handle(),
+ cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info,
+ cell_clip, projection_clip);
+
+ return std::make_unique<T_FunctionWrapper>(std::move(fn));
+}
+
+template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
+ typename T_TensorBuilder>
+std::unique_ptr<exec::IFunction>
+kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands,
+ const std::shared_ptr<T_TensorBuilder> &tensor_builder, ir::Layout layout)
+{
+ using ir::operation::FullyConnected;
+
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
+ const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
+ const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
+
+ const auto input_rank = operands.at(input_index).shape().rank();
+
+ const auto output_size =
+ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1);
+ UNUSED_RELEASE(output_size);
+ assert(operands.at(bias_index).shape().dim(0) == output_size);
+ assert(operands.at(weight_index).shape().dim(0) == output_size);
+ const auto batch_size =
+ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2);
+ const auto input_size =
+ operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1);
+
+ // Check for reshaping input's shape into rank-2
+ bool needs_reshape = false;
+ ir::Shape reshape(2);
+ if (input_rank == 3 || input_rank == 4)
+ {
+ const auto &ifm_shape = operands.at(input_index).shape();
+ auto feature_size = 1;
+ for (int i = 0; i < ifm_shape.rank(); ++i)
+ {
+ feature_size *= ifm_shape.dim(i);
+ }
+
+ UNUSED_RELEASE(feature_size);
+ assert(feature_size == batch_size * input_size);
+
+ // for reshaping
+ needs_reshape = true;
+ reshape.dim(0) = batch_size; /* H */
+ reshape.dim(1) = input_size; /* W */
+ }
+
+ auto output_tensor = tensor_builder->at(output_index).get();
+ const auto input_tensor = tensor_builder->at(input_index).get();
+ const auto weight_tensor = tensor_builder->at(weight_index).get();
+ const auto bias_tensor = tensor_builder->at(bias_index).get();
+ const auto frontend_layout = layout;
+ const auto acl_layout = output_tensor->handle()->info()->data_layout();
+
+ auto fn =
+ std::make_unique<T_ACLLayer>(tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+ typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL;
+ if (operands.at(weight_index).isConstant())
+ {
+ kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS;
+ assert(operands.at(weight_index).data());
+ }
+
+ fn->configure(
+ input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(),
+ output_tensor->handle(), needs_reshape,
+ ::onert::backend::acl_common::asTensorShape(
+ reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
+ kernel_type);
+
+ return std::make_unique<T_FunctionWrapper>(std::move(fn));
+}
+
+} // namespace acl_common
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
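
The acl_neon changes below show how a backend KernelGenerator is meant to consume these shared templates (see the FullyConnected hunk). For comparison, here is a hedged sketch of what the analogous acl_cl call sites might look like; the wrapper and layer type names used for the CL side (AclClFunction, ::arm_compute::ICLTensor, ::arm_compute::CLLSTMLayer, ::arm_compute::CLFullyConnectedReshapingLayer) are assumptions for illustration only and are not confirmed by this patch:

void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  // Delegate kernel construction to the shared helper; the backend supplies
  // its concrete function wrapper, tensor interface, and ACL layer types.
  _return_fn = acl_common::kernelGenLSTM<acl_common::AclClFunction, ::arm_compute::ICLTensor,
                                         ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_builder);
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_builder->at(output_index).get();
  const auto activation = node.param().activation;

  // Same call shape as the acl_neon version shown below, with CL types substituted.
  auto fn = acl_common::kernelGenFullyConnected<acl_common::AclClFunction, ::arm_compute::ICLTensor,
                                                ::arm_compute::CLFullyConnectedReshapingLayer>(
      node, _ctx, _tensor_builder, _current_op_seq_layout);
  _return_fn = std::make_unique<exec::FunctionSequence>(
      std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
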
diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc
index e47186754..37ec9939e 100644
--- a/runtime/onert/backend/acl_neon/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc
@@ -31,6 +31,7 @@
#include "exec/NopFunction.h"
#include "util/logging.h"
#include "util/Utils.h"
+#include "AclKernelGen.h"
namespace onert
{
@@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto frontend_layout = _current_op_seq_layout;
- auto backend_layout = ifm_alloc->layout();
+ auto backend_layout = ifm_tensor->layout();
int axis_value = node.param().axis;
if (axis_value < 0)
@@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>();
- fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(),
+ fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(),
arm_compute::ReductionOperation::ARG_IDX_MAX);
auto acl_fn = asAclFunction(std::move(fn));
@@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
const auto block_size_index{
node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
assert(_ctx.at(block_size_index).data());
auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>();
- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::NECast>();
+ std::unique_ptr<::arm_compute::IFunction> fn;
+ if (ifm_tensor->data_type() == ofm_tensor->data_type())
+ {
+ auto l = std::make_unique<::arm_compute::NECopy>();
+
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ fn = std::move(l);
+ }
+ else
+ {
+ auto l = std::make_unique<::arm_compute::NECast>();
- auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
- ? arm_compute::SubDataType::BOOL
- : arm_compute::SubDataType::NONE;
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
+
+ fn = std::move(l);
+ }
auto acl_fn = asAclFunction(std::move(fn));
@@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
ker_width, ker_height);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(),
+ ::arm_compute::Size2D(1U, 1U), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
auto block_size = node.param().block_size;
assert(block_size > 0);
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
- auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>();
+ auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size);
auto acl_fn = asAclFunction(std::move(fn));
@@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
auto fn = std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>();
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
- ofm_alloc->handle(), conv_info, multiplier, act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, multiplier, act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -305,19 +318,19 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
- ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride)};
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh},
+ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)};
auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
@@ -348,19 +361,20 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
::arm_compute::PoolingLayerInfo info{
::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
+ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride),
+ true /* exclude_padding */};
auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Concat &node)
@@ -383,7 +397,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
return;
}
- auto output_alloc = _tensor_builder->at(ofm_index).get();
+ auto output_tensor = _tensor_builder->at(ofm_index).get();
std::vector<::arm_compute::ITensor *> input_tensors;
for (const auto &ifm_ind : input_indexes)
input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());
@@ -392,7 +406,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
if (input_indexes.size() < 2)
{
auto l = std::make_unique<::arm_compute::NECopy>();
- l->configure(input_tensors.at(0), output_alloc->handle());
+ l->configure(input_tensors.at(0), output_tensor->handle());
fn = std::move(l);
}
else
@@ -400,10 +414,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
auto l = std::make_unique<::arm_compute::NEConcatenateLayer>();
const auto rank = _ctx.at(ofm_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
const auto fixed_axis =
acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
- l->configure(input_tensors, output_alloc->handle(), fixed_axis);
+ l->configure(input_tensors, output_tensor->handle(), fixed_axis);
fn = std::move(l);
}
@@ -418,13 +432,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>();
- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
+ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -436,12 +450,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NEFloor>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -450,76 +464,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
- using ir::operation::FullyConnected;
-
const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
-
- const auto input_rank = _ctx.at(input_index).shape().rank();
-
- const auto output_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
- UNUSED_RELEASE(output_size);
- assert(_ctx.at(bias_index).shape().dim(0) == output_size);
- assert(_ctx.at(weight_index).shape().dim(0) == output_size);
- const auto batch_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
- const auto input_size =
- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);
-
- // Check for reshaping input's shape into rank-2
- bool needs_reshape = false;
- ir::Shape reshape(2);
- if (input_rank == 3 || input_rank == 4)
- {
- const auto &ifm_shape = _ctx.at(input_index).shape();
- auto feature_size = 1;
- for (int i = 0; i < ifm_shape.rank(); ++i)
- {
- feature_size *= ifm_shape.dim(i);
- }
-
- UNUSED_RELEASE(feature_size);
- assert(feature_size == batch_size * input_size);
-
- // for reshaping
- needs_reshape = true;
- reshape.dim(0) = batch_size; /* H */
- reshape.dim(1) = input_size; /* W */
- }
-
+ auto output_tensor = _tensor_builder->at(output_index).get();
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- const auto input_alloc = _tensor_builder->at(input_index).get();
- const auto weight_alloc = _tensor_builder->at(weight_index).get();
- const auto bias_alloc = _tensor_builder->at(bias_index).get();
- const auto frontend_layout = _current_op_seq_layout;
- const auto acl_layout = output_alloc->handle()->info()->data_layout();
-
- auto fn = std::make_unique<arm_compute::NEFullyConnectedReshapingLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
-
- arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type =
- arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL;
- if (_ctx.at(weight_index).isConstant())
- {
- kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
- assert(_ctx.at(weight_index).data());
- }
-
- fn->configure(
- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
- needs_reshape,
- ::onert::backend::acl_common::asTensorShape(
- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
- kernel_type);
-
+ auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
+ ::arm_compute::NEFullyConnectedReshapingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)),
- ActivationBuilder::generate(activation, output_alloc->handle()));
+ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
@@ -531,17 +484,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hits_alloc = _tensor_builder->at(hits_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hits_tensor = _tensor_builder->at(hits_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto keys_alloc = _tensor_builder->at(keys_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto keys_tensor = _tensor_builder->at(keys_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::NEHashtableLookup>();
- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
- output_alloc->handle(), hits_alloc->handle());
+ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
+ output_tensor->handle(), hits_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -561,10 +514,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// Converting in reverse order
const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto indices_alloc = _tensor_builder->at(indices_index).get();
- const auto backend_layout = ofm_alloc->layout();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto indices_tensor = _tensor_builder->at(indices_index).get();
+ const auto backend_layout = ofm_tensor->layout();
UNUSED_RELEASE(backend_layout);
// NOTE The frontend layout and backend layout must be the same for this operation.
@@ -575,35 +528,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// a model. For example, if a model in NHWC has this operation as output rank == 4, indices
// rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
// and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
- assert(backend_layout == ifm_alloc->layout());
- assert(backend_layout == indices_alloc->layout());
+ assert(backend_layout == ifm_tensor->layout());
+ assert(backend_layout == indices_tensor->layout());
assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
auto fn = std::make_unique<::arm_compute::NEGatherEx>();
// input is n-D, indices k-D, output is (n + k - 1)-D
size_t n = ifm_rank;
- assert(n == ifm_alloc->num_dimensions());
+ assert(n == ifm_tensor->num_dimensions());
size_t k = _ctx.at(indices_index).shape().rank();
- assert(k == indices_alloc->num_dimensions());
+ assert(k == indices_tensor->num_dimensions());
// Disable applied dim_correction
- if (n != ifm_alloc->info()->num_dimensions())
+ if (n != ifm_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
const auto ifm = _ctx.at(ifm_index);
- ifm_alloc->info()->set_tensor_shape(
+ ifm_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
}
- if (k != indices_alloc->info()->num_dimensions())
+ if (k != indices_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and indices tensor is applied dim_correction
const auto indices = _ctx.at(indices_index);
- indices_alloc->info()->set_tensor_shape(
+ indices_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
}
- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
+ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
// acl_neon does not revert the disabled dim_correction because acl_neon's kernels would
// use arm_compute::TensorInfo::offset_element_in_bytes()
@@ -621,20 +574,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto gamma_alloc = _tensor_builder->at(gamma_index).get();
- auto beta_alloc = _tensor_builder->at(beta_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto gamma_tensor = _tensor_builder->at(gamma_index).get();
+ auto beta_tensor = _tensor_builder->at(beta_index).get();
auto epsilon = node.param().epsilon;
auto activation = node.param().activation;
auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
- beta_alloc->handle(), epsilon);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(),
+ beta_tensor->handle(), epsilon);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::L2Normalization &node)
@@ -656,15 +609,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
float bias = 0.0f; // Don't offset the reduction.
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
radius, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::NENormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -686,19 +639,20 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
::arm_compute::PoolingLayerInfo info{
::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
+ ifm_tensor->info()->data_layout(),
::onert::backend::acl_common::asPadStrideInfo(padding, stride)};
auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
@@ -712,15 +666,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
auto beta = node.param().beta;
auto bias = node.param().bias;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(
::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::NENormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -733,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::NELogicalAnd>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -751,12 +705,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEBitwiseNot>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -769,13 +723,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::NELogicalOr>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -787,8 +741,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
@@ -798,7 +752,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
// instead of 'INF', and then the result of this op will be errors due to the 'NaN'.
auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -807,159 +761,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
- // TODO Support dynamic rnn
- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
- const auto scratch_buffer_index{
- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
- const auto output_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
- const auto cell_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
-
- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
- const auto input_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
- const auto input_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
- const auto input_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
- const auto input_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
- const auto recurrent_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
- const auto recurrent_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
- const auto recurrent_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
- const auto recurrent_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
- const auto cell_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
- const auto cell_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
- const auto cell_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
- const auto input_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
- const auto forget_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
- const auto output_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
- const auto projection_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
- const auto projection_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
- const auto output_state_in_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
- const auto cell_threshold = node.param().cell_threshold;
- const auto projection_threshold = node.param().projection_threshold;
-
- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
- bool has_recurrent_to_input_weights =
- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
- _ctx.at(projection_weights_index).shape().dim(1) != 0;
- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
-
- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
- // true: no CIFG
- // false: CIFG
- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
-
- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
- // But the cell_to_input_weights does not exist in regular CIFG although peephole.
- // true: peephole
- // false: no peephole
- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
-
- // NOTE Although the projection weights has data the projection bias may not have data.
- bool has_projection_param = has_projection_weights;
-
- const auto activation = node.param().activation;
- const auto cell_clip = cell_threshold;
- const auto projection_clip = projection_threshold;
- assert(cell_clip >= 0.f && projection_clip >= 0.f);
-
- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
- auto output_alloc = _tensor_builder->at(output_index).get();
-
- auto input_alloc = _tensor_builder->at(input_index).get();
-
- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
- auto recurrent_to_forget_weights_alloc =
- _tensor_builder->at(recurrent_to_forget_weights_index).get();
- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
- auto recurrent_to_output_weights_alloc =
- _tensor_builder->at(recurrent_to_output_weights_index).get();
-
- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
-
- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
-
- auto fn = std::make_unique<::arm_compute::NELSTMLayer>();
-
- ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{};
- if (has_cifg_param)
- {
- auto input_to_input_weights_alloc =
- _tensor_builder->at(input_to_input_weights_index).get(); // optional
- auto recurrent_to_input_weights_alloc =
- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
- auto cell_to_input_weights_handle =
- has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
- : nullptr; // optional (non-cifg && peephole)
- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
- recurrent_to_input_weights_alloc->handle(),
- cell_to_input_weights_handle, input_gate_bias_alloc->handle());
- }
- if (has_peephole_param)
- {
- auto cell_to_forget_weights_alloc =
- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
- auto cell_to_output_weights_alloc =
- _tensor_builder->at(cell_to_output_weights_index).get(); // optional
- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
- cell_to_output_weights_alloc->handle());
- }
- if (has_projection_param)
- {
- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
- auto projection_bias_handle = has_projection_bias
- ? _tensor_builder->at(projection_bias_index).get()->handle()
- : nullptr; // optional
- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
- }
-
- fn->configure(
- input_alloc->handle(), input_to_forget_weights_alloc->handle(),
- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
- lstm_params, act_info, cell_clip, projection_clip);
-
- auto acl_fn = asAclFunction(std::move(fn));
-
- _return_fn = std::move(acl_fn);
+ _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
+ ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_builder);
}
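The block removed above detected the LSTM variant by checking whether the optional weight operands have non-empty shapes: non-CIFG when both input-to-input and recurrent-to-input weights are present, peephole when both cell-to-forget and cell-to-output weights are present, projection when projection weights are present. That logic now lives inside the shared acl_common::kernelGenLSTM template invoked above. A stand-alone restatement of the variant checks, for reference (illustrative types, not the onert API):

  #include <cstdio>

  struct Shape { int d0 = 0, d1 = 0; }; // an all-zero shape means "input omitted"

  bool present(const Shape &s) { return s.d0 != 0 && s.d1 != 0; }

  int main()
  {
    Shape input_to_input_weights{128, 64};
    Shape recurrent_to_input_weights{128, 128};
    Shape cell_to_forget_weights{128, 0};
    Shape cell_to_output_weights{0, 0};

    const bool non_cifg = present(input_to_input_weights) && present(recurrent_to_input_weights);
    const bool peephole = cell_to_forget_weights.d0 != 0 && cell_to_output_weights.d0 != 0;
    std::printf("non-CIFG: %d, peephole: %d\n", non_cifg, peephole);
  }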
void KernelGenerator::visit(const ir::operation::Mul &node)
@@ -970,18 +773,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>();
// For scale 1.0, the only allowed RoundingPolicy is RoundingPolicy::TO_ZERO
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Neg &node)
@@ -989,12 +792,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NENegLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1030,12 +833,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
for (const auto &input_index : input_indexes)
{
size_t input_rank = _ctx.at(input_index).shape().rank();
- const auto &input_alloc = _tensor_builder->at(input_index);
- assert(input_rank == input_alloc->num_dimensions());
- if (input_rank != input_alloc->info()->num_dimensions())
+ const auto &input_tensor = _tensor_builder->at(input_index);
+ assert(input_rank == input_tensor->num_dimensions());
+ if (input_rank != input_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1094,8 +897,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
const auto ofm_idx{node.getOutputs().at(0)};
const auto ifm_idx{node.getInputs().at(0)};
const auto permute_type = node.getPermuteType();
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto rank = _ctx.at(ofm_idx).shape().rank();
assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
@@ -1108,7 +911,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::NEPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1119,7 +922,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::NEPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1127,7 +930,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
{
auto l = std::make_unique<::arm_compute::NECopy>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
}
@@ -1143,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto alpha_alloc = _tensor_builder->at(alpha_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto alpha_tensor = _tensor_builder->at(alpha_index).get();
std::unique_ptr<::arm_compute::IFunction> fn;
- auto l = std::make_unique<::arm_compute::NEPReLU>();
+ auto l = std::make_unique<::arm_compute::NEPReluLayer>();
- l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
@@ -1166,14 +969,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// Convert to ACL axes taking into account negative values and possible duplicates.
const auto &axes = _ctx.at(axes_index);
const auto input_rank = _ctx.at(input_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = input_alloc->layout();
+ const auto backend_layout = input_tensor->layout();
const auto reduce_axes =
acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
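The conversion above folds negative axis values into the valid range and drops duplicates before handing coordinates to ACL. A stand-alone sketch of that normalization step (illustrative only; the real acl_common::asCoordinates also remaps axes between the frontend and backend layouts):

  #include <set>
  #include <vector>

  // Normalize reduction axes: wrap negatives and de-duplicate.
  std::vector<int> normalize_axes(const std::vector<int> &axes, int rank)
  {
    std::set<int> unique;
    for (int axis : axes)
      unique.insert(axis < 0 ? axis + rank : axis);
    return {unique.begin(), unique.end()};
  }

  // normalize_axes({-1, 3, 1}, 4) -> {1, 3}: -1 wraps to 3 and the duplicate is dropped.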
const auto reduce_type = node.param().reduce_type;
@@ -1182,11 +985,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
std::unique_ptr<::arm_compute::IFunction> fn;
if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
{
- // NOTE NEReduceMean has a bug that does not support NHWC layout
- // NEReduceMean intermediate tensors are always NCHW layout
- auto l = std::make_unique<::arm_compute::NEReduceMeanEx>();
+ auto l = std::make_unique<::arm_compute::NEReduceMean>();
- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle());
+ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle());
fn = std::move(l);
}
@@ -1194,7 +995,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
{
auto l = std::make_unique<::arm_compute::NEReduceSum>();
- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle());
+ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle());
fn = std::move(l);
}
@@ -1202,7 +1003,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
{
auto l = std::make_unique<::arm_compute::NEReduceOperation>();
- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(),
+ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
acl_common::convertReduceType(reduce_type));
fn = std::move(l);
@@ -1218,15 +1019,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::NEActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1238,15 +1039,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1258,15 +1059,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1278,13 +1079,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// NOTE This operation must not change the layout from frontend to backend,
// so PermutationOperationPass makes the frontend and backend layouts the same.
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
frontend_layout == backend_layout);
UNUSED_RELEASE(frontend_layout);
@@ -1292,7 +1093,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
auto fn = std::make_unique<arm_compute::NEReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1305,12 +1106,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NEScale>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(),
::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
@@ -1334,25 +1135,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
- auto weights_alloc = _tensor_builder->at(weights_index).get();
- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
+ auto weights_tensor = _tensor_builder->at(weights_index).get();
+ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
+ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get();
auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
auto copy_layer = std::make_unique<::arm_compute::NECopy>();
- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
+ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
_return_fn = asAclFunction(std::move(copy_layer));
- auto fn = std::make_unique<::arm_compute::NERNNLayerEx>(
+ auto fn = std::make_unique<::arm_compute::NERNNLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
- act_info);
+ fn->configure(input_tensor->handle(), weights_tensor->handle(),
+ recurrent_weights_tensor->handle(), bias_tensor->handle(),
+ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1361,12 +1162,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NERsqrtLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1383,10 +1184,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
(void)dims;
(void)ndim;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::NEReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
_return_fn = std::move(acl_fn);
}
@@ -1396,15 +1197,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::NEActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1417,13 +1218,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
const auto beta = node.param().beta;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_op_seq_layout;
+ const auto backend_layout = input_tensor->layout();
+
+ // Disable applied dim_correction
+ const size_t input_rank = _ctx.at(input_index).shape().rank();
+ if (input_rank != input_tensor->info()->num_dimensions())
+ {
+ // This means that high dimension's value is 1 and input tensor is applied dim_correction
+ const auto input = _ctx.at(input_index);
+ input_tensor->info()->set_tensor_shape(
+ acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false));
+ }
auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), beta);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1438,20 +1251,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
- auto paddings_alloc = _tensor_builder->at(paddings_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
+ auto paddings_tensor = _tensor_builder->at(paddings_index).get();
assert(_ctx.at(block_size_index).data());
assert(_ctx.at(paddings_index).data());
- // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is
- // not 0.
- auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>();
+ auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>();
- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
- ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
+ ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1465,12 +1276,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
auto block_size = node.param().block_size;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>();
+ auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1489,13 +1300,13 @@ void KernelGenerator::visit(const ir::operation::Split &node)
for (const auto &output : node.getOutputs())
output_indexes.emplace_back(output);
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- std::vector<arm_compute::ITensor *> output_allocs;
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ std::vector<arm_compute::ITensor *> output_tensors;
for (const auto &ofm_ind : output_indexes)
- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
+ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
auto axis = node.param().axis;
if (axis < 0)
axis += ifm_rank;
@@ -1503,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
auto fn = std::make_unique<::arm_compute::NESplit>();
- fn->configure(ifm_alloc->handle(), output_allocs, axis);
+ fn->configure(ifm_tensor->handle(), output_tensors, axis);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1513,15 +1324,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1534,13 +1345,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1555,17 +1366,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Slice &node)
@@ -1575,10 +1386,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -1628,7 +1439,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
auto fn = std::make_unique<::arm_compute::NESlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1643,10 +1454,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -1715,7 +1526,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
auto fn = std::make_unique<::arm_compute::NEStridedSlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set,
strides_set, begin_mask, end_mask, shrink_axis_mask);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1749,16 +1560,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
}
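A quick worked example for the expression above, with numbers chosen purely for illustration: 10 output rows requested from 4 input rows with vertical stride 2 and kernel height 3.

  // Worked example (values chosen for illustration, not taken from any model):
  constexpr int ifm_h = 4, stride_v = 2, ker_h = 3, ofm_h = 10;
  constexpr int covered = 1 + (ifm_h - 1) * stride_v + (ker_h - 1); // 9 reachable rows
  constexpr int invalid_vertical_rows = ofm_h - covered;            // 1 row left invalid
  static_assert(invalid_vertical_rows == 1, "one output row cannot be produced");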
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>();
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
- invalid_horizontal, invalid_vertical);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(),
+ tconv_info, invalid_horizontal, invalid_vertical);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1771,10 +1582,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
const auto &perm{node.param().perm};
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- const auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ const auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
const auto rank = _ctx.at(ifm_idx).shape().rank();
std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
@@ -1783,11 +1594,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
std::unique_ptr<::arm_compute::IFunction> fn;
- if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2)
+ if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2)
{
auto l = std::make_unique<::arm_compute::NETranspose>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
}
@@ -1795,7 +1606,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
{
auto l = std::make_unique<::arm_compute::NEPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv);
fn = std::move(l);
}
@@ -1834,13 +1645,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
for (const auto &output_index : output_indexes)
{
size_t output_rank = _ctx.at(output_index).shape().rank();
- const auto &output_alloc = _tensor_builder->at(output_index);
- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
- assert(output_rank == output_alloc->num_dimensions());
- if (output_rank != output_alloc->info()->num_dimensions())
+ const auto &output_tensor = _tensor_builder->at(output_index);
+ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
+ assert(output_rank == output_tensor->num_dimensions());
+ if (output_rank != output_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1858,17 +1669,17 @@ void KernelGenerator::visit(const ir::operation::Add &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Div &node)
@@ -1879,16 +1690,16 @@ void KernelGenerator::visit(const ir::operation::Div &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Exp &node)
@@ -1896,12 +1707,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEExpLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1913,12 +1724,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1933,13 +1744,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
const auto comparison_type = node.param().comparison_type;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
(arm_compute::ComparisonOperation)comparison_type);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1953,13 +1764,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseMin>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1972,13 +1783,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseMax>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
diff --git a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc
index 71e313628..deb27f0fe 100644
--- a/runtime/onert/backend/cpu/ConstantInitializer.cc
+++ b/runtime/onert/backend/cpu/ConstantInitializer.cc
@@ -15,6 +15,7 @@
*/
#include "ConstantInitializer.h"
+#include "Tensor.h"
namespace onert
{
@@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
// DO NOTHING
}
+void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ registerExternalInitializer(index, obj);
+}
+
+void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ // Constants only
+ // TODO Add a check that the tensor has been allocated
+ if (!obj.isConstant())
+ return;
+
+ _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) {
+ auto data = model_obj.shareData();
+ assert(data && data->base());
+ ExternalTensor &tensor = dynamic_cast<ExternalTensor &>(itensor);
+ tensor.setData(data);
+ };
+}
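The new initializer above avoids the memcpy performed by registerCopyInitializer: for constant operands it simply hands the operand's shared data buffer to the backend's ExternalTensor. A minimal stand-alone sketch of that zero-copy pattern (hypothetical stand-in types, not the onert classes):

  #include <cassert>
  #include <cstdint>
  #include <functional>
  #include <map>
  #include <memory>
  #include <vector>

  struct Operand
  {
    std::shared_ptr<const std::vector<std::uint8_t>> data;
  };
  struct ExternalTensor
  {
    std::shared_ptr<const std::vector<std::uint8_t>> data; // no owned buffer, no copy
    void setData(std::shared_ptr<const std::vector<std::uint8_t>> d) { data = std::move(d); }
  };

  int main()
  {
    std::map<int, std::function<void(const Operand &, ExternalTensor &)>> init_map;

    // Register: the lambda shares the operand's buffer instead of copying it.
    init_map[0] = [](const Operand &obj, ExternalTensor &tensor) {
      assert(obj.data);
      tensor.setData(obj.data);
    };

    Operand weights{std::make_shared<std::vector<std::uint8_t>>(1024, 0)};
    ExternalTensor t;
    init_map[0](weights, t);
    assert(t.data.get() == weights.data.get()); // same buffer, zero copy
  }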
+
void ConstantInitializer::visit(const ir::operation::Conv2D &node)
{
const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL);
const auto &kernel_obj = _operands.at(kernel_index);
- registerCopyInitializer(kernel_index, kernel_obj);
+ registerExternalInitializer(kernel_index, kernel_obj);
const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS);
const auto &bias_obj = _operands.at(bias_index);
- registerCopyInitializer(bias_index, bias_obj);
+ registerExternalInitializer(bias_index, bias_obj);
}
void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node)
{
const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL);
const auto &kernel_obj = _operands.at(kernel_index);
- registerCopyInitializer(kernel_index, kernel_obj);
+ registerExternalInitializer(kernel_index, kernel_obj);
const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS);
const auto &bias_obj = _operands.at(bias_index);
- registerCopyInitializer(bias_index, bias_obj);
+ registerExternalInitializer(bias_index, bias_obj);
}
void ConstantInitializer::visit(const ir::operation::FullyConnected &node)
{
const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT);
const auto &weight_obj = _operands.at(weight_index);
- registerCopyInitializer(weight_index, weight_obj);
+ registerExternalInitializer(weight_index, weight_obj);
const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS);
if (!bias_index.undefined())
{
const auto &bias_obj = _operands.at(bias_index);
- registerCopyInitializer(bias_index, bias_obj);
+ registerExternalInitializer(bias_index, bias_obj);
}
}
diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h
index bd06c64d1..de03a693a 100644
--- a/runtime/onert/backend/cpu/ConstantInitializer.h
+++ b/runtime/onert/backend/cpu/ConstantInitializer.h
@@ -36,6 +36,15 @@ public:
const std::shared_ptr<TensorBuilder> &tensor_builder);
public:
+ void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override;
+
+ // TODO For now, only the cpu backend supports constant tensors that use external data.
+ // If other backends come to support this (ExternalTensor should then be abstracted,
+ // e.g. as IExternal), this can become an interface of IConstantInitializer
+ void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &);
+
+public:
void visit(const ir::operation::Conv2D &) override;
void visit(const ir::operation::DepthwiseConv2D &) override;
void visit(const ir::operation::FullyConnected &) override;
diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc
index 72f960675..2766aa2d5 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.cc
+++ b/runtime/onert/backend/cpu/KernelGenerator.cc
@@ -60,6 +60,7 @@
#include "ops/SoftMaxLayer.h"
#include "ops/StridedSliceLayer.h"
#include "ops/SpaceToBatchNDLayer.h"
+#include "ops/SpaceToDepthLayer.h"
#include "ops/SplitLayer.h"
#include "ops/SubLayer.h"
#include "ops/TanhLayer.h"
@@ -70,11 +71,13 @@
#include "ops/ZerosLikeLayer.h"
#include "ops/SquaredDiffLayer.h"
#include "ops/LogicalOrLayer.h"
+#include "ops/L2NormLayer.h"
#include "ops/MatrixBandPartLayer.h"
#include "ops/BatchMatMulLayer.h"
#include "ops/BroadcastToLayer.h"
#include "ops/FusedBatchNormLayer.h"
#include "ops/LogSoftMaxLayer.h"
+#include "ops/QuantizeLayer.h"
#include <backend/Backend.h>
#include <backend/IConfig.h>
@@ -184,10 +187,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
- auto ker_alloc = _tensor_builder->portableAt(ker_index).get();
- auto bias_alloc = _tensor_builder->portableAt(bias_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+ auto ker_tensor = _tensor_builder->portableAt(ker_index).get();
+ auto bias_tensor = _tensor_builder->portableAt(bias_index).get();
const auto stride = node.param().stride;
const auto activation = node.param().activation;
@@ -196,9 +199,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic())
{
- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left,
+ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left,
param_padding.param.right, param_padding.param.top, param_padding.param.bottom,
- stride.horizontal, stride.vertical, activation, ofm_alloc);
+ stride.horizontal, stride.vertical, activation, ofm_tensor);
_return_fn = std::move(fn);
return;
@@ -213,9 +216,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto padding =
ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right,
- padding.top, padding.bottom, stride.horizontal, stride.vertical, activation,
- ofm_alloc);
+ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left,
+ padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical,
+ activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -241,16 +244,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
- auto ker_alloc = _tensor_builder->portableAt(ker_index).get();
- auto bias_alloc = _tensor_builder->portableAt(bias_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+ auto ker_tensor = _tensor_builder->portableAt(ker_index).get();
+ auto bias_tensor = _tensor_builder->portableAt(bias_index).get();
auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>();
- fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top,
+ fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top,
padding.bottom, stride.horizontal, stride.vertical, multiplier, activation,
- ofm_alloc);
+ ofm_tensor);
_return_fn = std::move(fn);
}
@@ -270,13 +273,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::MaxPoolLayer>();
- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom,
- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc);
+ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom,
+ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -295,13 +298,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::AvgPoolLayer>();
- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom,
- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc);
+ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom,
+ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -313,7 +316,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
const auto rank = _ctx.at(ofm_index).shape().rank();
const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout);
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
@@ -321,7 +324,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
auto fn = std::make_unique<ops::ConcatLayer>();
- fn->configure(input_tensors, axis, output_alloc);
+ fn->configure(input_tensors, axis, output_tensor);
_return_fn = std::move(fn);
}
@@ -332,13 +335,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node)
const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)};
const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto value_alloc = _tensor_builder->portableAt(value_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto value_tensor = _tensor_builder->portableAt(value_index).get();
auto fn = std::make_unique<ops::FillLayer>();
- fn->configure(input_alloc, value_alloc, output_alloc);
+ fn->configure(input_tensor, value_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -353,15 +356,15 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto weight_alloc = _tensor_builder->portableAt(weight_index).get();
- auto bias_alloc =
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto weight_tensor = _tensor_builder->portableAt(weight_index).get();
+ auto bias_tensor =
bias_index.undefined() ? nullptr : _tensor_builder->portableAt(bias_index).get();
auto fn = std::make_unique<ops::FullyConnectedLayer>();
- fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc);
+ fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor);
_return_fn = std::move(fn);
}
@@ -371,21 +374,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
// optional 2nd input
- IPortableTensor *shape_alloc = nullptr;
+ IPortableTensor *shape_tensor = nullptr;
if (node.getInputs().size() == 2)
{
const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)};
- shape_alloc = _tensor_builder->portableAt(shape_index).get();
+ shape_tensor = _tensor_builder->portableAt(shape_index).get();
}
auto fn = std::make_unique<ops::ReshapeLayer>();
- fn->configure(input_alloc, shape_alloc, output_alloc);
+ fn->configure(input_tensor, shape_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -394,13 +397,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
// Squeeze can share same kernel with reshape
auto fn = std::make_unique<ops::ReshapeLayer>();
- fn->configure(input_alloc, nullptr, output_alloc);
+ fn->configure(input_tensor, nullptr, output_tensor);
_return_fn = std::move(fn);
}
@@ -412,12 +415,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
const auto beta = node.param().beta;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::SoftMaxLayer>();
- fn->configure(input_alloc, beta, output_alloc);
+ fn->configure(input_tensor, beta, output_tensor);
_return_fn = std::move(fn);
}
@@ -430,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Add &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::AddLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -447,15 +450,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto comparison_type = node.param().comparison_type;
auto fn = std::make_unique<ops::CompareLayer>();
- fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -466,11 +469,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto indices_alloc = _tensor_builder->portableAt(indices_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto indices_tensor = _tensor_builder->portableAt(indices_index).get();
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
UNUSED_RELEASE(backend_layout);
// NOTE The frontend layout and backend layout must be the same for this operation.
@@ -481,8 +484,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// a model. For example, if a model in NHWC has this operation as output rank == 4, indices
// rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
// and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
- assert(backend_layout == input_alloc->layout());
- assert(backend_layout == indices_alloc->layout());
+ assert(backend_layout == input_tensor->layout());
+ assert(backend_layout == indices_tensor->layout());
const auto &input_shape = _ctx.at(input_index).shape();
UNUSED_RELEASE(input_shape);
assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout);
@@ -492,7 +495,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
auto fn = std::make_unique<ops::GatherLayer>();
- fn->configure(input_alloc, indices_alloc, output_alloc, axis_value);
+ fn->configure(input_tensor, indices_tensor, output_tensor, axis_value);
_return_fn = std::move(fn);
}
@@ -506,13 +509,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::SubLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -526,13 +529,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::MulLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -547,18 +550,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
const auto axis = node.param().axis;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto indices_alloc = _tensor_builder->portableAt(indices_index).get();
- auto depth_alloc = _tensor_builder->portableAt(depth_index).get();
- auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get();
- auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto indices_tensor = _tensor_builder->portableAt(indices_index).get();
+ auto depth_tensor = _tensor_builder->portableAt(depth_index).get();
+ auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get();
+ auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get();
- assert(indices_alloc->data_type() == OperandType::INT32);
- assert(axis <= static_cast<int>(indices_alloc->num_dimensions()));
+ assert(indices_tensor->data_type() == OperandType::INT32);
+ assert(axis <= static_cast<int>(indices_tensor->num_dimensions()));
auto fn = std::make_unique<ops::OneHotLayer>();
- fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis);
+ fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis);
_return_fn = std::move(fn);
}
@@ -572,13 +575,13 @@ void KernelGenerator::visit(const ir::operation::Div &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::DivLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -587,16 +590,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node)
{
const auto ofm_index{node.getOutputs().at(0)};
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
- std::vector<const IPortableTensor *> input_allocs;
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
+ std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
+ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
const auto equation = node.param().equation;
auto fn = std::make_unique<ops::EinsumLayer>();
- fn->configure(input_allocs, equation, output_alloc);
+ fn->configure(input_tensors, equation, output_tensor);
_return_fn = std::move(fn);
}
@@ -605,14 +608,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node)
{
auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq,
std::vector<custom::TypeInfo> &types,
- std::vector<std::shared_ptr<IPortableTensor>> &allocs) {
+ std::vector<std::shared_ptr<IPortableTensor>> &tensors) {
for (auto &idx : opSeq)
{
const auto &operand = _ctx.at(idx);
// TODO make sure using `_current_op_seq_layout` is correct for custom operations
types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()});
- auto in_alloc = _tensor_builder->portableAt(idx);
- allocs.emplace_back(in_alloc);
+ auto in_tensor = _tensor_builder->portableAt(idx);
+ tensors.emplace_back(in_tensor);
}
};
@@ -634,12 +637,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::ExpLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -650,13 +653,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto axis_alloc = _tensor_builder->portableAt(axis_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto axis_tensor = _tensor_builder->portableAt(axis_index).get();
auto fn = std::make_unique<ops::ExpandDimsLayer>();
- fn->configure(input_alloc, axis_alloc, output_alloc);
+ fn->configure(input_tensor, axis_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -666,12 +669,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::LogisticLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -681,12 +684,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::TanhLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -700,7 +703,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
assert(-rank <= axis && axis < rank);
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
@@ -708,7 +711,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
auto fn = std::make_unique<ops::PackLayer>();
- fn->configure(input_tensors, axis, output_alloc);
+ fn->configure(input_tensors, axis, output_tensor);
_return_fn = std::move(fn);
}
@@ -722,7 +725,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
assert(rank == 0 || (-rank <= axis && axis < rank));
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
std::vector<IPortableTensor *> output_tensors;
for (auto &output_idx : node.getOutputs())
@@ -732,7 +735,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
uint32_t axis_resolved = (axis < 0 ? axis + rank : axis);
- fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors);
+ fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors);
_return_fn = std::move(fn);
}
@@ -751,8 +754,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
auto fn = std::make_unique<ops::PadLayer>();
- fn->configure(input, output, pad_base, pad_rank);
+  bool isPadV2 = node.getInputs().size() == 3;
+ const void *value = nullptr;
+ if (isPadV2)
+ {
+ const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)};
+ value = reinterpret_cast<const void *>(_ctx.at(value_index).data()->base());
+ }
+
+ fn->configure(input, output, pad_base, pad_rank, value);
_return_fn = std::move(fn);
}
@@ -762,13 +773,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::MaxLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -779,13 +790,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::MinLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -795,12 +806,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::CastLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -810,12 +821,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::TransposeLayer>();
- fn->configure(input_alloc, output_alloc, node.param().perm);
+ fn->configure(input_tensor, output_tensor, node.param().perm);
_return_fn = std::move(fn);
}
@@ -827,15 +838,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
const auto keep_dims = node.param().keep_dims;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto axes_alloc = _tensor_builder->portableAt(axes_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto axes_tensor = _tensor_builder->portableAt(axes_index).get();
if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN)
{
auto fn = std::make_unique<ops::MeanLayer>();
- fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims);
+ fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims);
_return_fn = std::move(fn);
}
@@ -844,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
auto fn = std::make_unique<ops::ReduceLayer>();
const auto reduce_type = convertReduceType(node.param().reduce_type);
- fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims);
+ fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims);
_return_fn = std::move(fn);
}
@@ -855,12 +866,12 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(0)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::ReLULayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -872,14 +883,14 @@ void KernelGenerator::visit(const ir::operation::Select &node)
const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)};
const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto condition_alloc = _tensor_builder->portableAt(condition_index).get();
- auto true_alloc = _tensor_builder->portableAt(true_index).get();
- auto false_alloc = _tensor_builder->portableAt(false_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto condition_tensor = _tensor_builder->portableAt(condition_index).get();
+ auto true_tensor = _tensor_builder->portableAt(true_index).get();
+ auto false_tensor = _tensor_builder->portableAt(false_index).get();
auto fn = std::make_unique<ops::SelectLayer>();
- fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc);
+ fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -891,14 +902,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto begins_alloc = _tensor_builder->portableAt(begins_index).get();
- auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto begins_tensor = _tensor_builder->portableAt(begins_index).get();
+ auto sizes_tensor = _tensor_builder->portableAt(sizes_index).get();
auto fn = std::make_unique<ops::SliceLayer>();
- fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc);
+ fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -911,11 +922,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto starts_alloc = _tensor_builder->portableAt(starts_index).get();
- auto ends_alloc = _tensor_builder->portableAt(ends_index).get();
- auto strides_alloc = _tensor_builder->portableAt(strides_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto starts_tensor = _tensor_builder->portableAt(starts_index).get();
+ auto ends_tensor = _tensor_builder->portableAt(ends_index).get();
+ auto strides_tensor = _tensor_builder->portableAt(strides_index).get();
auto begin_mask = node.param().begin_mask;
auto end_mask = node.param().end_mask;
@@ -923,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
auto fn = std::make_unique<ops::StridedSliceLayer>();
- fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask,
+ fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask,
end_mask, shrink_axis_mask);
_return_fn = std::move(fn);
@@ -957,12 +968,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::AbsLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -972,12 +983,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::SinLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -987,12 +998,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::CosLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1002,12 +1013,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::RsqrtLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1017,12 +1028,12 @@ void KernelGenerator::visit(const ir::operation::Shape &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::ShapeLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1033,13 +1044,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node)
const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)};
const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto axis_alloc = _tensor_builder->portableAt(axis_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto axis_tensor = _tensor_builder->portableAt(axis_index).get();
auto fn = std::make_unique<ops::ReverseLayer>();
- fn->configure(input_alloc, axis_alloc, output_alloc);
+ fn->configure(input_tensor, axis_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1049,12 +1060,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::NegLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1066,12 +1077,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
const auto axis = node.param().axis;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::ArgMinMaxLayer>();
- fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true);
+ fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true);
_return_fn = std::move(fn);
}
@@ -1082,13 +1093,13 @@ void KernelGenerator::visit(const ir::operation::Pow &node)
const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::PowLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor);
_return_fn = std::move(fn);
}
@@ -1098,12 +1109,12 @@ void KernelGenerator::visit(const ir::operation::Log &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::LogLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1113,12 +1124,12 @@ void KernelGenerator::visit(const ir::operation::Round &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::RoundLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1128,12 +1139,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::LogicalNotLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1144,28 +1155,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
const auto lhs_index{node.getInputs().at(0)};
const auto rhs_index{node.getInputs().at(1)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::LogicalOrLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
-void KernelGenerator::visit(const ir::operation::ZerosLike &node)
+void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)};
+ const auto input_index{node.getInputs().at(0)};
auto output_alloc = _tensor_builder->portableAt(output_index).get();
auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto fn = std::make_unique<ops::ZerosLikeLayer>();
+ auto fn = std::make_unique<ops::L2NormLayer>();
fn->configure(input_alloc, output_alloc);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::ZerosLike &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)};
+
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::ZerosLikeLayer>();
+
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1176,14 +1202,14 @@ void KernelGenerator::visit(const ir::operation::Range &node)
const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)};
const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto start_alloc = _tensor_builder->portableAt(start_index).get();
- auto limit_alloc = _tensor_builder->portableAt(limit_index).get();
- auto delta_alloc = _tensor_builder->portableAt(delta_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto start_tensor = _tensor_builder->portableAt(start_index).get();
+ auto limit_tensor = _tensor_builder->portableAt(limit_index).get();
+ auto delta_tensor = _tensor_builder->portableAt(delta_index).get();
auto fn = std::make_unique<ops::RangeLayer>();
- fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc);
+ fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1193,13 +1219,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::SqDiffLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1209,13 +1235,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node)
const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)};
const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get();
auto fn = std::make_unique<ops::TileLayer>();
- fn->configure(input_alloc, multiples_alloc, output_alloc);
+ fn->configure(input_tensor, multiples_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1226,14 +1252,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node)
const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)};
const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get();
- auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get();
+ auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get();
auto fn = std::make_unique<ops::MatrixBandPartLayer>();
- fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc);
+ fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1243,16 +1269,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node)
const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
const auto adj_x = node.param().adj_x;
const auto adj_y = node.param().adj_y;
auto fn = std::make_unique<ops::BatchMatMulLayer>();
- fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor);
_return_fn = std::move(fn);
}
@@ -1262,13 +1288,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node)
const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)};
const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto shape_alloc = _tensor_builder->portableAt(shape_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto shape_tensor = _tensor_builder->portableAt(shape_index).get();
auto fn = std::make_unique<ops::BroadcastToLayer>();
- fn->configure(input_alloc, shape_alloc, output_alloc);
+ fn->configure(input_tensor, shape_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1277,10 +1303,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
{
const auto ofm_index{node.getOutputs().at(0)};
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
- std::vector<const IPortableTensor *> input_allocs;
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
+ std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
+ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
const auto epsilon = node.param().epsilon;
const auto is_training = node.param().is_training;
@@ -1288,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
auto fn = std::make_unique<ops::FusedBatchNormLayer>();
- fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc);
+ fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor);
_return_fn = std::move(fn);
}
@@ -1301,12 +1327,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node)
const auto beta = node.param().beta;
const auto axis = node.param().axis;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::LogSoftMaxLayer>();
- fn->configure(input_alloc, beta, axis, output_alloc);
+ fn->configure(input_tensor, beta, axis, output_tensor);
_return_fn = std::move(fn);
}
@@ -1318,14 +1344,45 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)};
const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get();
- auto padding_alloc = _tensor_builder->portableAt(padding_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get();
+ auto padding_tensor = _tensor_builder->portableAt(padding_index).get();
auto fn = std::make_unique<ops::SpaceToBatchNDLayer>();
- fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc);
+ fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::Quantize &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)};
+ const auto output_index{node.getOutputs().at(0)};
+
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+
+ auto fn = std::make_unique<ops::QuantizeLayer>();
+
+ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
+ const auto output_index{node.getOutputs().at(0)};
+ auto block_size = node.param().block_size;
+
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+
+ auto fn = std::make_unique<ops::SpaceToDepthLayer>();
+
+ fn->configure(input_tensor, block_size, output_tensor);
_return_fn = std::move(fn);
}
diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h
index d6f4c2825..f564bf8be 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.h
+++ b/runtime/onert/backend/cpu/KernelGenerator.h
@@ -94,6 +94,7 @@ public:
void visit(const ir::operation::SquaredDifference &) override;
void visit(const ir::operation::Tile &) override;
void visit(const ir::operation::LogicalOr &) override;
+ void visit(const ir::operation::L2Normalization &) override;
void visit(const ir::operation::Range &) override;
void visit(const ir::operation::MatrixBandPart &) override;
void visit(const ir::operation::BatchMatMul &) override;
@@ -101,6 +102,8 @@ public:
void visit(const ir::operation::FusedBatchNorm &) override;
void visit(const ir::operation::LogSoftmax &) override;
void visit(const ir::operation::SpaceToBatchND &) override;
+ void visit(const ir::operation::Quantize &) override;
+ void visit(const ir::operation::SpaceToDepth &) override;
private:
const ir::Operands &_ctx;
diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc
new file mode 100644
index 000000000..872307261
--- /dev/null
+++ b/runtime/onert/backend/cpu/StaticTensorManager.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "StaticTensorManager.h"
+#include "Tensor.h"
+
+#include <util/logging.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+
+StaticTensorManager::StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> &reg)
+ : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg}
+{
+ // DO NOTHING
+}
+
+void StaticTensorManager::allocateNonconsts(void)
+{
+ _nonconst_mgr->allocate();
+
+ for (auto &pair : _tensors->native_tensors())
+ {
+ const auto &ind = pair.first;
+ auto tensor = pair.second;
+ if (!_as_constants[ind] && !tensor->is_dynamic())
+ {
+ auto *buffer = _nonconst_mgr->getBuffer(ind);
+ tensor->setBuffer(buffer);
+
+ VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value()
+ << "): " << static_cast<void *>(buffer) << std::endl;
+ }
+ }
+}
+
+void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); }
+
+void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
+ const ir::OperandInfo &tensor_info, ir::Layout backend_layout,
+ bool as_const)
+{
+ assert(!_tensors->getITensor(ind));
+ if (as_const)
+ {
+ auto tensor = std::make_shared<ExternalTensor>(tensor_info, backend_layout);
+ _tensors->setNativeTensor(ind, tensor);
+ }
+ else
+ {
+ auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
+ _tensors->setNativeTensor(ind, tensor);
+ }
+ _as_constants[ind] = as_const;
+}
+
+void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
+{
+ assert(_tensors->getITensor(ind));
+
+ // This method is called only when a tensor has proper shape
+ assert(!_tensors->getITensor(ind)->is_dynamic());
+
+ if (!_as_constants[ind])
+ _nonconst_mgr->claimPlan(ind, size);
+}
+
+void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
+{
+ assert(_tensors->getITensor(ind));
+
+ // This method is called only when a tensor has proper shape
+ assert(!_tensors->getITensor(ind)->is_dynamic());
+
+ if (!_as_constants[ind])
+ _nonconst_mgr->releasePlan(ind);
+}
+
+void StaticTensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn)
+{
+ for (const auto &it : _tensors->native_tensors())
+ fn(it.first);
+}
+
+} // namespace cpu
+} // namespace backend
+} // namespace onert
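A rough usage sketch for StaticTensorManager: constants are built as ExternalTensor and skip memory planning entirely, while non-constants are planned via claimPlan/releasePlan and receive buffers in allocateNonconsts(). The operand indices and the weights_info/activations_info OperandInfo objects below are hypothetical and used only for illustration:

auto registry = std::make_shared<cpu_common::TensorRegistry>();
StaticTensorManager mgr{registry};

ir::OperandIndex weights{0};     // hypothetical constant operand
ir::OperandIndex activations{1}; // hypothetical non-constant operand

// Constant: registered as ExternalTensor, never planned into _nonconst_mgr.
mgr.buildTensor(weights, weights_info, ir::Layout::NHWC, /* as_const = */ true);

// Non-constant: participates in claim/release planning and gets its buffer later.
mgr.buildTensor(activations, activations_info, ir::Layout::NHWC, /* as_const = */ false);
mgr.claimPlan(activations, activations_info.total_size());
mgr.releasePlan(activations);

mgr.allocateNonconsts(); // assigns planned buffers to non-constant, non-dynamic tensors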
diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h
new file mode 100644
index 000000000..66243a599
--- /dev/null
+++ b/runtime/onert/backend/cpu/StaticTensorManager.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
+#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
+
+#include "backend/IStaticTensorManager.h"
+#include "backend/cpu_common/MemoryManager.h"
+#include "backend/cpu_common/TensorRegistry.h"
+#include "backend/ITensorManager.h"
+#include "ir/OperandIndexMap.h"
+#include "ir/OperandInfo.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+
+class StaticTensorManager : public backend::IStaticTensorManager
+{
+public:
+ StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> &reg);
+ virtual ~StaticTensorManager() = default;
+
+ void allocateNonconsts(void);
+ void deallocateNonconsts(void);
+
+ void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info,
+ ir::Layout backend_layout, bool as_const);
+
+ void claimPlan(const ir::OperandIndex &ind, uint32_t size);
+ void releasePlan(const ir::OperandIndex &ind);
+
+ void iterate(const std::function<void(const ir::OperandIndex &)> &fn);
+
+private:
+ std::unique_ptr<cpu_common::MemoryManager> _nonconst_mgr;
+ const std::shared_ptr<cpu_common::TensorRegistry> _tensors;
+ ir::OperandIndexMap<bool> _as_constants;
+};
+
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h
index 4dd251bd3..da16d0581 100644
--- a/runtime/onert/backend/cpu/Tensor.h
+++ b/runtime/onert/backend/cpu/Tensor.h
@@ -29,8 +29,14 @@ namespace cpu
using Tensor = cpu_common::Tensor;
-// Tensor which has data from external. To support this, assume below things
-// no padding, always NHWC layout, constant tensor and not dynamic
+/**
+ * @brief Tensor class that refers to data in external memory which is not managed by the
+ * backend, instead of allocating and copying the data. ExternalTensor's data pointer points
+ * to memory that is already allocated elsewhere, such as an mmapped area, which means that
+ * ExternalTensor can take any kind of ir::Data.
+ * To support this, the following is assumed: no padding, always NHWC layout,
+ * constant tensor, and not dynamic.
+ */
class ExternalTensor : public Tensor
{
public:
@@ -45,6 +51,11 @@ public:
}
public:
+ /**
+ * @brief Set data shared from outside so that this ExternalTensor is not
+ * allocated on the CPU backend
+ * @param[in] data Data of the Operand to be set
+ */
void setData(const std::shared_ptr<ir::Data> data)
{
assert(data != nullptr);
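
The intent of ExternalTensor is to alias memory that already exists (for example, constant weights inside an mmapped model file) instead of copying it into a backend-owned buffer. A sketch of the usage this enables; `external_tensor` and `operand_data` are placeholder names, and the assertions only restate the documented assumptions:

    // operand_data: a std::shared_ptr<ir::Data> that already owns or views the
    // raw bytes (e.g. an mmapped region); external_tensor: an ExternalTensor
    // built for that constant NHWC operand.
    assert(external_tensor->is_constant());
    assert(!external_tensor->is_dynamic());
    external_tensor->setData(operand_data);
    // From here on the tensor reads directly from operand_data; no allocation
    // or copy is expected on the CPU backend.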
diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc
index 886e8d820..7eb3ce8a5 100644
--- a/runtime/onert/backend/cpu/TensorBuilder.cc
+++ b/runtime/onert/backend/cpu/TensorBuilder.cc
@@ -29,7 +29,7 @@ namespace cpu
TensorBuilder::TensorBuilder()
: _tensor_reg{new cpu_common::TensorRegistry()},
- _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)},
+ _static_tensor_mgr{new StaticTensorManager(_tensor_reg)},
_dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)}
{
/* empty */
@@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
return _tensor_info_map.find(ind) != _tensor_info_map.end();
}
-void TensorBuilder::prepare(void)
-{
- _static_tensor_mgr->allocateConsts();
- _static_tensor_mgr->allocateNonconsts();
-}
+void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); }
void TensorBuilder::allocate()
{
@@ -99,17 +95,17 @@ std::shared_ptr<IPortableTensor> TensorBuilder::portableAt(const ir::OperandInde
return _tensor_reg->getPortableTensor(ind);
}
-bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind,
- const std::shared_ptr<IPortableTensor> &tensor)
+bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind,
+ const std::shared_ptr<IPortableTensor> &tensor)
{
- return _tensor_reg->setExternalTensor(ind, tensor);
+ return _tensor_reg->setMigrantTensor(ind, tensor);
}
void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); }
-std::shared_ptr<cpu_common::Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
+std::shared_ptr<Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
{
- return _tensor_reg->getManagedTensor(ind);
+ return _tensor_reg->getNativeTensor(ind);
}
std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h
index ba25451ec..12ca28cb3 100644
--- a/runtime/onert/backend/cpu/TensorBuilder.h
+++ b/runtime/onert/backend/cpu/TensorBuilder.h
@@ -18,13 +18,14 @@
#define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__
#include <backend/cpu_common/DynamicTensorManager.h>
-#include <backend/cpu_common/StaticTensorManager.h>
#include <backend/cpu_common/TensorRegistry.h>
-#include <backend/cpu_common/Tensor.h>
#include <backend/ITensorBuilder.h>
#include <ir/OperandIndexMap.h>
+#include "StaticTensorManager.h"
+#include "Tensor.h"
+
#include <unordered_map>
namespace onert
@@ -80,16 +81,16 @@ public:
* If not, program will crash with assert or exception.
* @return shared_ptr<Tensor>
*/
- std::shared_ptr<cpu_common::Tensor> at(const ir::OperandIndex &ind);
+ std::shared_ptr<Tensor> at(const ir::OperandIndex &ind);
std::shared_ptr<IPortableTensor> portableAt(const ir::OperandIndex &ind);
- bool setExternalTensor(const ir::OperandIndex &ind,
- const std::shared_ptr<IPortableTensor> &tensor) override;
+ bool setMigrantTensor(const ir::OperandIndex &ind,
+ const std::shared_ptr<IPortableTensor> &tensor) override;
std::shared_ptr<ITensorRegistry> tensorRegistry() override { return _tensor_reg; }
private:
const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
- std::unique_ptr<cpu_common::StaticTensorManager> _static_tensor_mgr;
+ std::unique_ptr<StaticTensorManager> _static_tensor_mgr;
std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr;
ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
};
diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc
index f557f3ade..adf902aaf 100644
--- a/runtime/onert/backend/cpu/ops/CompareLayer.cc
+++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc
@@ -17,6 +17,7 @@
#include "OperationUtils.h"
+#include <cassert>
#include <cker/operation/Comparison.h>
using namespace nnfw::cker;
namespace onert
@@ -34,6 +35,14 @@ namespace
using OpType = onert::ir::operation::Comparison::ComparisonType;
using namespace onert::backend::cpu;
+// The function tables below assume these enum values are defined in this exact order
+static_assert(static_cast<int>(OpType::Equal) == 0, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::NotEqual) == 1, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::Greater) == 2, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::GreaterEqual) == 3, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::Less) == 4, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::LessEqual) == 5, "An OpType value has changed!");
+
template <typename T>
void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output,
OpType op_type)
@@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
&params.input2_shift);
params.is_broadcast = !HaveSameShapes(lhs, rhs);
- if (params.is_broadcast)
- {
- switch (op_type)
- {
- case OpType::Equal:
- Broadcast4DSlowEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- Broadcast4DSlowNotEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- Broadcast4DSlowGreaterWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- Broadcast4DSlowGreaterEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- Broadcast4DSlowLessWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- Broadcast4DSlowLessEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- else // if (requires_broadcast == false)
- {
- switch (op_type)
- {
- case OpType::Equal:
- EqualWithScaling(params, getExtendedTensorShape(lhs),
- reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs),
- reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- NotEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- GreaterWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- GreaterEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- LessWithScaling(params, getExtendedTensorShape(lhs),
- reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs),
- reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- LessEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- return;
+ using CompareFunction =
+ void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape,
+ bool *output_data);
+
+ static const CompareFunction broadcast_fns[] = {
+ Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling,
+ Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling,
+ Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling,
+ };
+ static const CompareFunction non_broadcast_fns[] = {
+ EqualWithScaling, NotEqualWithScaling, GreaterWithScaling,
+ GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling,
+ };
+
+ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
+ "Sizes of broadcast_fns and non_broadcast_fns must match!");
+
+ auto index = static_cast<int>(op_type);
+ if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0])))
+ throw std::runtime_error{"Invalid OpType for CompareLayer"};
+
+ CompareFunction fn = (params.is_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
+
+ fn(params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+ getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+ getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
}
template <typename T>
@@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
{
bool requires_broadcast = !HaveSameShapes(lhs, rhs);
- if (requires_broadcast)
- {
- switch (op_type)
- {
- case OpType::Equal:
- Broadcast4DSlowEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- Broadcast4DSlowNotEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- Broadcast4DSlowGreater(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- Broadcast4DSlowGreaterEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- Broadcast4DSlowLessEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- else // if (requires_broadcast == false)
- {
- switch (op_type)
- {
- case OpType::Equal:
- EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- GreaterEqualNoScaling(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- return;
+ using CompareFunction =
+ void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, bool *output_data);
+
+ static const CompareFunction broadcast_fns[] = {
+ Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater,
+ Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual,
+ };
+ static const CompareFunction non_broadcast_fns[] = {
+ EqualNoScaling, NotEqualNoScaling, GreaterNoScaling,
+ GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling,
+ };
+
+ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
+ "Sizes of broadcast_fns and non_broadcast_fns must match!");
+
+ auto index = static_cast<int>(op_type);
+ if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0])))
+ throw std::runtime_error{"Invalid OpType for CompareLayer"};
+
+ CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
+
+ fn(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+ getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+ getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
}
+
} // namespace
CompareLayer::CompareLayer()
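
The rewrite above replaces two large switch statements with parallel function-pointer tables indexed by the enum value; the static_asserts near the top of the file pin the enum order the tables depend on. A self-contained sketch of the same dispatch pattern, with plain functions standing in for the cker kernels:

    #include <cstdio>
    #include <stdexcept>

    enum class Cmp { Equal = 0, NotEqual = 1, Greater = 2 };

    static bool eq(int a, int b) { return a == b; }
    static bool ne(int a, int b) { return a != b; }
    static bool gt(int a, int b) { return a > b; }

    // The table order must match the enum order; static_asserts lock it in,
    // mirroring the asserts added at the top of CompareLayer.cc.
    static_assert(static_cast<int>(Cmp::Equal) == 0, "enum order changed");
    static_assert(static_cast<int>(Cmp::NotEqual) == 1, "enum order changed");
    static_assert(static_cast<int>(Cmp::Greater) == 2, "enum order changed");

    using CmpFn = bool (*)(int, int);
    static const CmpFn kFns[] = {eq, ne, gt};

    bool dispatch(Cmp op, int a, int b)
    {
      const auto index = static_cast<int>(op);
      if (index < 0 || index >= static_cast<int>(sizeof(kFns) / sizeof(kFns[0])))
        throw std::runtime_error{"invalid op"};
      return kFns[index](a, b);
    }

    int main() { std::printf("%d\n", dispatch(Cmp::Greater, 3, 2)); } // prints 1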
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
index c00be64e5..ff22e32e6 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
@@ -18,6 +18,7 @@
#include "../Tensor.h"
#include <cker/operation/FullyConnected.h>
+#include <cker/TensorUtils.h>
namespace onert
{
@@ -112,15 +113,32 @@ void FullyConnectedLayer::fullyConnectedHybrid()
getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena);
-// TODO Enable calling decrease_ref
-#if 0
+// TODO Remove this ifdef
+#ifdef EXPERIMENTAL_RUY_FEATURE
if (_cached_weights == nullptr || _is_weights_freed)
return;
+ // Reaching here ('_cached_weights' is not nullptr and '_is_weights_freed' is false) means
+ // that this weight shape satisfies the condition of the ruy kernel's prepack cache.
+ // Once this point is reached, it will not be reached again, except in the case below
+ // where the input is a zero vector.
+
+ // If every input element is zero, the ruy kernel path is bypassed,
+ // so that case has to be handled here.
+ const int input_size = getTensorShape(_input).FlatSize();
+ if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size))
+ return;
+
+ // This weight tensor could also be another op's constant tensor.
+ // Therefore, its reference count has to be checked as follows.
auto weight_tensor = dynamic_cast<const Tensor *>(_weights);
if (weight_tensor)
{
auto tensor = const_cast<Tensor *>(weight_tensor);
+ if (tensor->buffer() == nullptr) // ref is already 0?
+ {
+ _is_weights_freed = true;
+ return;
+ }
tensor->decrease_ref();
if (tensor->buffer() == nullptr) // ref == 0?
@@ -128,7 +146,7 @@ void FullyConnectedLayer::fullyConnectedHybrid()
_is_weights_freed = true;
}
}
-#endif // if 0
+#endif
#endif
}
@@ -167,7 +185,17 @@ void FullyConnectedLayer::run()
void FullyConnectedLayer::prepare()
{
+ if (_bias && _bias->is_constant())
+ {
+ const int bias_size = getTensorShape(_bias).FlatSize();
+ if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size))
+ {
+ _bias = nullptr;
+ }
+ }
+
#ifdef USE_RUY_GEMV
+#ifdef EXPERIMENTAL_RUY_FEATURE
// TODO This is workaround
// The only fc hybrid will use ruy kernel
if (_input->data_type() != OperandType::FLOAT32 ||
@@ -199,6 +227,7 @@ void FullyConnectedLayer::prepare()
}
}
#endif
+#endif
}
} // namespace ops
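
prepare() above drops a constant bias whose elements are all zero, since adding a zero vector changes nothing and the bias-free kernel path is cheaper. A standalone equivalent of that check, not the nnfw::cker::IsZeroVector implementation itself:

    #include <cstddef>

    // Returns true when all 'size' floats are exactly zero, mirroring the
    // intent of the IsZeroVector call used in prepare() above.
    static bool isZeroVector(const float *data, std::size_t size)
    {
      for (std::size_t i = 0; i < size; ++i)
        if (data[i] != 0.0f)
          return false;
      return true;
    }

    // Sketch: if the constant bias is all zeros, pretend there is no bias,
    // so run() executes FullyConnected without the bias add.
    // if (bias_is_constant && isZeroVector(bias_data, bias_size)) bias = nullptr;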
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
index dd5ef2436..e405b2476 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
@@ -72,6 +72,9 @@ private:
#ifdef USE_RUY_GEMV
uint8_t *_cached_weights = nullptr; // weights to be cached and a key
+#ifdef EXPERIMENTAL_RUY_FEATURE
+ bool _is_weights_freed = false; // is weights freed?
+#endif
#endif
};
diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
new file mode 100644
index 000000000..0d99b0586
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "L2NormLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/L2Normalize.h>
+#include <cker/Types.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output)
+{
+ assert(input != nullptr);
+ assert(output != nullptr);
+
+ _input = input;
+ _output = output;
+}
+
+void L2NormLayer::run()
+{
+ switch (_input->data_type())
+ {
+ case OperandType::FLOAT32:
+ nnfw::cker::L2NormalizeFloat32(
+ getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ break;
+
+ case OperandType::QUANT_UINT8_ASYMM:
+ {
+ nnfw::cker::L2NormParams params;
+ assert(_input->data_offset() == 128);
+ params.input_zero_point = _input->data_offset();
+ nnfw::cker::L2NormalizeQuant8(
+ params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ }
+ break;
+
+ default:
+ throw std::runtime_error{"L2Norm: Unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
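
For reference, the float path normalizes along the innermost axis as y_i = x_i / sqrt(sum_j x_j^2), and the quantized path additionally assumes a zero point of 128 (see the assert above). A sketch of the float semantics; the small epsilon guarding division by zero is an assumption here, not necessarily what the cker kernel does:

    #include <cmath>

    // Reference L2 normalization over the innermost dimension of size 'depth'.
    void l2NormalizeRef(const float *in, float *out, int outer_size, int depth)
    {
      for (int o = 0; o < outer_size; ++o)
      {
        const float *row = in + o * depth;
        float sum_sq = 0.0f;
        for (int d = 0; d < depth; ++d)
          sum_sq += row[d] * row[d];
        const float inv_norm = 1.0f / std::sqrt(sum_sq > 0.0f ? sum_sq : 1e-6f);
        for (int d = 0; d < depth; ++d)
          out[o * depth + d] = row[d] * inv_norm;
      }
    }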
diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h
new file mode 100644
index 000000000..63f2d1133
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+class L2NormLayer : public ::onert::exec::IFunction
+{
+public:
+ L2NormLayer() : _input(nullptr), _output(nullptr)
+ {
+ // Nothing
+ }
+
+public:
+ void configure(const IPortableTensor *input, IPortableTensor *output);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
index d71e325ac..06dde4fc4 100644
--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
+++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
@@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8()
// NYI
}
-void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis,
- Tensor *output)
+void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis,
+ IPortableTensor *output)
{
_input = input;
_output = output;
diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
index bc145cea7..ba9deca17 100644
--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
+++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
@@ -40,13 +40,14 @@ public:
void logsoftmaxQuant8();
- void configure(const Tensor *input, const float beta, const int axis, Tensor *output);
+ void configure(const IPortableTensor *input, const float beta, const int axis,
+ IPortableTensor *output);
void run();
private:
- const Tensor *_input;
- Tensor *_output;
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
float _beta;
int _axis;
diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h
index 8d29374ff..98385521a 100644
--- a/runtime/onert/backend/cpu/ops/OperationUtils.h
+++ b/runtime/onert/backend/cpu/ops/OperationUtils.h
@@ -52,6 +52,17 @@ union DataPtr {
void *v;
};
+union ConstDataPtr {
+ const uint8_t *u8;
+ const int8_t *i8;
+ const uint32_t *u32;
+ const int32_t *i32;
+ const bool *b;
+ const float *f;
+ const int64_t *i64;
+ const void *v;
+};
+
uint32_t getNumberOfDimensions(const IPortableTensor *tensor);
uint32_t getNumberOfElements(const IPortableTensor *tensor);
diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc
index fcfcf7b5e..6a2bf9da0 100644
--- a/runtime/onert/backend/cpu/ops/PadLayer.cc
+++ b/runtime/onert/backend/cpu/ops/PadLayer.cc
@@ -33,33 +33,40 @@ PadLayer::PadLayer()
// DO NOTHING
}
-void PadLayer::padFloat32()
+template <typename T> void PadLayer::padImpl(const T *constant_value_data)
{
- nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()), _constantValueData.f);
+ nnfw::cker::Pad<T>(_padData, _padRank, getTensorShape(_input),
+ reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
+ reinterpret_cast<T *>(_output->buffer()), constant_value_data);
}
-void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); }
void PadLayer::configure(const IPortableTensor *input, IPortableTensor *output,
- const int32_t *padData, int32_t padRank, uint8_t *constantValueData)
+ const int32_t *padData, int32_t padRank, const void *constantValueData)
{
_input = input;
_output = output;
memcpy(_padData, padData, sizeof(_padData));
_padRank = padRank;
- _constantValueData.u8 = constantValueData;
+ _constantValueData.v = constantValueData;
}
void PadLayer::run()
{
if (_input->data_type() == OperandType::FLOAT32)
{
- padFloat32();
+ padImpl<float>(_constantValueData.f);
}
else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
{
- padQuant8();
+ if (_constantValueData.u8 == nullptr)
+ {
+ uint8_t pad_value = static_cast<uint8_t>(_output->data_offset());
+ padImpl<uint8_t>(&pad_value);
+ }
+ else
+ {
+ padImpl<uint8_t>(_constantValueData.u8);
+ }
}
else
{
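
When no VALUE operand is supplied, run() above pads uint8 tensors with the output zero point, because under asymmetric quantization (real = scale * (q - zero_point)) the zero point is the encoding of real 0.0, the conventional default pad value. A compressed restatement of that choice; `explicit_value_given`, `constant_value_u8`, and `output_zero_point` are placeholder names:

    // Sketch of pad-value selection for QUANT_UINT8_ASYMM.
    uint8_t pad_value = explicit_value_given
                            ? *constant_value_u8                       // VALUE operand present
                            : static_cast<uint8_t>(output_zero_point); // encodes real 0.0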
diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h
index 85bd2e6f0..efd73d5e5 100644
--- a/runtime/onert/backend/cpu/ops/PadLayer.h
+++ b/runtime/onert/backend/cpu/ops/PadLayer.h
@@ -39,12 +39,10 @@ public:
PadLayer();
public:
- void padFloat32();
-
- void padQuant8();
+ template <typename T> void padImpl(const T *constant_value_data);
void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData,
- int32_t padRank, uint8_t *constantValueData = nullptr);
+ int32_t padRank, const void *constantValueData = nullptr);
void run() override;
@@ -54,7 +52,7 @@ private:
int32_t _padData[8];
int32_t _padRank;
- DataPtr _constantValueData;
+ ConstDataPtr _constantValueData;
};
} // namespace ops
diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
new file mode 100644
index 000000000..45fc148bf
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizeLayer.h"
+
+#include <cker/operation/Quantize.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+template <typename InputT, typename OutputT> void QuantizeLayer::affineQuantize()
+{
+ nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast<const InputT *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<OutputT *>(_output->buffer()),
+ _output->data_scale(), _output->data_offset());
+}
+
+void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output)
+{
+ _input = input;
+ _output = output;
+}
+
+void QuantizeLayer::run()
+{
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+ affineQuantize<float, uint8_t>();
+ }
+ else
+ {
+ throw std::runtime_error{"Quantize: unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
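
affineQuantize() delegates to nnfw::cker::Quantize with the output scale and zero point; for uint8 output the underlying affine mapping is q = clamp(round(x / scale) + zero_point, 0, 255). A standalone reference version of that mapping, as a sketch rather than the cker code:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Reference float -> uint8 affine quantization.
    void quantizeRef(const float *in, uint8_t *out, int size, float scale, int zero_point)
    {
      for (int i = 0; i < size; ++i)
      {
        const int q = static_cast<int>(std::lround(in[i] / scale)) + zero_point;
        out[i] = static_cast<uint8_t>(std::min(255, std::max(0, q)));
      }
    }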
diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
new file mode 100644
index 000000000..b4e7aca40
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
+
+#include <backend/IPortableTensor.h>
+#include "OperationUtils.h"
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class QuantizeLayer : public ::onert::exec::IFunction
+{
+public:
+ QuantizeLayer();
+
+public:
+ template <typename InputT, typename OutputT> void affineQuantize();
+
+ void configure(const IPortableTensor *input, IPortableTensor *output);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc
index a9106c1a2..449c073e6 100644
--- a/runtime/onert/backend/cpu/ops/SliceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc
@@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b
}
}
-void SliceLayer::sliceFloat32()
+template <typename T> void SliceLayer::sliceImpl()
{
const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize;
@@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32()
}
nnfw::cker::Slice(op_params, getExtendedTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()),
- reinterpret_cast<float *>(_output->buffer()));
-}
-
-void SliceLayer::sliceQuant8()
-{
- // cker quant8 slice is not implemented yet
- throw std::runtime_error{"NYI"};
+ reinterpret_cast<const T *>(_input->buffer()),
+ reinterpret_cast<T *>(_output->buffer()));
}
void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin,
@@ -97,11 +91,11 @@ void SliceLayer::run()
{
if (_input->data_type() == OperandType::FLOAT32)
{
- sliceFloat32();
+ sliceImpl<float>();
}
else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
{
- sliceQuant8();
+ sliceImpl<uint8_t>();
}
else
{
diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h
index 9945d7ee6..650e2c97a 100644
--- a/runtime/onert/backend/cpu/ops/SliceLayer.h
+++ b/runtime/onert/backend/cpu/ops/SliceLayer.h
@@ -42,8 +42,7 @@ public:
void run() override;
private:
- void sliceFloat32();
- void sliceQuant8();
+ template <typename T> void sliceImpl();
template <typename T>
void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin,
diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
new file mode 100644
index 000000000..110b0bc92
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SpaceToDepthLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/SpaceToDepth.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+template <typename T> void SpaceToDepthLayer::spaceToDepth()
+{
+ nnfw::cker::SpaceToDepthParams params;
+ params.block_size = _block_size;
+
+ nnfw::cker::SpaceToDepth(params, getTensorShape(_input),
+ reinterpret_cast<const T *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()));
+}
+
+void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size,
+ IPortableTensor *output)
+{
+ _input = input;
+ _block_size = block_size;
+ _output = output;
+}
+
+void SpaceToDepthLayer::run()
+{
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+ spaceToDepth<float>();
+ }
+ else
+ {
+ throw std::runtime_error{"SpaceToDepth: unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
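
SpaceToDepth rearranges non-overlapping block_size x block_size spatial blocks into the channel dimension, so an NHWC input of shape [N, H, W, C] becomes [N, H/bs, W/bs, C*bs*bs]. A reference loop for those semantics, following the TensorFlow Lite layout that this kernel is expected to match; it is a sketch, not the cker implementation:

    // Reference NHWC SpaceToDepth; 'bs' is the block size and must divide H and W.
    void spaceToDepthRef(const float *in, float *out, int N, int H, int W, int C, int bs)
    {
      const int out_h = H / bs, out_w = W / bs, out_c = C * bs * bs;
      for (int n = 0; n < N; ++n)
        for (int h = 0; h < H; ++h)
          for (int w = 0; w < W; ++w)
            for (int c = 0; c < C; ++c)
            {
              const int oh = h / bs, ow = w / bs;
              const int oc = c + ((h % bs) * bs + (w % bs)) * C;
              const int in_idx = ((n * H + h) * W + w) * C + c;
              const int out_idx = ((n * out_h + oh) * out_w + ow) * out_c + oc;
              out[out_idx] = in[in_idx];
            }
    }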
diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h
new file mode 100644
index 000000000..c11ef2b0a
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+class SpaceToDepthLayer : public ::onert::exec::IFunction
+{
+public:
+ SpaceToDepthLayer();
+
+ void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output);
+
+ void run() override;
+
+private:
+ template <typename T> void spaceToDepth();
+
+ const IPortableTensor *_input;
+ int32_t _block_size;
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h
index a49525ba7..b760cda0e 100644
--- a/runtime/onert/core/include/backend/ITensorBuilder.h
+++ b/runtime/onert/core/include/backend/ITensorBuilder.h
@@ -112,12 +112,12 @@ public: // methods for static tensor allocation
virtual std::shared_ptr<ITensor> tensorAt(const ir::OperandIndex &ind) = 0;
/**
- * @brief Set the External Tensor object
+ * @brief Set the migrant tensor object
*
* @return true if succeeded
* @return false if failed or unsupported
*/
- virtual bool setExternalTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &)
+ virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &)
{
return false;
}
diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h
index f5a95f49c..855513124 100644
--- a/runtime/onert/core/include/backend/ITensorRegistry.h
+++ b/runtime/onert/core/include/backend/ITensorRegistry.h
@@ -35,17 +35,22 @@ struct ITensorRegistry
virtual ~ITensorRegistry() = default;
/**
- * @brief Returns pointer of ITensor among managed and external tensors
+ * @brief Returns pointer of ITensor among native and migrant tensors
+ *
+ * A native tensor is a tensor that is managed by this backend.
+ * A migrant tensor is a tensor that is imported from another backend.
+ *
* @note Return tensor cannot be used longer than dynamic tensor manager
*/
virtual std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &) = 0;
/**
- * @brief Returns pointer of ITensor among managed tensors
+ * @brief Returns pointer of ITensor among native tensors
*
- * Unlike @c getITensor , this function only searches from managed tensors
- * @note Return tensor cannot be used longer than dynamic tensor manager
+ * Unlike @c getITensor , this function only searches from native tensors
+ *
+ * @note Returned tensor cannot be used longer than dynamic tensor manager
*/
- virtual std::shared_ptr<ITensor> getManagedITensor(const ir::OperandIndex &) = 0;
+ virtual std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &) = 0;
};
} // namespace backend
@@ -73,68 +78,67 @@ public:
std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override
{
static_assert(std::is_base_of<ITensor, T_Tensor>::value, "T_Tensor must derive from ITensor.");
- auto external_tensor = _external.find(ind);
- if (external_tensor != _external.end())
+ auto external_tensor = _migrant.find(ind);
+ if (external_tensor != _migrant.end())
return external_tensor->second;
- return getManagedTensor(ind);
+ return getNativeTensor(ind);
}
- std::shared_ptr<ITensor> getManagedITensor(const ir::OperandIndex &ind) override
+ std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override
{
- return getManagedTensor(ind);
+ return getNativeTensor(ind);
}
std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind)
{
- auto external_tensor = _external.find(ind);
- if (external_tensor != _external.end())
+ auto external_tensor = _migrant.find(ind);
+ if (external_tensor != _migrant.end())
{
if (external_tensor->second)
return external_tensor->second;
}
- return getManagedTensor(ind);
+ return getNativeTensor(ind);
}
- std::shared_ptr<T_Tensor> getManagedTensor(const ir::OperandIndex &ind)
+ std::shared_ptr<T_Tensor> getNativeTensor(const ir::OperandIndex &ind)
{
- auto tensor = _managed.find(ind);
- if (tensor != _managed.end())
+ auto tensor = _native.find(ind);
+ if (tensor != _native.end())
return tensor->second;
return nullptr;
}
- bool setExternalTensor(const ir::OperandIndex &ind,
- const std::shared_ptr<IPortableTensor> &tensor)
+ bool setMigrantTensor(const ir::OperandIndex &ind, const std::shared_ptr<IPortableTensor> &tensor)
{
// TODO Uncomment this as two tensors for an index is not allowed.
// But now it is temporarily allowed as a workaround. External one hides Managed one.
- // auto itr = _managed.find(ind);
- // if (itr != _managed.end() && itr->second != nullptr && tensor != nullptr)
+ // auto itr = _native.find(ind);
+ // if (itr != _native.end() && itr->second != nullptr && tensor != nullptr)
// throw std::runtime_error{
- // "Tried to set an external tensor but an managed tensor already exists."};
- _external[ind] = tensor;
+ // "Tried to set an migrant tensor but an native tensor already exists."};
+ _migrant[ind] = tensor;
return true;
}
- void setManagedTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor)
+ void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor)
{
- auto itr = _external.find(ind);
- if (itr != _external.end() && itr->second != nullptr && tensor != nullptr)
+ auto itr = _migrant.find(ind);
+ if (itr != _migrant.end() && itr->second != nullptr && tensor != nullptr)
throw std::runtime_error{
- "Tried to set a managed tensor but an external tensor already exists."};
- _managed[ind] = tensor;
+ "Tried to set a native tensor but an migrant tensor already exists."};
+ _native[ind] = tensor;
}
- const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &managed_tensors() { return _managed; }
+ const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &native_tensors() { return _native; }
- const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &external_tensors()
+ const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &migrant_tensors()
{
- return _external;
+ return _migrant;
}
private:
- ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _external;
- ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _managed;
+ ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _migrant;
+ ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _native;
};
} // namespace backend
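
The renamed registry keeps two maps with a fixed lookup order: getITensor() and getPortableTensor() consult the migrant map first and fall back to native tensors, while getNativeITensor() looks only at native tensors. An illustrative restatement of that contract; `reg`, `ind`, `native_t`, and `migrant_t` are placeholders:

    reg.setNativeTensor(ind, native_t);       // tensor owned by this backend
    assert(reg.getITensor(ind) == native_t);  // no migrant entry yet -> native wins

    reg.setMigrantTensor(ind, migrant_t);     // imported from another backend
    assert(reg.getITensor(ind) == migrant_t);       // migrant shadows native (workaround above)
    assert(reg.getNativeITensor(ind) == native_t);  // native view is unchanged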
diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
index 6ddacc7bc..a7e034a91 100644
--- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
+++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
@@ -19,7 +19,7 @@
#include "MemoryManager.h"
-#include "backend/ITensorManager.h"
+#include "backend/IStaticTensorManager.h"
#include "ir/OperandIndexMap.h"
#include "ir/OperandInfo.h"
#include "TensorRegistry.h"
@@ -31,7 +31,7 @@ namespace backend
namespace cpu_common
{
-class StaticTensorManager : public backend::ITensorManager
+class StaticTensorManager : public backend::IStaticTensorManager
{
public:
StaticTensorManager(const std::shared_ptr<TensorRegistry> &reg);
diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h
index 379143baf..b3391a3da 100644
--- a/runtime/onert/core/include/compiler/StaticShapeInference.h
+++ b/runtime/onert/core/include/compiler/StaticShapeInference.h
@@ -99,6 +99,7 @@ private:
void visit(const ir::operation::LogicalNot &op) override;
void visit(const ir::operation::LogicalOr &op) override;
void visit(const ir::operation::Logistic &op) override;
+ void visit(const ir::operation::L2Normalization &op) override;
void visit(const ir::operation::MatrixBandPart &op) override;
void visit(const ir::operation::Max &op) override;
void visit(const ir::operation::Min &op) override;
diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h
index 113c34809..601c1bfb3 100644
--- a/runtime/onert/core/include/exec/DynamicShapeInference.h
+++ b/runtime/onert/core/include/exec/DynamicShapeInference.h
@@ -72,6 +72,7 @@ public:
void visit(const ir::operation::LogicalNot &op) override;
void visit(const ir::operation::LogicalOr &op) override;
void visit(const ir::operation::Logistic &op) override;
+ void visit(const ir::operation::L2Normalization &op) override;
void visit(const ir::operation::MatrixBandPart &op) override;
void visit(const ir::operation::Max &op) override;
void visit(const ir::operation::Min &op) override;
diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h
index 5fac54e26..e3b5d19f5 100644
--- a/runtime/onert/core/include/ir/Operations.Include.h
+++ b/runtime/onert/core/include/ir/Operations.Include.h
@@ -103,3 +103,4 @@
#include "ir/operation/BatchMatMul.h"
#include "ir/operation/FusedBatchNorm.h"
#include "ir/operation/LogSoftmax.h"
+#include "ir/operation/Quantize.h"
diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst
index 9d0642fba..03a2aa21e 100644
--- a/runtime/onert/core/include/ir/Operations.lst
+++ b/runtime/onert/core/include/ir/Operations.lst
@@ -106,3 +106,4 @@ OP(MatrixBandPart)
OP(BatchMatMul)
OP(FusedBatchNorm)
OP(LogSoftmax)
+OP(Quantize)
diff --git a/runtime/onert/core/include/ir/operation/LogSoftmax.h b/runtime/onert/core/include/ir/operation/LogSoftmax.h
index 26a92d7f8..391b4ba4a 100644
--- a/runtime/onert/core/include/ir/operation/LogSoftmax.h
+++ b/runtime/onert/core/include/ir/operation/LogSoftmax.h
@@ -48,7 +48,7 @@ public:
public:
void accept(OperationVisitor &v) const override;
- OpCode opcode() const final { return OpCode::Softmax; }
+ OpCode opcode() const final { return OpCode::LogSoftmax; }
public:
const Param &param() const { return _param; }
diff --git a/runtime/onert/core/include/ir/operation/Pad.h b/runtime/onert/core/include/ir/operation/Pad.h
index a48606196..00481cd50 100644
--- a/runtime/onert/core/include/ir/operation/Pad.h
+++ b/runtime/onert/core/include/ir/operation/Pad.h
@@ -33,7 +33,7 @@ public:
{
INPUT = 0,
PAD = 1,
- // VALUE = 2 Not allow padding value operand yet
+ VALUE = 2
};
public:
diff --git a/runtime/onert/core/include/ir/operation/Quantize.h b/runtime/onert/core/include/ir/operation/Quantize.h
new file mode 100644
index 000000000..2533ce432
--- /dev/null
+++ b/runtime/onert/core/include/ir/operation/Quantize.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_IR_OPERATION_QUANTIZE_H__
+#define __ONERT_IR_OPERATION_QUANTIZE_H__
+
+#include "ir/Operation.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+
+class Quantize : public Operation
+{
+public:
+ enum Input
+ {
+ INPUT = 0,
+ };
+
+public:
+ Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs);
+
+public:
+ void accept(OperationVisitor &v) const override;
+ OpCode opcode() const final { return OpCode::Quantize; }
+};
+
+} // namespace operation
+} // namespace ir
+} // namespace onert
+
+#endif // __ONERT_IR_OPERATION_QUANTIZE_H__
diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
index 32a80412b..c374abaee 100644
--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
+++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
@@ -36,7 +36,7 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<cpu_common::Ten
void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape)
{
// NOTE Handle user tensors first
- auto user_tensor = _user_tensors->getManagedTensor(ind);
+ auto user_tensor = _user_tensors->getNativeTensor(ind);
if (user_tensor)
{
// User tensors cannot be reallocated.
@@ -47,8 +47,8 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha
user_tensor->setShape(new_shape);
}
- // NOTE Then handle managed tensors
- auto tensor = _tensors->getManagedTensor(ind);
+ // NOTE Then handle native tensors
+ auto tensor = _tensors->getNativeTensor(ind);
assert(tensor);
bool previously_dynamic = tensor->is_dynamic();
@@ -101,9 +101,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
const ir::OperandInfo &tensor_info,
ir::Layout backend_layout)
{
- assert(_tensors->getManagedTensor(ind) == nullptr);
+ assert(_tensors->getNativeTensor(ind) == nullptr);
auto tensor = std::make_shared<cpu_common::Tensor>(tensor_info, backend_layout);
- _tensors->setManagedTensor(ind, tensor);
+ _tensors->setNativeTensor(ind, tensor);
}
void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
@@ -130,7 +130,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
auto &input_set = find->second;
for (auto input_ind : input_set)
{
- if (!_tensors->getManagedTensor(input_ind)->is_dynamic())
+ if (!_tensors->getNativeTensor(input_ind)->is_dynamic())
continue;
_dynamic_mem_mgr->deallocate(input_ind);
@@ -141,7 +141,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
{
- if (!_tensors->getManagedTensor(output_ind)->is_dynamic())
+ if (!_tensors->getNativeTensor(output_ind)->is_dynamic())
return;
_dynamic_mem_mgr->deallocate(output_ind);
diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
index 4b683fb58..eb83b7de4 100644
--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
+++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
@@ -81,23 +81,23 @@ void KernelGenerator::visit(const ir::operation::If &node)
std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
for (const auto input_index : node.getInputs())
{
- auto input_alloc = getTensor(input_index);
+ auto input_tensor = getTensor(input_index);
- input_tensors.emplace_back(input_alloc);
+ input_tensors.emplace_back(input_tensor);
}
std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
exec::DynAllocInfoMap outputs_dyn_alloc_info;
for (const auto output_index : node.getOutputs())
{
- auto output_alloc = getTensor(output_index);
+ auto output_tensor = getTensor(output_index);
- output_tensors.emplace_back(output_alloc);
+ output_tensors.emplace_back(output_tensor);
const auto output_tensor_builder = getTensorBuilder(output_index);
if (output_tensor_builder->supportDynamicTensor())
{
auto output_dyn_manager = output_tensor_builder->dynamicTensorManager();
- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager};
+ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager};
}
}
@@ -146,24 +146,24 @@ void KernelGenerator::visit(const ir::operation::While &node)
std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
for (const auto input_index : node.getInputs())
{
- auto input_alloc = getTensor(input_index);
+ auto input_tensor = getTensor(input_index);
- input_tensors.emplace_back(input_alloc);
+ input_tensors.emplace_back(input_tensor);
}
std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info;
for (const auto output_index : node.getOutputs())
{
- auto output_alloc = getTensor(output_index);
+ auto output_tensor = getTensor(output_index);
- output_tensors.emplace_back(output_alloc);
+ output_tensors.emplace_back(output_tensor);
const auto output_tensor_builder = getTensorBuilder(output_index);
if (output_tensor_builder->supportDynamicTensor())
{
auto output_dyn_manager = output_tensor_builder->dynamicTensorManager();
- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager};
+ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager};
}
}
@@ -199,7 +199,7 @@ KernelGenerator::getTensorBuilder(const ir::OperandIndex &index)
for (auto tensor_builder : _tensor_builder_set)
{
auto reg = tensor_builder->tensorRegistry();
- auto tensor = reg ? reg->getManagedITensor(index) : tensor_builder->tensorAt(index);
+ auto tensor = reg ? reg->getNativeITensor(index) : tensor_builder->tensorAt(index);
if (tensor)
{
ret = tensor_builder;
diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
index 16cd3ec63..5bddb9185 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
+++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
@@ -92,7 +92,7 @@ void TensorBuilder::allocate()
std::shared_ptr<ITensor> TensorBuilder::tensorAt(const ir::OperandIndex &ind)
{
// NOTE Find from User Tensor Registry first
- // FIXME There may be both user tensor and managed tensor for a `ind` which is a waste
+ // FIXME There may be both user tensor and native tensor for a `ind` which is a waste
auto user_tensor = _user_tensor_reg->getITensor(ind);
auto tensor = _tensor_reg->getITensor(ind);
if (user_tensor)
@@ -107,7 +107,7 @@ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->ite
std::shared_ptr<cpu_common::Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
{
- return _tensor_reg->getManagedTensor(ind);
+ return _tensor_reg->getNativeTensor(ind);
}
std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
@@ -123,7 +123,7 @@ std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void)
void TensorBuilder::setUserTensor(const ir::OperandIndex &ind,
const std::shared_ptr<UserTensor> &tensor)
{
- _user_tensor_reg->setManagedTensor(ind, tensor);
+ _user_tensor_reg->setNativeTensor(ind, tensor);
}
} // namespace controlflow
diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h
index ce94ea028..b9b2d52b7 100644
--- a/runtime/onert/core/src/backend/controlflow/UserTensor.h
+++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h
@@ -68,6 +68,7 @@ public:
void set_dynamic() override { _dynamic = true; }
ir::Shape getShape() const override { return _info.shape(); }
void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); }
+ bool is_constant() const override { return false; }
private:
ir::OperandInfo _info;
diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
index 0ccf7000b..ede403b59 100644
--- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
+++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
@@ -35,7 +35,7 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha
{
VERBOSE_F() << ind << std::endl;
- auto tensor = _tensors->getManagedTensor(ind);
+ auto tensor = _tensors->getNativeTensor(ind);
assert(tensor);
bool previously_dynamic = tensor->is_dynamic();
@@ -88,9 +88,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
const ir::OperandInfo &tensor_info,
ir::Layout backend_layout)
{
- assert(_tensors->getManagedTensor(ind) == nullptr);
+ assert(_tensors->getNativeTensor(ind) == nullptr);
auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
- _tensors->setManagedTensor(ind, tensor);
+ _tensors->setNativeTensor(ind, tensor);
}
void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
@@ -117,7 +117,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
auto &input_set = find->second;
for (auto input_ind : input_set)
{
- auto *tensor = _tensors->getManagedTensor(input_ind).get();
+ auto *tensor = _tensors->getNativeTensor(input_ind).get();
if (!tensor->is_dynamic())
continue;
@@ -131,7 +131,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
{
- auto *tensor = _tensors->getManagedTensor(output_ind).get();
+ auto *tensor = _tensors->getNativeTensor(output_ind).get();
if (!tensor->is_dynamic())
return;
diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
index 47bea35df..8604542eb 100644
--- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
+++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
@@ -33,7 +33,7 @@ StaticTensorManager::StaticTensorManager(const std::shared_ptr<TensorRegistry> &
void StaticTensorManager::allocateConsts(void)
{
- for (auto &pair : _tensors->managed_tensors())
+ for (auto &pair : _tensors->native_tensors())
{
const auto &ind = pair.first;
auto tensor = pair.second;
@@ -42,9 +42,9 @@ void StaticTensorManager::allocateConsts(void)
auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size());
tensor->setBuffer(mem_alloc);
auto buffer = mem_alloc->base();
- VERBOSE(CPU_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value()
- << "): " << static_cast<void *>(buffer)
- << "size : " << tensor->total_size() << std::endl;
+ VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value()
+ << "): " << static_cast<void *>(buffer)
+ << "size : " << tensor->total_size() << std::endl;
}
}
}
@@ -53,7 +53,7 @@ void StaticTensorManager::allocateNonconsts(void)
{
_nonconst_mgr->allocate();
- for (auto &pair : _tensors->managed_tensors())
+ for (auto &pair : _tensors->native_tensors())
{
const auto &ind = pair.first;
auto tensor = pair.second;
@@ -62,8 +62,8 @@ void StaticTensorManager::allocateNonconsts(void)
auto *buffer = _nonconst_mgr->getBuffer(ind);
tensor->setBuffer(buffer);
- VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value()
- << "): " << static_cast<void *>(buffer) << std::endl;
+ VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value()
+ << "): " << static_cast<void *>(buffer) << std::endl;
}
}
}
@@ -76,18 +76,18 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
const ir::OperandInfo &tensor_info, ir::Layout backend_layout,
bool as_const)
{
- assert(!_tensors->getManagedTensor(ind));
+ assert(!_tensors->getNativeTensor(ind));
auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
- _tensors->setManagedTensor(ind, tensor);
+ _tensors->setNativeTensor(ind, tensor);
_as_constants[ind] = as_const;
}
void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
{
- assert(_tensors->getManagedTensor(ind));
+ assert(_tensors->getNativeTensor(ind));
// This method is called only when a tensor has proper shape
- assert(!_tensors->getManagedTensor(ind)->is_dynamic());
+ assert(!_tensors->getNativeTensor(ind)->is_dynamic());
if (!_as_constants[ind])
_nonconst_mgr->claimPlan(ind, size);
@@ -95,10 +95,10 @@ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
{
- assert(_tensors->getManagedTensor(ind));
+ assert(_tensors->getNativeTensor(ind));
// This method is called only when a tensor has proper shape
- assert(!_tensors->getManagedTensor(ind)->is_dynamic());
+ assert(!_tensors->getNativeTensor(ind)->is_dynamic());
if (!_as_constants[ind])
_nonconst_mgr->releasePlan(ind);
@@ -106,7 +106,7 @@ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
void StaticTensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn)
{
- for (const auto &it : _tensors->managed_tensors())
+ for (const auto &it : _tensors->native_tensors())
fn(it.first);
}
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
index f3f69ad1a..8439b6a11 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -201,18 +201,35 @@ ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
// Add tensor to controlflow TensorRegistry.
cf_tensor_builder->setUserTensor(ind, tensor);
ret.push_back(tensor);
-
- // Set other tensors as external tensors
- for (auto &tensor_builder : tensor_builders)
- {
- // FIXME This is a workaround registering all user tensors to all backends
- // FIXME Handle when it is failed
- tensor_builder->setExternalTensor(ind, tensor);
- }
}
return ret;
}
+void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph,
+ TensorBuilders &tensor_builders)
+{
+ lowered_graph.op_seqs().iterate(
+ [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
+ auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
+ auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
+ for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
+ ir::Remove::UNDEFINED)
+ {
+ // If an OpSequence input/output tensor does not have its own tensor object,
+ // it must be using an external tensor, so find the tensor in the other tensor builders and
+ // register it with this tensor builder if it is portable
+ if (!backend_ctx->tensor_builder->tensorAt(ind))
+ {
+ auto tensor = tensor_builders.getITensor(ind);
+ assert(tensor); // The tensor must have been created in one of TensorBuilders
+ auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
+ if (ptensor)
+ backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor);
+ }
+ }
+ });
+}
+
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
const compiler::CompilerOptions &options,
@@ -265,6 +282,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_
tensor_builder->prepare();
}
+ prepareExternalTensors(*lowered_graph, tensor_builders);
+
ExecutionBuilder builder;
// Generate kernels
@@ -367,6 +386,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
tensor_builder->prepare();
}
+ prepareExternalTensors(*lowered_graph, tensor_builders);
+
ExecutionBuilder builder;
// Generate kernels
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h
index 1e82b9838..418e5a764 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.h
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.h
@@ -22,6 +22,7 @@
#include "backend/ITensor.h"
#include "exec/IExecutor.h"
#include "ir/LoweredGraph.h"
+#include "TensorBuilders.h"
namespace onert
{
@@ -48,6 +49,8 @@ private:
static std::vector<std::shared_ptr<backend::ITensor>>
initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
const ir::OperandIndexSequence &indices);
+ static void prepareExternalTensors(ir::LoweredGraph &lowered_graph,
+ TensorBuilders &tensor_builders);
static exec::IExecutor *
createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
const compiler::CompilerOptions &options,
diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h
index f5075390d..d8ceca9c8 100644
--- a/runtime/onert/core/src/compiler/HEScheduler.h
+++ b/runtime/onert/core/src/compiler/HEScheduler.h
@@ -51,16 +51,12 @@ public:
* @param[in] backend_resolver backend resolver
*/
HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options)
- : _backend_contexts{backend_contexts}, _is_supported{}, _backends_avail_time{}, _ops_eft{},
+ : _is_supported{}, _backends_avail_time{}, _ops_eft{},
_op_to_rank{std::make_shared<ir::OperationIndexMap<int64_t>>()},
_is_profiling_mode{options.he_profiling_mode},
_is_linear_exec{options.executor == "Linear"},
_is_parallel_exec{options.executor == "Parallel"}
{
- // Workaround to avoid unused-private-field warning
- // TODO use _backend_contexts and remove workaround
- (void)_backend_contexts;
-
for (auto &entry : backend_contexts)
{
_all_backends.push_back(entry.first);
@@ -165,7 +161,6 @@ private:
// whether it should assign these backends to these nodes:
// * It stores false for unsupported nodes
// * During rank calculation with enabled profiling mode it stores true for supported nodes
- const backend::BackendContexts &_backend_contexts;
std::unordered_map<const backend::Backend *, std::unordered_map<std::string, bool>> _is_supported;
// Finishing and starting time of each backend
std::unordered_map<const backend::Backend *, std::map<int64_t, int64_t>> _backends_avail_time;
@@ -175,8 +170,7 @@ private:
std::unique_ptr<compiler::BackendResolver> _backend_resolver;
std::unique_ptr<exec::ExecTime> _exec_time;
const ir::Graph *_graph{nullptr};
- std::vector<const backend::Backend *>
- _all_backends; // TODO Remove this and use _backend_contexts instead
+ std::vector<const backend::Backend *> _all_backends;
const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend
bool _is_profiling_mode;
bool _is_linear_exec;
diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc
index 5c545aedd..fa5ee27db 100644
--- a/runtime/onert/core/src/compiler/OperationValidator.cc
+++ b/runtime/onert/core/src/compiler/OperationValidator.cc
@@ -41,6 +41,21 @@ OperationValidator::OperationValidator(const ir::Graph &graph)
{
}
+void OperationValidator::checkUnaryOp(const ir::Operation &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(0)};
+
+ // Check if I/O types match
+ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
+
+ if (_ctx.at(output_index).info().isDynamic())
+ return;
+
+ // Check if I/O shapes match
+ OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+}
+
void OperationValidator::operator()()
{
// There is no reason for each subgraph to have subgraphs since compiler has subgraphs when
@@ -53,16 +68,7 @@ void OperationValidator::operator()()
[&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
}
-void OperationValidator::visit(const ir::operation::Abs &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
-
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Abs &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::AvgPool2D &node)
{
@@ -292,17 +298,7 @@ void OperationValidator::visit(const ir::operation::RNN &node)
num_units == _ctx.at(hidden_state_out_index).shape().dim(1));
}
-void OperationValidator::visit(const ir::operation::Round &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::Round::Input::INPUT)};
-
- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
- if (_ctx.at(output_index).info().isDynamic())
- return;
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Round &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::SpaceToBatchND &node)
{
@@ -393,17 +389,7 @@ void OperationValidator::visit(const ir::operation::EmbeddingLookup &node)
}
}
-void OperationValidator::visit(const ir::operation::Exp &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
-
- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
- if (_ctx.at(output_index).info().isDynamic())
- return;
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Exp &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::ExpandDims &node)
{
@@ -419,17 +405,7 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node)
OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1);
}
-void OperationValidator::visit(const ir::operation::Floor &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
-
- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
- if (_ctx.at(output_index).info().isDynamic())
- return;
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Floor &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::HashtableLookup &node)
{
@@ -789,6 +765,25 @@ void OperationValidator::visit(const ir::operation::LSTM &node)
}
}
+void OperationValidator::visit(const ir::operation::L2Normalization &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ if (_ctx.at(ofm_index).info().isDynamic())
+ return;
+
+ const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
+
+ auto ifm_shape = _ctx.at(ifm_index).shape();
+ auto ofm_shape = _ctx.at(ofm_index).shape();
+
+ OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank());
+
+ for (auto i = 0; i < ifm_shape.rank(); i++)
+ {
+ OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i));
+ }
+}
+
void OperationValidator::visit(const ir::operation::Unpack &node)
{
const auto num{node.param().num};
@@ -904,35 +899,11 @@ void OperationValidator::visit(const ir::operation::Split &node)
OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0);
}
-void OperationValidator::visit(const ir::operation::Cos &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
-
-void OperationValidator::visit(const ir::operation::Sin &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
+void OperationValidator::visit(const ir::operation::Cos &node) { checkUnaryOp(node); }
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Sin &node) { checkUnaryOp(node); }
-void OperationValidator::visit(const ir::operation::RSQRT &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::RSQRT &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::Shape &node)
{
@@ -972,35 +943,11 @@ void OperationValidator::visit(const ir::operation::While &node)
// TODO Add to validate with subgraphs
}
-void OperationValidator::visit(const ir::operation::Neg &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
+void OperationValidator::visit(const ir::operation::Neg &node) { checkUnaryOp(node); }
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Log &node) { checkUnaryOp(node); }
-void OperationValidator::visit(const ir::operation::Log &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
-
-void OperationValidator::visit(const ir::operation::LogicalNot &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::LogicalNot &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::SquaredDifference &node)
{
@@ -1118,5 +1065,25 @@ void OperationValidator::visit(const ir::operation::LogSoftmax &node)
OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
}
+
+void OperationValidator::visit(const ir::operation::Quantize &node)
+{
+ VERBOSE(Quantize) << "Configure Quantize operation" << std::endl;
+
+ OP_REQUIRES(node.getInputs().size() == 1);
+ OP_REQUIRES(node.getOutputs().size() == 1);
+
+ const auto input_index{node.getInputs().at(0)};
+ const auto output_index{node.getOutputs().at(0)};
+
+ OP_REQUIRES(_ctx.at(input_index).typeInfo().type() == ir::DataType::FLOAT32);
+
+ if (_ctx.at(output_index).info().isDynamic())
+ return;
+
+ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM);
+
+ OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
+}
} // namespace compiler
} // namespace onert
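For reference, the FLOAT32 -> QUANT_UINT8_ASYMM conversion that the Quantize validator above admits is conventionally computed per element as sketched below. This is a minimal illustration of standard asymmetric quantization, not code from this patch; the scale and zero_point parameters are assumed to come from the output tensor's type info.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch only: q = clamp(round(x / scale) + zero_point, 0, 255)
inline uint8_t quantize_asymm_u8(float x, float scale, int32_t zero_point)
{
  const int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}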
diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h
index 6ceafe8b1..55a4dd508 100644
--- a/runtime/onert/core/src/compiler/OperationValidator.h
+++ b/runtime/onert/core/src/compiler/OperationValidator.h
@@ -70,6 +70,7 @@ public:
void visit(const ir::operation::DepthToSpace &node) override;
void visit(const ir::operation::Pack &node) override;
void visit(const ir::operation::LSTM &node) override;
+ void visit(const ir::operation::L2Normalization &node) override;
void visit(const ir::operation::Unpack &node) override;
void visit(const ir::operation::Pad &node) override;
void visit(const ir::operation::Min &node) override;
@@ -93,9 +94,10 @@ public:
void visit(const ir::operation::Range &node) override;
void visit(const ir::operation::MatrixBandPart &node) override;
void visit(const ir::operation::LogSoftmax &node) override;
+ void visit(const ir::operation::Quantize &node) override;
private:
- void checkReduceOp(const ir::OperandIndex input_index, const ir::OperandIndex output_index);
+ void checkUnaryOp(const ir::Operation &node);
private:
// TODO Remove _ctx field
diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc
index 5a58f2e9d..66de5999e 100644
--- a/runtime/onert/core/src/compiler/StaticShapeInference.cc
+++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc
@@ -497,6 +497,11 @@ void StaticShapeInferer::visit(const ir::operation::Logistic &op)
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::Input::INPUT));
}
+void StaticShapeInferer::visit(const ir::operation::L2Normalization &op)
+{
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT));
+}
+
void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op)
{
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT));
diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h
index 4bb7413b1..c0a1ebc04 100644
--- a/runtime/onert/core/src/compiler/TensorBuilders.h
+++ b/runtime/onert/core/src/compiler/TensorBuilders.h
@@ -23,6 +23,7 @@
#include "backend/Backend.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/TensorBuilder.h"
+#include "util/logging.h"
namespace onert
{
@@ -66,6 +67,17 @@ public:
return _cf_tensor_builder;
}
+ std::shared_ptr<backend::ITensor> getITensor(ir::OperandIndex ind)
+ {
+ for (auto &tensor_builder : _tensor_builders)
+ {
+ auto tensor = tensor_builder->tensorAt(ind);
+ if (tensor)
+ return tensor;
+ }
+ return nullptr;
+ }
+
private:
std::unordered_set<std::shared_ptr<backend::ITensorBuilder>> _tensor_builders;
std::shared_ptr<backend::controlflow::TensorBuilder> _cf_tensor_builder;
diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc
index 1b8202978..28e92ba14 100644
--- a/runtime/onert/core/src/exec/DynamicShapeInference.cc
+++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc
@@ -442,6 +442,11 @@ void DynamicShapeInferer::visit(const ir::operation::Logistic &op)
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::INPUT));
}
+void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op)
+{
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT));
+}
+
void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op)
{
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT));
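L2Normalization can be routed through handleSimpleUnaryOp in both shape inferers because the operation is shape-preserving: each element is divided by the L2 norm of its innermost slice, so the output shape equals the input shape, which is also what the validator added earlier checks dimension by dimension. A minimal standalone sketch of that computation follows; the row-major layout, epsilon value, and axis choice are illustrative assumptions, not taken from this patch.

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: L2-normalize each row of a [rows x depth] buffer; the shape is unchanged.
void l2_normalize_rows(const std::vector<float> &in, std::vector<float> &out, int rows, int depth)
{
  out.resize(in.size());
  for (int r = 0; r < rows; ++r)
  {
    float sum_sq = 0.f;
    for (int d = 0; d < depth; ++d)
      sum_sq += in[r * depth + d] * in[r * depth + d];
    const float inv_norm = 1.f / std::sqrt(std::max(sum_sq, 1e-6f));
    for (int d = 0; d < depth; ++d)
      out[r * depth + d] = in[r * depth + d] * inv_norm;
  }
}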
diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc
index a7409b90c..864ccb31a 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.cc
+++ b/runtime/onert/core/src/exec/ExecutorBase.cc
@@ -46,7 +46,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr<ir::LoweredGraph> &&lowered_graph,
{
auto tensor_registry = tensor_builder->tensorRegistry();
assert(tensor_registry);
- tensor = tensor_registry->getManagedITensor(ind);
+ tensor = tensor_registry->getNativeITensor(ind);
if (tensor != nullptr)
{
if (tensor_builder->supportDynamicTensor())
@@ -71,7 +71,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr<ir::LoweredGraph> &&lowered_graph,
{
auto tensor_registry = tensor_builder->tensorRegistry();
assert(tensor_registry);
- tensor = tensor_registry->getManagedITensor(ind);
+ tensor = tensor_registry->getNativeITensor(ind);
if (tensor != nullptr)
{
if (tensor_builder->supportDynamicTensor())
diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc
index d2e3627b4..c8dce698d 100644
--- a/runtime/onert/core/src/interp/operations/Pad.cc
+++ b/runtime/onert/core/src/interp/operations/Pad.cc
@@ -69,8 +69,8 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso
const int32_t *pad_ptr = reinterpret_cast<const int32_t *>(pad_buffer);
float *output_ptr = reinterpret_cast<float *>(output_buffer);
- nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, output_ptr,
- nullptr);
+ nnfw::cker::Pad<float>(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape,
+ output_ptr, nullptr);
}
void invokePad(const ExecEnv *env, const ir::Operation &node)
diff --git a/runtime/onert/core/src/ir/LoweredGraph.cc b/runtime/onert/core/src/ir/LoweredGraph.cc
index 6e93a23e9..f13808993 100644
--- a/runtime/onert/core/src/ir/LoweredGraph.cc
+++ b/runtime/onert/core/src/ir/LoweredGraph.cc
@@ -122,9 +122,6 @@ LoweredGraph::LoweredGraph(const Graph &graph, const compiler::CompilerOptions &
pass::PermutationInsertionPass pi_pass(*this);
pi_pass.run();
- // Implemented code no longer works.
- // pass::PermutationEliminationPass pe_pass(*this);
- // pe_pass.run();
_op_seqs.dump("merged and sorted operations with permutation", _graph.operations());
}
diff --git a/runtime/onert/core/src/ir/operation/Quantize.cc b/runtime/onert/core/src/ir/operation/Quantize.cc
new file mode 100644
index 000000000..0e3d5b69b
--- /dev/null
+++ b/runtime/onert/core/src/ir/operation/Quantize.cc
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ir/operation/Quantize.h"
+
+#include "ir/OperationVisitor.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+
+void Quantize::accept(OperationVisitor &v) const { v.visit(*this); }
+
+Quantize::Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+{
+}
+
+} // namespace operation
+} // namespace ir
+} // namespace onert
diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc
deleted file mode 100644
index 9e0291ef9..000000000
--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "PermutationEliminationPass.h"
-
-#include "ir/Operand.h"
-#include "ir/operand/LowerInfo.h"
-#include "ir/Graph.h"
-#include "backend/IConfig.h"
-#include "util/logging.h"
-
-namespace onert
-{
-namespace ir
-{
-namespace pass
-{
-void PermutationEliminationPass::callback(const OperandIndex &inp_index, Operand &object)
-{
- if (_graph.getInputs().contains(inp_index))
- {
- eliminateInput(inp_index, object);
- }
- else if (_graph.getOutputs().contains(inp_index))
- {
- eliminateOutput(inp_index, object);
- }
-}
-
-void PermutationEliminationPass::eliminateInput(const OperandIndex &inp_index, Operand &object)
-{
- auto &model_inputs = _graph.getInputs();
-
- // get uses of the model's given input
- auto uses = object.getUses();
-
- // input must be used just by permutation
- if (uses.size() != 1)
- {
- return;
- }
-
- for (auto input_use : uses)
- {
- auto &perm_operation = _graph.operations().at(input_use);
- auto perm_inputs = perm_operation.getInputs();
-
- auto perm_outputs = perm_operation.getOutputs();
-
- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, true))
- {
- return;
- }
-
- assert(perm_inputs.at(0) == inp_index);
-
- VERBOSE(PermutationEliminationPass::EliminateInput) << "remove NHWC_TO_NCHW permutation\n";
-
- // set model's new input, which was output of permutation
- model_inputs.replace(inp_index, perm_outputs.at(0));
-
- // remove model's input, which is also input of permutation
- _graph.removeOperand(inp_index);
-
- // remove permutation operation
- assert(_lowered_graph.op_seqs().containsOperation(input_use));
- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(input_use);
- _lowered_graph.op_seqs().remove(op_seq_idx);
- _graph.operations().remove(input_use);
-
- VERBOSE(PermutationEliminationPass::EliminateInput)
- << inp_index.value() << " is model's input and is removed. New input is "
- << perm_outputs.at(0).value() << "\n"
- << input_use.value() << " is removed permutation operation\n";
- }
-}
-
-void PermutationEliminationPass::eliminateOutput(const OperandIndex &out_index, Operand &object)
-{
- auto &model_outputs = _graph.getOutputs();
-
- // get defs of the model's given output
- auto defs = object.getDef();
-
- // output must use just permutation
- if (defs.size() != 1)
- {
- return;
- }
-
- for (auto output_def : defs)
- {
- auto &perm_operation = _graph.operations().at(output_def);
- auto perm_outputs = perm_operation.getOutputs();
-
- auto perm_inputs = perm_operation.getInputs();
- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, false))
- {
- return;
- }
-
- assert(perm_outputs.at(0) == out_index);
-
- VERBOSE(PermutationEliminationPass::EliminateOutput) << "remove NCHW_TO_NHWC permutation\n";
-
- // Update operations' output that is used by permute operand
- for (auto perm_input_index : perm_inputs)
- {
- auto &perm_input_operand = _graph.operands().at(perm_input_index);
- perm_input_operand.removeUse(output_def);
- }
-
- // set model's new output, which was input of permutation
- model_outputs.replace(out_index, perm_inputs.at(0));
-
- // remove model's output, which is also output of permutation
- _graph.removeOperand(out_index);
-
- // remove permutation operation
- assert(_lowered_graph.op_seqs().containsOperation(output_def));
- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(output_def);
- _lowered_graph.op_seqs().remove(op_seq_idx);
- _graph.operations().remove(output_def);
-
- VERBOSE(PermutationEliminationPass::EliminateOutput)
- << out_index.value() << " is model's output and is removed. New output is "
- << perm_inputs.at(0).value() << "\n"
- << output_def.value() << " is removed permutation operation\n";
- }
-}
-
-bool PermutationEliminationPass::isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes,
- const OperandIndexSequence &out_indexes,
- bool is_for_model_input)
-{
- auto input_def_factors = _lowered_graph.getLowerInfo(inp_indexes.at(0))->def_factors();
- auto output_def_factors = _lowered_graph.getLowerInfo(out_indexes.at(0))->def_factors();
-
- auto input_layout = input_def_factors.getOnlyElement().layout();
- auto output_layout = output_def_factors.getOnlyElement().layout();
-
- if (input_def_factors.size() != 1 || output_def_factors.size() != 1)
- {
- return false;
- }
-
- // all operands' factor must be the same
- for (auto index : inp_indexes)
- {
- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors();
- if (op_factor_set.size() != 1 ||
- input_layout != _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout())
- {
- return false;
- }
- }
- // all operands' factor must be the same
- for (auto index : out_indexes)
- {
- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors();
- if (op_factor_set.size() != 1 ||
- output_layout !=
- _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout())
- {
- return false;
- }
- }
-
- if (is_for_model_input)
- {
- // check if this is NHWC_TO_NCHW permutation: must have single input, which is model's input
- return (inp_indexes.size() == 1 && input_layout == Layout::NHWC &&
- output_layout == Layout::NCHW);
- }
-
- // check if this is NCHW_TO_NHWC permutation: must have single output, which is model's output
- return (out_indexes.size() == 1 && input_layout == Layout::NCHW && output_layout == Layout::NHWC);
-}
-
-} // namespace pass
-} // namespace ir
-} // namespace onert
diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h
deleted file mode 100644
index 1c8430062..000000000
--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
-#define __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
-
-#include "LoweredOperandPass.h"
-#include "ir/Operand.h"
-#include "ir/OperandIndexSequence.h"
-
-namespace onert
-{
-namespace ir
-{
-namespace pass
-{
-
-class PermutationEliminationPass : public LoweredOperandPass
-{
-public:
- using LoweredOperandPass::LoweredOperandPass;
-
-public:
- std::string id() override { return "PermutationEliminationPass"; }
-
- void callback(const OperandIndex &index, Operand &object) override;
-
-private:
- /**
- * @brief Remove Permute operation that permutes input
- *
- * Note: This function also removes model's input and
- * sets output of permutation as model's new input
- *
- * @param inp_index is the target operand index for the elimination
- * @param object is the target operand object for the elimination
- *
- * @return
- */
- void eliminateInput(const OperandIndex &inp_index, Operand &object);
-
- /**
- * @brief Remove Permute operation that permutes output of a model
- *
- * Note: This function also removes model's output and
- * sets input of permutation as model's new output
- *
- * @param out_index is the target operand index for the elimination
- * @param object is the target operand object for the elimination
- *
- * @return
- */
- void eliminateOutput(const OperandIndex &out_index, Operand &object);
-
- /**
- * @brief Determine if passed operands are permute layer's input and output, that must be
- * eliminated
- *
- * @param inp_index indexes of the input operand to operation
- * @param out_index indexes of the output operand to operation
- * @param is_for_model_input checking for model's input or output
- *
- * @return if it is permutation layer
- */
- bool isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes,
- const OperandIndexSequence &out_indexes, bool is_for_model_input);
-};
-
-} // namespace pass
-} // namespace ir
-} // namespace onert
-
-#endif // __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
index 7c3da52a2..75efdd81e 100644
--- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
+++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
@@ -62,27 +62,26 @@ void PermutationInsertionPass::callback(const OperandIndex &index, Operand &obje
auto insert_set = operand_li->use_factors() - operand_li->def_factors();
auto def_factor = operand_li->def_factors().getOnlyElement();
- auto compatible_backends = [](auto /* backend1 */, auto /* backend2 */) {
- // TODO If other issues for Permute elimination are resolved, enable this
- return false;
- /*
+ auto compatible_backends = [](auto backend1, auto backend2) {
// TODO This is a workaround for not inserting Permute between cpu and controlflow.
// To be general, we need another way of checking they are compatible.
const auto cf = backend::controlflow::Config::ID;
const auto cpu = "cpu";
const auto id1 = backend1->config()->id();
const auto id2 = backend2->config()->id();
- return (id1 == cpu && id2 == cf) // Allows no-Permute for Model inputs
- || (id1 == cf && id2 == cpu); // Allows no-Permute for Model outputs
- */
+ // NOTE This is to skip Permute insertion for model inputs (controlflow -> cpu), but not for
+ // outputs. This function currently assumes that backend1 is the Def side and backend2 is the
+ // Use side; that assumption is going to be fixed soon.
+ // TODO Make both directions work
+ return (id1 == cpu && id2 == cf);
};
for (auto factor : insert_set)
{
+ // Check exceptional cases where Permute ops are not inserted
if (factor.layout() == def_factor.layout() &&
compatible_backends(factor.backend(), def_factor.backend()))
{
- // For this factor we can just reuse existing operand - Permute is not added.
VERBOSE(PermutationInsertionPass) << "Permutation Insertion is skipped for operand "
<< index << " / as the tensor is compatible with backend "
<< factor.backend()->config()->id() << std::endl;
diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h
index f5687ad1e..f7633463e 100644
--- a/runtime/onert/frontend/base_loader/include/base_loader.h
+++ b/runtime/onert/frontend/base_loader/include/base_loader.h
@@ -171,6 +171,8 @@ protected:
void loadBroadcastTo(const Operator *op, ir::Graph &subg);
void loadFusedBatchNorm(const Operator *op, ir::Graph &subg);
void loadLogSoftmax(const Operator *op, ir::Graph &subg);
+ void loadQuantize(const Operator *op, ir::Graph &subg);
+ void loadSpaceToDepth(const Operator *op, ir::Graph &subg);
protected:
// Base address for mapped region for loading (if needed)
@@ -1123,6 +1125,22 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBroadcastTo(const Operator *o
std::unique_ptr<ir::Operation> new_op(new ir::operation::BroadcastTo(inputs, outputs));
subg.addOperation(std::move(new_op));
}
+template <typename LoaderDomain, typename SpecificLoader>
+void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToDepth(const Operator *op, ir::Graph &subg)
+{
+ ir::OperandIndexSequence inputs;
+ ir::OperandIndexSequence outputs;
+ ir::operation::SpaceToDepth::Param param;
+
+ const auto *options = op->builtin_options_as_SpaceToDepthOptions();
+
+ param.block_size = options->block_size();
+
+ loadOperationIO(op, inputs, outputs);
+
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::SpaceToDepth(inputs, outputs, param));
+ subg.addOperation(std::move(new_op));
+}
template <typename LoaderDomain, typename SpecificLoader>
void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir::Graph &subg)
@@ -1743,6 +1761,18 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadLogSoftmax(const Operator *op
}
template <typename LoaderDomain, typename SpecificLoader>
+void BaseLoader<LoaderDomain, SpecificLoader>::loadQuantize(const Operator *op, ir::Graph &subg)
+{
+ ir::OperandIndexSequence inputs;
+ ir::OperandIndexSequence outputs;
+
+ loadOperationIO(op, inputs, outputs);
+
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::Quantize(inputs, outputs));
+ subg.addOperation(std::move(new_op));
+}
+
+template <typename LoaderDomain, typename SpecificLoader>
void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, ir::Graph &subg)
{
const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code();
@@ -1959,6 +1989,12 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
case BuiltinOperator::BuiltinOperator_LOG_SOFTMAX:
loadLogSoftmax(op, subg);
return;
+ case BuiltinOperator::BuiltinOperator_QUANTIZE:
+ loadQuantize(op, subg);
+ return;
+ case BuiltinOperator::BuiltinOperator_SPACE_TO_DEPTH:
+ loadSpaceToDepth(op, subg);
+ return;
default:
throw std::runtime_error(
std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op)));
diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
index 94791f8e6..00ffcb677 100644
--- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
+++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
@@ -106,6 +106,33 @@ getReduceGenerator(const onert::ir::operation::Reduce::ReduceType reduce_type)
};
}
+template <typename T>
+Operation *CreateSimpleUnaryOp(const OperationFactory::Param &init_param, Operands &)
+{
+ assert(init_param.input_count == 1 && init_param.output_count == 1);
+
+ OperandIndexSequence outputs{init_param.outputs[0]};
+
+ // Each input should be interpreted as follows:
+ //
+ // 0 -> Input Tensor Index
+ OperandIndexSequence inputs{init_param.inputs[0]};
+
+ return new T{inputs, outputs};
+}
+
+// A generator function for binary ops with no params
+template <typename T>
+Operation *createSimpleBinaryOp(const OperationFactory::Param &init_param, Operands &)
+{
+ assert(init_param.input_count == 2 && init_param.output_count == 1);
+
+ OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+ OperandIndexSequence outputs{init_param.outputs[0]};
+
+ return new T{inputs, outputs};
+}
+
} // namespace
OperationFactory &OperationFactory::get()
@@ -116,20 +143,10 @@ OperationFactory &OperationFactory::get()
OperationFactory::OperationFactory()
{
- _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = [](const OperationFactory::Param &init_param,
- Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- // 1 -> Block size Index
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::BatchToSpaceND{inputs, outputs};
- };
+ // Each input should be interpreted as follows:
+ // 0 -> Input Tensor Index
+ // 1 -> Block size Index
+ _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = createSimpleBinaryOp<operation::BatchToSpaceND>;
_map[ANEURALNETWORKS_DEPTHWISE_CONV_2D] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -724,44 +741,11 @@ OperationFactory::OperationFactory()
return new operation::Squeeze{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_TANH] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Tanh{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_LOG] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
+ _map[ANEURALNETWORKS_TANH] = CreateSimpleUnaryOp<operation::Tanh>;
- OperandIndexSequence outputs{init_param.outputs[0]};
+ _map[ANEURALNETWORKS_LOG] = CreateSimpleUnaryOp<operation::Log>;
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Log{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_LOGISTIC] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Logistic{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_LOGISTIC] = CreateSimpleUnaryOp<operation::Logistic>;
_map[ANEURALNETWORKS_DIV] = [](const OperationFactory::Param &init_param, Operands &operands) {
assert(init_param.input_count == 3 && init_param.output_count == 1);
@@ -784,36 +768,16 @@ OperationFactory::OperationFactory()
return new operation::Div{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_EXP] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Exp{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_EXP] = CreateSimpleUnaryOp<operation::Exp>;
// ANEURALNETWORKS_EXP_EX is deprecated
// TODO Remove ANEURALNETWORKS_EXP_EX
_map[ANEURALNETWORKS_EXP_EX] = _map[ANEURALNETWORKS_EXP];
- _map[ANEURALNETWORKS_EXPAND_DIMS] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- // 1 -> Axis Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::ExpandDims{inputs, outputs};
- };
+ // Each input should be interpreted as follows:
+ // 0 -> Input Tensor Index
+ // 1 -> Axis Tensor Index
+ _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp<operation::ExpandDims>;
_map[ANEURALNETWORKS_GREATER] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 2 && init_param.output_count == 1);
@@ -982,19 +946,7 @@ OperationFactory::OperationFactory()
return new operation::Comparison{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_LOGICAL_AND] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input0 Tensor Index
- // 1 -> input1 Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::LogicalAnd{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_LOGICAL_AND] = createSimpleBinaryOp<operation::LogicalAnd>;
// ANEURALNETWORKS_LOGICAL_AND_EX is deprecated
// TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX
@@ -1018,18 +970,7 @@ OperationFactory::OperationFactory()
return new operation::LogicalAnd{inputs, outputs};
};
- _map[ANEURALNETWORKS_RSQRT] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::RSQRT{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_RSQRT] = CreateSimpleUnaryOp<operation::RSQRT>;
_map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 3 && init_param.output_count == 1);
@@ -1065,18 +1006,7 @@ OperationFactory::OperationFactory()
// TODO Remove ANEURALNETWORKS_RSQRT_EX
_map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT];
- _map[ANEURALNETWORKS_RELU] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::ReLU{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_RELU] = CreateSimpleUnaryOp<operation::ReLU>;
_map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -1098,31 +1028,9 @@ OperationFactory::OperationFactory()
return new operation::ResizeBilinear{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_RELU1] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
+ _map[ANEURALNETWORKS_RELU1] = CreateSimpleUnaryOp<operation::ReLU1>;
- return new operation::ReLU1{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_RELU6] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::ReLU6{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_RELU6] = CreateSimpleUnaryOp<operation::ReLU6>;
_map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 2 && init_param.output_count == 1);
@@ -1438,18 +1346,7 @@ OperationFactory::OperationFactory()
return new operation::LogicalOr{inputs, outputs};
};
- _map[ANEURALNETWORKS_LOGICAL_NOT] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::LogicalNot{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_LOGICAL_NOT] = CreateSimpleUnaryOp<operation::LogicalNot>;
// ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated
// TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX
@@ -1649,35 +1546,13 @@ OperationFactory::OperationFactory()
// TODO Remove ANEURALNETWORKS_GATHER_EX
_map[ANEURALNETWORKS_GATHER_EX] = _map[ANEURALNETWORKS_GATHER];
- _map[ANEURALNETWORKS_NEG] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Neg{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_NEG] = CreateSimpleUnaryOp<operation::Neg>;
// ANEURALNETWORKS_NEG_EX is deprecated
// TODO Remove ANEURALNETWORKS_NEG_EX
_map[ANEURALNETWORKS_NEG_EX] = _map[ANEURALNETWORKS_NEG];
- _map[ANEURALNETWORKS_ABS] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Abs{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_ABS] = CreateSimpleUnaryOp<operation::Abs>;
// ANEURALNETWORKS_ABS_EX is deprecated
// TODO Remove ANEURALNETWORKS_ABS_EX
@@ -1704,18 +1579,7 @@ OperationFactory::OperationFactory()
// TODO Remove ANEURALNETWORKS_ARGMAX_EX
_map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX];
- _map[ANEURALNETWORKS_DEQUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Dequantize{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_DEQUANTIZE] = CreateSimpleUnaryOp<operation::Dequantize>;
_map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) {
assert(init_param.input_count == 3 && init_param.output_count == 1);
@@ -1841,31 +1705,24 @@ OperationFactory::OperationFactory()
};
_map[ANEURALNETWORKS_PAD] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count >= 1);
+ assert(init_param.input_count >= 2 && init_param.input_count <= 3 &&
+ init_param.output_count >= 1);
OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+ if (init_param.input_count == 3)
+ {
+ inputs.append(OperandIndex{init_param.inputs[2]});
+ }
OperandIndexSequence outputs{init_param.outputs[0]};
return new operation::Pad{inputs, outputs};
};
- _map[ANEURALNETWORKS_MINIMUM] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
+ _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD];
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
- OperandIndexSequence outputs{init_param.outputs[0]};
+ _map[ANEURALNETWORKS_MINIMUM] = createSimpleBinaryOp<operation::Min>;
- return new operation::Min{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_MAXIMUM] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- return new operation::Max{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_MAXIMUM] = createSimpleBinaryOp<operation::Max>;
_map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -1948,34 +1805,15 @@ OperationFactory::OperationFactory()
return new operation::Range{inputs, outputs};
};
- _map[ANEURALNETWORKS_POW] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
+ // Each input should be interpreted as follows:
+ // 0 -> LHS Tensor Index
+ // 1 -> RHS Tensor Index
+ _map[ANEURALNETWORKS_POW] = createSimpleBinaryOp<operation::Pow>;
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> LHS Tensor Index
- // 1 -> RHS Tensor Index
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::Pow{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_FILL_EX] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- // Each input should be interpreted as follows:
- //
- // 0 -> A tensor, specifying the input.
- // 1 -> A 1-D tensor, specifying the value
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- return new operation::Fill{inputs, outputs};
- };
+ // Each input should be interpreted as follows:
+ // 0 -> A tensor, specifying the input.
+ // 1 -> A 1-D tensor, specifying the value
+ _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp<operation::Fill>;
_map[ANEURALNETWORKS_ZEROS_LIKE_EX] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 1 && init_param.output_count == 1);
@@ -1989,20 +1827,10 @@ OperationFactory::OperationFactory()
return new operation::ZerosLike{inputs, outputs};
};
- _map[ANEURALNETWORKS_TILE] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- // 1 -> Multiple Tensor Index
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::Tile{inputs, outputs};
- };
+ // Each input should be interpreted as follows:
+ // 0 -> Input Tensor Index
+ // 1 -> Multiple Tensor Index
+ _map[ANEURALNETWORKS_TILE] = createSimpleBinaryOp<operation::Tile>;
_map[ANEURALNETWORKS_MATRIX_BAND_PART_EX] = [](const OperationFactory::Param &init_param,
Operands &) {
@@ -2064,21 +1892,9 @@ OperationFactory::OperationFactory()
return new operation::Einsum{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_BROADCAST_TO_EX] = [](const OperationFactory::Param &init_param,
- Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- // 1 -> int32, int64, An 1-D int tensor Index
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::BroadcastTo{inputs, outputs};
- };
+ // 0 -> Input Tensor Index
+ // 1 -> int32, int64, A 1-D int tensor Index
+ _map[ANEURALNETWORKS_BROADCAST_TO_EX] = createSimpleBinaryOp<operation::BroadcastTo>;
_map[ANEURALNETWORKS_FUSED_BATCH_NORM_V3_EX] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -2133,6 +1949,15 @@ OperationFactory::OperationFactory()
return new operation::LogSoftmax{inputs, outputs, param};
};
+
+ _map[ANEURALNETWORKS_QUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) {
+ assert(init_param.input_count == 1 && init_param.output_count == 1);
+
+ OperandIndexSequence inputs{init_param.inputs[0]};
+ OperandIndexSequence outputs{init_param.outputs[0]};
+
+ return new operation::Quantize{inputs, outputs};
+ };
}
Operation *OperationFactory::create(ANeuralNetworksOperationType type,
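The OperationFactory changes above collapse a page of nearly identical lambdas into the two generator templates, while the lookup-by-builtin-op dispatch stays the same. A condensed, self-contained sketch of the same table-of-generators pattern is shown below; the types and map keys are simplified placeholders, not onert's actual classes or NNAPI enum values.

#include <cassert>
#include <memory>
#include <unordered_map>

struct Op { virtual ~Op() = default; };
struct TanhOp : Op {};
struct PowOp : Op {};
struct Param { int input_count; int output_count; };

// One generator per arity replaces a hand-written lambda per operation.
template <typename T> std::unique_ptr<Op> makeUnary(const Param &p)
{
  assert(p.input_count == 1 && p.output_count == 1);
  return std::make_unique<T>();
}

template <typename T> std::unique_ptr<Op> makeBinary(const Param &p)
{
  assert(p.input_count == 2 && p.output_count == 1);
  return std::make_unique<T>();
}

int main()
{
  std::unordered_map<int, std::unique_ptr<Op> (*)(const Param &)> map;
  map[0] = makeUnary<TanhOp>;       // e.g. a TANH-like unary op
  map[1] = makeBinary<PowOp>;       // e.g. a POW-like binary op
  auto op = map.at(0)(Param{1, 1}); // dispatch by (illustrative) operation key
  (void)op;
  return 0;
}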
diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc
index cc0434764..0fcf372c3 100644
--- a/runtime/onert/test/core/exec/ExecInstance.cc
+++ b/runtime/onert/test/core/exec/ExecInstance.cc
@@ -73,9 +73,8 @@ public:
// Compile
auto subgs = std::make_shared<onert::ir::Subgraphs>();
subgs->push(onert::ir::SubgraphIndex{0}, graph);
- auto compiler = new onert::compiler::Compiler{subgs};
- executors = compiler->compile();
- delete compiler;
+ onert::compiler::Compiler compiler{subgs};
+ executors = compiler.compile();
}
public:
@@ -98,19 +97,17 @@ TEST(ExecInstance, simple)
float output_buffer[4] = {};
const float output_expected[4] = {5, -2, 0, -1};
- auto execution = new onert::exec::Execution(executors);
+ onert::exec::Execution execution{executors};
- execution->setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
- execution->setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
- execution->setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
- execution->execute();
+ execution.setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
+ execution.setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
+ execution.setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
+ execution.execute();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(output_buffer[i], output_expected[i]);
}
-
- delete execution;
}
TEST(ExecInstance, twoCompile)
@@ -118,7 +115,7 @@ TEST(ExecInstance, twoCompile)
auto mockup = CompiledMockUpModel();
auto graph = mockup.graph;
auto executors1 = mockup.executors;
- auto execution1 = new onert::exec::Execution(executors1);
+ onert::exec::Execution execution1{executors1};
auto input1 = IOIndex{0};
auto input2 = IOIndex{1};
@@ -129,38 +126,34 @@ TEST(ExecInstance, twoCompile)
float exe1_output_buffer[4] = {};
const float exe1_output_expected[4] = {5, -2, 0, -1};
- execution1->setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
- execution1->setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
- execution1->setOutput(output, reinterpret_cast<void *>(exe1_output_buffer), 16);
+ execution1.setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
+ execution1.setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
+ execution1.setOutput(output, reinterpret_cast<void *>(exe1_output_buffer), 16);
// Make new executor: compile again
auto subgs = std::make_shared<onert::ir::Subgraphs>();
subgs->push(onert::ir::SubgraphIndex{0}, graph);
- auto compiler = new onert::compiler::Compiler{subgs};
- std::shared_ptr<onert::exec::ExecutorMap> executors2 = compiler->compile();
- auto execution2 = new onert::exec::Execution(executors2);
+ onert::compiler::Compiler compiler{subgs};
+ std::shared_ptr<onert::exec::ExecutorMap> executors2 = compiler.compile();
+ onert::exec::Execution execution2{executors2};
const float exe2_input1_buffer[4] = {2, 1, -2, 0};
const float exe2_input2_buffer[4] = {-3, 3, 1, 2};
float exe2_output_buffer[4] = {};
const float exe2_output_expected[4] = {2, 5, -2, 7};
- execution2->setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
- execution2->setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
- execution2->setOutput(output, reinterpret_cast<void *>(exe2_output_buffer), 16);
+ execution2.setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
+ execution2.setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
+ execution2.setOutput(output, reinterpret_cast<void *>(exe2_output_buffer), 16);
- execution1->execute();
- execution2->execute();
+ execution1.execute();
+ execution2.execute();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]);
EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]);
}
-
- delete compiler;
- delete execution1;
- delete execution2;
}
// Support two initialized execution instance then ordered execution
@@ -178,32 +171,29 @@ TEST(ExecInstance, twoExecution)
const float exe1_output_expected[4] = {5, -2, 0, -1};
const float exe2_output_expected[4] = {2, 5, -2, 7};
- auto execution1 = new onert::exec::Execution(executors);
- execution1->setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
- execution1->setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
- execution1->setOutput(output1, reinterpret_cast<void *>(exe1_output_buffer), 16);
+ onert::exec::Execution execution1{executors};
+ execution1.setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
+ execution1.setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
+ execution1.setOutput(output1, reinterpret_cast<void *>(exe1_output_buffer), 16);
const float exe2_input1_buffer[4] = {2, 1, -2, 0};
const float exe2_input2_buffer[4] = {-3, 3, 1, 2};
float exe2_output_buffer[4] = {};
// Make new execution
- auto execution2 = new onert::exec::Execution(executors);
- execution2->setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
- execution2->setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
- execution2->setOutput(output1, reinterpret_cast<void *>(exe2_output_buffer), 16);
+ onert::exec::Execution execution2{executors};
+ execution2.setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
+ execution2.setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
+ execution2.setOutput(output1, reinterpret_cast<void *>(exe2_output_buffer), 16);
- execution1->execute();
- execution2->execute();
+ execution1.execute();
+ execution2.execute();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]);
EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]);
}
-
- delete execution1;
- delete execution2;
}
class Inference
@@ -222,14 +212,12 @@ public:
auto input2 = IOIndex{1};
auto output1 = IOIndex{0};
- auto execution = new onert::exec::Execution(_executors);
- execution->setInput(input1, reinterpret_cast<const void *>(_input1), 16);
- execution->setInput(input2, reinterpret_cast<const void *>(_input2), 16);
- execution->setOutput(output1, reinterpret_cast<void *>(_output), 16);
+ onert::exec::Execution execution{_executors};
+ execution.setInput(input1, reinterpret_cast<const void *>(_input1), 16);
+ execution.setInput(input2, reinterpret_cast<const void *>(_input2), 16);
+ execution.setOutput(output1, reinterpret_cast<void *>(_output), 16);
- execution->execute();
-
- delete execution;
+ execution.execute();
}
private:
@@ -288,20 +276,18 @@ TEST(ExecInstance, async)
float output_buffer[4] = {};
const float output_expected[4] = {5, -2, 0, -1};
- auto execution = new onert::exec::Execution(executors);
+ onert::exec::Execution execution{executors};
- execution->setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
- execution->setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
- execution->setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
- execution->startExecute();
- execution->waitFinish();
+ execution.setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
+ execution.setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
+ execution.setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
+ execution.startExecute();
+ execution.waitFinish();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(output_buffer[i], output_expected[i]);
}
-
- delete execution;
}
} // namespace
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
index e50b94118..005f61c94 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
+GeneratedTests.cast_float32_to_int32_nnfw
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7
GeneratedTests.gather_float16_8
GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_dynamic_float_nnfw
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
GeneratedTests.log_4D_float_nnfw
@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
index c9edee585..d987bf17b 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
-GeneratedTests.cast_float32_to_quant8_overflow
-GeneratedTests.cast_float32_to_quant8_overflow_relaxed
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -73,6 +70,7 @@ GeneratedTests.gather_float16_8
GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_boolean
GeneratedTests.greater_equal_dynamic_float_nnfw
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_boolean
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
@@ -112,11 +110,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
index 3cce4f3e3..bc0ae0f65 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8
GeneratedTests.hashtable_lookup_float
GeneratedTests.hashtable_lookup_float_4D_nnfw
GeneratedTests.hashtable_lookup_quant8
-GeneratedTests.l2_normalization
-GeneratedTests.l2_normalization_2
-GeneratedTests.l2_normalization_large
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8
GeneratedTests.neg
GeneratedTests.neg_3D_int_nnfw
GeneratedTests.neg_4D_int_nnfw
-GeneratedTests.pad_quant8_nnfw
GeneratedTests.prelu
GeneratedTests.prelu_broadcast_float_1_nnfw
GeneratedTests.prelu_broadcast_quant8_1_nnfw
@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.reduce_max_quant8
GeneratedTests.reduce_max_quant8_1_nnfw
GeneratedTests.reduce_max_quant8_2
@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
GeneratedTests.select_v1_2_two_dim_quant8
GeneratedTests.slice_5
GeneratedTests.slice_6
-GeneratedTests.slice_7
GeneratedTests.slice_8
GeneratedTests.slice_zero_sized
GeneratedTests.slice_zero_sized_quant8
-GeneratedTests.space_to_depth_float_1
-GeneratedTests.space_to_depth_float_2
-GeneratedTests.space_to_depth_float_3
GeneratedTests.space_to_depth_quant8_1
GeneratedTests.space_to_depth_quant8_2
GeneratedTests.sqrt_
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
index e50b94118..005f61c94 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
+GeneratedTests.cast_float32_to_int32_nnfw
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7
GeneratedTests.gather_float16_8
GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_dynamic_float_nnfw
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
GeneratedTests.log_4D_float_nnfw
@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
index 55cfe398f..051fbc78f 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
-GeneratedTests.cast_float32_to_quant8_overflow
-GeneratedTests.cast_float32_to_quant8_overflow_relaxed
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -73,6 +70,7 @@ GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_boolean
GeneratedTests.greater_equal_dynamic_float_nnfw
GeneratedTests.less_boolean
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
GeneratedTests.log_4D_float_nnfw
@@ -111,11 +109,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
index 3cce4f3e3..bc0ae0f65 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8
GeneratedTests.hashtable_lookup_float
GeneratedTests.hashtable_lookup_float_4D_nnfw
GeneratedTests.hashtable_lookup_quant8
-GeneratedTests.l2_normalization
-GeneratedTests.l2_normalization_2
-GeneratedTests.l2_normalization_large
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8
GeneratedTests.neg
GeneratedTests.neg_3D_int_nnfw
GeneratedTests.neg_4D_int_nnfw
-GeneratedTests.pad_quant8_nnfw
GeneratedTests.prelu
GeneratedTests.prelu_broadcast_float_1_nnfw
GeneratedTests.prelu_broadcast_quant8_1_nnfw
@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.reduce_max_quant8
GeneratedTests.reduce_max_quant8_1_nnfw
GeneratedTests.reduce_max_quant8_2
@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
GeneratedTests.select_v1_2_two_dim_quant8
GeneratedTests.slice_5
GeneratedTests.slice_6
-GeneratedTests.slice_7
GeneratedTests.slice_8
GeneratedTests.slice_zero_sized
GeneratedTests.slice_zero_sized_quant8
-GeneratedTests.space_to_depth_float_1
-GeneratedTests.space_to_depth_float_2
-GeneratedTests.space_to_depth_float_3
GeneratedTests.space_to_depth_quant8_1
GeneratedTests.space_to_depth_quant8_2
GeneratedTests.sqrt_
diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp
index 08118cac1..069d367e3 100644
--- a/tests/nnapi/nnapi_gtest.skip.noarch.interp
+++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp
@@ -188,6 +188,7 @@ GeneratedTests.hashtable_lookup_quant8
GeneratedTests.l2_normalization
GeneratedTests.l2_normalization_2
GeneratedTests.l2_normalization_large
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -312,6 +313,12 @@ GeneratedTests.pack_ex_2D_int_2
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
GeneratedTests.pad_quant8_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
@@ -331,6 +338,15 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
index 3cce4f3e3..bc0ae0f65 100644
--- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8
GeneratedTests.hashtable_lookup_float
GeneratedTests.hashtable_lookup_float_4D_nnfw
GeneratedTests.hashtable_lookup_quant8
-GeneratedTests.l2_normalization
-GeneratedTests.l2_normalization_2
-GeneratedTests.l2_normalization_large
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8
GeneratedTests.neg
GeneratedTests.neg_3D_int_nnfw
GeneratedTests.neg_4D_int_nnfw
-GeneratedTests.pad_quant8_nnfw
GeneratedTests.prelu
GeneratedTests.prelu_broadcast_float_1_nnfw
GeneratedTests.prelu_broadcast_quant8_1_nnfw
@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.reduce_max_quant8
GeneratedTests.reduce_max_quant8_1_nnfw
GeneratedTests.reduce_max_quant8_2
@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
GeneratedTests.select_v1_2_two_dim_quant8
GeneratedTests.slice_5
GeneratedTests.slice_6
-GeneratedTests.slice_7
GeneratedTests.slice_8
GeneratedTests.slice_zero_sized
GeneratedTests.slice_zero_sized_quant8
-GeneratedTests.space_to_depth_float_1
-GeneratedTests.space_to_depth_float_2
-GeneratedTests.space_to_depth_float_3
GeneratedTests.space_to_depth_quant8_1
GeneratedTests.space_to_depth_quant8_2
GeneratedTests.sqrt_
diff --git a/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py
new file mode 100644
index 000000000..ca3770cb0
--- /dev/null
+++ b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py
@@ -0,0 +1,30 @@
+#
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model = Model()
+in0 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128")
+out0 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128")
+model = model.Operation("L2_NORMALIZATION", in0).To(out0)
+
+# Example 1. Input in operand 0,
+input0 = {in0: # input 0
+ [0, 5, 12]}
+output0 = {out0: # output 0
+ [51, 54, 58]}
+
+# Instantiate an example
+Example((input0, output0))
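For reference, the expected quantized outputs [51, 54, 58] are not arbitrary: assuming the runtime requantizes with the output parameters NNAPI mandates for quantized L2_NORMALIZATION (scale 1/128, zero point 128), the tiny input scale declared above cancels out during normalization and only the offsets from the zero point matter. A minimal sketch that reproduces the values (not part of the spec):

```python
# Reproduce the expected outputs of l2_normalization_quant8_nnfw by hand.
# Assumption: the output is requantized with scale 1/128 and zero point 128,
# as the NNAPI spec requires for quantized L2_NORMALIZATION.
import math

zero_point = 128
quantized_in = [0, 5, 12]                          # inputs from the spec above
centered = [q - zero_point for q in quantized_in]  # [-128, -123, -116]
norm = math.sqrt(sum(c * c for c in centered))     # ~212.06; input scale cancels
normalized = [c / norm for c in centered]          # ~[-0.604, -0.580, -0.547]
quantized_out = [round(x * 128) + zero_point for x in normalized]
print(quantized_out)                               # [51, 54, 58]
```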
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py
index c500741c2..c500741c2 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py
index 3dfaff64b..3dfaff64b 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py
index 5b27f4963..5b27f4963 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py
index 5ee4b06d7..5ee4b06d7 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
index 391d5cfb6..391d5cfb6 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py
index b67c2b834..b67c2b834 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/quantize.mod.py b/tests/nnapi/specs/V1_2/quantize.mod.py
index a42624dce..a42624dce 100644
--- a/tests/nnapi/specs/skip/V1_2/quantize.mod.py
+++ b/tests/nnapi/specs/V1_2/quantize.mod.py
diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
index 67f246728..c6c6355ba 100644
--- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
+++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
@@ -51,19 +51,24 @@ TEST_F(ValidationTestAddModelLoaded, output_tensorinfo)
ASSERT_EQ(tensor_info.dims[0], 1);
}
-TEST_F(ValidationTestAddModelLoaded, neg_run_001)
+TEST_F(ValidationTestAddModelLoaded, neg_run)
{
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
+ // nnfw_prepare is not called
+ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
}
-TEST_F(ValidationTestAddModelLoaded, neg_set_input_001)
+TEST_F(ValidationTestAddModelLoaded, neg_set_input)
{
- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ // nnfw_prepare is not called
+ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
-TEST_F(ValidationTestAddModelLoaded, neg_set_output_001)
+TEST_F(ValidationTestAddModelLoaded, neg_set_output)
{
- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ // nnfw_prepare is not called
+ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestAddModelLoaded, neg_get_input_size)
@@ -81,7 +86,7 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model)
// load model twice
ASSERT_EQ(nnfw_load_model_from_file(
_session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()),
- NNFW_STATUS_ERROR);
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestAddModelLoaded, neg_output_tensorinfo)
diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
index 1bb418231..0f4a4af32 100644
--- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
+++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
@@ -102,7 +102,7 @@ TEST_F(ValidationTestAddSessionPrepared, neg_run_during_async_run)
{
SetInOutBuffers();
ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR);
- EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
+ EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR);
}
@@ -152,13 +152,13 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model)
// Load model twice
ASSERT_EQ(nnfw_load_model_from_file(
_session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()),
- NNFW_STATUS_ERROR);
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestAddSessionPrepared, neg_prepare)
{
// Call Prepare twice
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
// TODO Validation check when "nnfw_run" is called without input & output tensor setting
diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc
index 2675aa758..01832db3a 100644
--- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc
+++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc
@@ -58,7 +58,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1)
nnfw_load_model_from_file(
_session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()),
NNFW_STATUS_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2)
@@ -67,52 +67,52 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2)
_session,
NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()),
NNFW_STATUS_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_prepare_001)
{
// nnfw_load_model_from_file was not called
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_run_001)
{
// nnfw_load_model_from_file and nnfw_prepare was not called
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_set_input_001)
{
- // Invalid state
- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_set_output_001)
{
- // Invalid state
- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_get_input_size)
{
uint32_t size = 10000;
- ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_ERROR);
- ASSERT_EQ(size, 10000);
+ ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_INVALID_STATE);
+ ASSERT_EQ(size, 10000); // Remain unchanged
}
TEST_F(ValidationTestSessionCreated, neg_get_output_size)
{
uint32_t size = 10000;
- ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_ERROR);
- ASSERT_EQ(size, 10000);
+ ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_INVALID_STATE);
+ ASSERT_EQ(size, 10000); // Remain unchanged
}
TEST_F(ValidationTestSessionCreated, neg_output_tensorinfo)
{
nnfw_tensorinfo tensor_info;
// model is not loaded
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_INVALID_STATE);
// model is not loaded and tensor_info is null
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_INVALID_STATE);
}
diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh
index c7f44c52a..af797287f 100755
--- a/tests/scripts/benchmark_nnapi.sh
+++ b/tests/scripts/benchmark_nnapi.sh
@@ -18,7 +18,6 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source $MY_PATH/common.sh
-BENCHMARK_RUN_TEST_SH=
BENCHMARK_DRIVER_BIN=
BENCHMARK_REPORT_DIR=
BENCHMARK_MODELS_FILE=
@@ -30,7 +29,7 @@ EXECUTORS="Linear Parallel" #TODO: accept this list as argument
function Usage()
{
- echo "Usage: ./$0 --reportdir=. --runtestsh=tests/scripts/framework/run_test.sh --driverbin=Product/out/bin/tflite_run"
+ echo "Usage: ./$0 --reportdir=. --driverbin=Product/out/bin/tflite_run"
}
for i in "$@"
@@ -43,9 +42,6 @@ do
--test_op)
TEST_OP="true"
;;
- --runtestsh=*)
- BENCHMARK_RUN_TEST_SH=${i#*=}
- ;;
--driverbin=*)
BENCHMARK_DRIVER_BIN=${i#*=}
;;
@@ -147,9 +143,8 @@ function run_onert_with_all_config()
local REPORT_MODEL_DIR=$2
local PAUSE_TIME_IN_SEC=$3
local BENCHMARK_DRIVER_BIN=$4
- local BENCHMARK_RUN_TEST_SH=$5
- local EXECUTORS=$6
- local BACKEND_LIST=$7
+ local EXECUTORS=$5
+ local BACKEND_LIST=$6
export USE_NNAPI=1
@@ -163,18 +158,18 @@ function run_onert_with_all_config()
done
export BACKENDS=$BACKENDS_TO_USE
if [ "$TEST_OP" == "false" ]; then
- profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT
+ profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT
fi
for executor in $EXECUTORS; do
export EXECUTOR=$executor
if [ "$TEST_OP" == "false" ]; then
- run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $executor
+ run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $executor
fi
for backend in $BACKEND_LIST; do
export OP_BACKEND_ALLOPS=$backend
run_benchmark_and_print "tflite_onert_"$executor"_executor_$backend" "TFLite onert $executor Executor $backend"\
- $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH
+ $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN
done
done
unset USE_NNAPI EXECUTOR OP_BACKEND_ALLOPS BACKENDS
@@ -215,14 +210,14 @@ function run_benchmark_test()
# TFLite+CPU
unset USE_NNAPI
- run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH
+ run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN
# run onert
if [ "$TEST_OP" == "true" ]; then
# Operation test don't need to test each scheduler
- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "Linear" "$BACKEND_LIST"
+ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "Linear" "$BACKEND_LIST"
else
- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "$EXECUTORS" "$BACKEND_LIST"
+ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "$EXECUTORS" "$BACKEND_LIST"
fi
if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then
diff --git a/tests/scripts/common.sh b/tests/scripts/common.sh
index 88002909c..b2799c2d8 100755
--- a/tests/scripts/common.sh
+++ b/tests/scripts/common.sh
@@ -18,13 +18,12 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
function get_result_of_benchmark_test()
{
- local RUN_TEST_SH=$1
- local DRIVER_BIN=$2
- local MODEL=$3
- local LOG_FILE=$4
+ local DRIVER_BIN=$1
+ local MODEL=$2
+ local LOG_FILE=$3
local RET=0
- $RUN_TEST_SH --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
+ $MY_PATH/framework/run_test.sh --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
RET=$?
if [[ $RET -ne 0 ]]; then
echo "Testing $MODEL aborted... exit code: $RET"
@@ -68,7 +67,7 @@ function run_benchmark_and_print()
LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt
RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result
print_with_dots $MSG
- RESULT=$(get_result_of_benchmark_test $BENCHMARK_RUN_TEST_SH $DRIVER_BIN $MODEL $LOG_FILE)
+ RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE)
echo "$RESULT ms"
print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE
sleep $PAUSE_TIME_IN_SEC
diff --git a/tests/scripts/framework/run_test.sh b/tests/scripts/framework/run_test.sh
index 44b714974..9440c52c3 100755
--- a/tests/scripts/framework/run_test.sh
+++ b/tests/scripts/framework/run_test.sh
@@ -28,10 +28,12 @@ function Usage()
echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}"
echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2"
echo ""
- echo "--download - (default=off) Download model files. Other options is ignored"
- echo "--driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
- echo "--reportdir - (default=report) directory to place tap files"
- echo "--tapname - (default=framework_test.tap) file name to be written for tap"
+ echo "--download - (default=on) Download model files"
+ echo "--run - (default=on) Test model files"
+ echo "--driverbin - (default=../../Product/out/bin/tflite_run) Runner for runnning model tests"
+ echo "--reportdir - (default=report) Directory to place tap files"
+ echo "--tapname - (default=framework_test.tap) File name to be written for tap"
+ echo "--md5 - (default=on) MD5 check when download model files"
echo ""
}
@@ -43,9 +45,13 @@ function need_download()
return 0;
fi
# Ignore checking md5 in cache
+ # TODO Use "--md5" option only and remove IGNORE_MD5 environment variable
if [ ! -z $IGNORE_MD5 ] && [ "$IGNORE_MD5" == "1" ]; then
return 1
fi
+ if [ "$MD5_CHECK" = "off" ]; then
+ return 1
+ fi
LOCAL_HASH=$(md5sum $LOCAL_PATH | awk '{ print $1 }')
REMOTE_HASH=$(curl -ss $REMOTE_URL | md5sum | awk '{ print $1 }')
@@ -60,7 +66,9 @@ function need_download()
DRIVER_BIN=""
TAP_NAME="framework_test.tap"
TEST_LIST=()
-DOWNLOAD_MODE="off"
+DOWNLOAD_MODEL="on"
+RUN_TEST="on"
+MD5_CHECK="on"
# Support environment variable setting for mirror server
FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}"
@@ -84,6 +92,12 @@ do
--download=*)
DOWNLOAD_MODE=${i#*=}
;;
+ --md5=*)
+ MD5_CHECK=${i#*=}
+ ;;
+ --run=*)
+ RUN_TEST=${i#*=}
+ ;;
*)
TEST_LIST+=( $i )
;;
@@ -100,7 +114,7 @@ if [ ! -n "$DRIVER_BIN" ]; then
fi
# Check test driver setting
-if [ ! -e $DRIVER_BIN ] && [ "$DOWNLOAD_MODE" != "on" ]; then
+if [ ! -e $DRIVER_BIN ] && [ "$RUN_TEST" = "on" ]; then
echo "Cannot find test driver" $DRIVER_BIN ": please set proper DRIVER_BIN"
exit 1
fi
@@ -139,33 +153,9 @@ run_tests()
TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME
MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME
- MODELFILE_URL="$MODELFILE_SERVER_PATH/$MODELFILE_NAME"
- if [ -n "$FIXED_MODELFILE_SERVER" ]; then
- MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME"
- fi
-
- # Download model file
- if [ ! -e $TEST_CACHE_PATH ]; then
- mkdir -p $TEST_CACHE_PATH
- fi
-
- # Download unless we have it in cache (Also check md5sum)
- if need_download "$MODELFILE" "$MODELFILE_URL"; then
- echo ""
- echo "Download test file for $TEST_NAME"
- echo "======================"
-
- rm -f $MODELFILE # Remove invalid file if exists
- pushd $TEST_CACHE_PATH
- wget -nv $MODELFILE_URL
- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then
- unzip -o $MODELFILE_NAME
- fi
- popd
- fi
# Find model file for downloaded by zip
- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then
+ if [ "${MODELFILE_NAME##*.}" = "zip" ]; then
pushd $TEST_CACHE_PATH
MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite)
popd
@@ -178,7 +168,6 @@ run_tests()
# Run driver to test framework
$DRIVER_BIN $MODELFILE
- #$DRIVER_BIN $MODELFILE
if [[ $? -eq 0 ]]; then
echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME
else
@@ -268,10 +257,11 @@ find_tests()
mkdir -p $REPORT_DIR
TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]})
-if [[ "$DOWNLOAD_MODE" == "on" ]]; then
+if [ "$DOWNLOAD_MODEL" = "on" ]; then
download_tests $TESTS_TO_RUN
- exit 0;
fi
-run_tests $TESTS_TO_RUN
+if [ "$RUN_TEST" = "on" ]; then
+ run_tests $TESTS_TO_RUN
+fi
exit $?
diff --git a/tests/scripts/test-driver.sh b/tests/scripts/test-driver.sh
index 615fc2c13..a720b1537 100755
--- a/tests/scripts/test-driver.sh
+++ b/tests/scripts/test-driver.sh
@@ -38,7 +38,6 @@ function Usage()
echo "etc."
echo "--framework_driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
echo "--verification_driverbin - (default=../../Product/out/bin/nnapi_test) runner for runnning verification tests"
- echo "--runtestsh - (default=\$ARTIFACT_PATH/tests/scripts/framework/run_test.sh) run_test.sh with path where it is for framework test and verification"
echo "--unittestdir - (default=\$ARTIFACT_PATH/Product/out/unittest) directory that has unittest binaries for unit test"
echo ""
echo "--reportdir - (default=\$ARTIFACT_PATH/report) directory to save report"
@@ -49,7 +48,6 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )"
ARTIFACT_PATH="$TEST_DRIVER_DIR/../../"
FRAMEWORK_DRIVER_BIN=""
VERIFICATION_DRIVER_BIN=""
-RUN_TEST_SH=""
UNIT_TEST_DIR=""
ALLTEST_ON="true"
UNITTEST_ON="false"
@@ -74,9 +72,6 @@ do
--verification_driverbin=*)
VERIFICATION_DRIVER_BIN=${i#*=}
;;
- --runtestsh=*)
- RUN_TEST_SH=${i#*=}
- ;;
--unittestdir=*)
UNIT_TEST_DIR=${i#*=}
;;
@@ -116,15 +111,6 @@ done
ARTIFACT_PATH="$(readlink -f $ARTIFACT_PATH)"
-if [ -z "$RUN_TEST_SH" ]; then
- RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh
-fi
-
-if [ ! -e "$RUN_TEST_SH" ]; then
- echo "Cannot find $RUN_TEST_SH"
- exit 1
-fi
-
if [ -z "$UNIT_TEST_DIR" ]; then
UNIT_TEST_DIR=$ARTIFACT_PATH/Product/out/unittest
fi
@@ -149,7 +135,6 @@ if [ "$FRAMEWORKTEST_ON" == "true" ]; then
fi
$TEST_DRIVER_DIR/test_framework.sh \
- --runtestsh=$RUN_TEST_SH \
--driverbin=$FRAMEWORK_DRIVER_BIN \
--reportdir=$REPORT_DIR \
--tapname=framework_test.tap \
@@ -166,7 +151,6 @@ if [ "$ALLTEST_ON" == "true" ] || [ "$VERIFICATION_ON" == "true" ]; then
# verification uses the same script as frameworktest does
$TEST_DRIVER_DIR/test_framework.sh \
- --runtestsh=$RUN_TEST_SH \
--driverbin=$VERIFICATION_DRIVER_BIN \
--reportdir=$REPORT_DIR \
--tapname=verification_test.tap \
@@ -180,7 +164,6 @@ if [ "$BENCHMARK_ONERT_OP_ON" == "true" ]; then
$TEST_DRIVER_DIR/benchmark_nnapi.sh \
--test_op \
- --runtestsh=$RUN_TEST_SH \
--driverbin=$DRIVER_BIN \
--reportdir=$REPORT_DIR/benchmark_op \
--modelfilepath=$ARTIFACT_PATH/tests/scripts/framework
diff --git a/tests/scripts/test_framework.sh b/tests/scripts/test_framework.sh
index 1d9751562..bd86cd371 100755
--- a/tests/scripts/test_framework.sh
+++ b/tests/scripts/test_framework.sh
@@ -14,7 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-FWTEST_RUN_TEST_SH=
+MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
FWTEST_DRIVER_BIN=
FWTEST_REPORT_DIR=
FWTEST_TAP_NAME=
@@ -25,7 +26,6 @@ function Usage()
{
echo "Usage Example:"
echo "./$0 \\"
- echo " --runtestsh=tests/scripts/framework/run_test.sh \\ # Test runner script path"
echo " --driverbin=Product/out/bin/tflite_run \\ # Test driver path"
echo " --frameworktest_list_file=tests/scripts/list/frameworktest_list.armv7l.cpu.txt \\"
echo " --reportdir=report \\ # Directory for the report files will be saved"
@@ -42,9 +42,6 @@ do
-h|--help|help)
Usage
;;
- --runtestsh=*)
- FWTEST_RUN_TEST_SH=${i#*=}
- ;;
--driverbin=*)
FWTEST_DRIVER_BIN=${i#*=}
;;
@@ -67,7 +64,6 @@ do
shift
done
-[ ! -z "$FWTEST_RUN_TEST_SH" ] || Usage
[ ! -z "$FWTEST_DRIVER_BIN" ] || Usage
[ ! -z "$FWTEST_REPORT_DIR" ] || Usage
[ ! -z "$FWTEST_TAP_NAME" ] || Usage
@@ -86,7 +82,7 @@ if [ ! -z "$FRAMEWORKTEST_LIST_FILE" ]; then
MODELLIST=$(cat "${FRAMEWORKTEST_LIST_FILE}")
fi
-$FWTEST_RUN_TEST_SH --driverbin=$FWTEST_DRIVER_BIN \
+$MY_PATH/framework/run_test.sh --driverbin=$FWTEST_DRIVER_BIN \
--reportdir=$FWTEST_REPORT_DIR \
--tapname=$FWTEST_TAP_NAME \
${MODELLIST:-} \
diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt
index 0e333a010..ec45db4f6 100644
--- a/tests/tools/nnpackage_run/CMakeLists.txt
+++ b/tests/tools/nnpackage_run/CMakeLists.txt
@@ -33,7 +33,7 @@ target_include_directories(nnpackage_run PRIVATE src)
target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS})
target_link_libraries(nnpackage_run onert_core onert tflite_loader)
-target_link_libraries(nnpackage_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite jsoncpp)
+target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp)
target_link_libraries(nnpackage_run nnfw-dev)
target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries(nnpackage_run nnfw_lib_benchmark)
diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc
index 0dbcafc33..cb4a7dbaa 100644
--- a/tests/tools/nnpackage_run/src/args.cc
+++ b/tests/tools/nnpackage_run/src/args.cc
@@ -16,6 +16,7 @@
#include "args.h"
+#include <functional>
#include <iostream>
#include <json/json.h>
@@ -105,6 +106,75 @@ Args::Args(const int argc, char **argv)
void Args::Initialize(void)
{
+ auto process_nnpackage = [&](const std::string &package_filename) {
+ _package_filename = package_filename;
+
+ std::cerr << "Package Filename " << _package_filename << std::endl;
+ if (_package_filename.empty())
+ {
+ // TODO Print usage instead of the below message
+ std::cerr << "Please specify nnpackage file. Run with `--help` for usage."
+ << "\n";
+
+ exit(1);
+ }
+ else
+ {
+ if (access(_package_filename.c_str(), F_OK) == -1)
+ {
+ std::cerr << "nnpackage not found: " << _package_filename << "\n";
+ }
+ }
+ };
+
+ auto process_output_sizes = [&](const std::string &output_sizes_json_str) {
+ Json::Value root;
+ Json::Reader reader;
+ if (!reader.parse(output_sizes_json_str, root, false))
+ {
+ std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n";
+ exit(1);
+ }
+
+ auto arg_map = argArrayToMap(root);
+ for (auto &pair : arg_map)
+ {
+ uint32_t key = pair.first;
+ Json::Value &val_json = pair.second;
+ if (!val_json.isUInt())
+ {
+ std::cerr << "All the values in `output_sizes` must be unsigned integers\n";
+ exit(1);
+ }
+ uint32_t val = val_json.asUInt();
+ _output_sizes[key] = val;
+ }
+ };
+
+ auto process_shape_prepare = [&](const std::string &shape_str) {
+ try
+ {
+ handleShapeParam(_shape_prepare, shape_str);
+ }
+ catch (const std::exception &e)
+ {
+ std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl;
+ exit(1);
+ }
+ };
+
+ auto process_shape_run = [&](const std::string &shape_str) {
+ try
+ {
+ handleShapeParam(_shape_run, shape_str);
+ }
+ catch (const std::exception &e)
+ {
+ std::cerr << "error with '--shape_run' option: " << shape_str << std::endl;
+ exit(1);
+ }
+ };
+
// General options
po::options_description general("General options", 100);
@@ -112,32 +182,33 @@ void Args::Initialize(void)
general.add_options()
("help,h", "Print available options")
("version", "Print version and exit immediately")
- ("nnpackage", po::value<std::string>()->required())
+ ("nnpackage", po::value<std::string>()->required()->notifier(process_nnpackage))
#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
- ("dump,d", po::value<std::string>()->default_value(""), "Output filename")
- ("load,l", po::value<std::string>()->default_value(""), "Input filename")
+ ("dump,d", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename")
+ ("load,l", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _load_filename = v; }), "Input filename")
#endif
- ("output_sizes", po::value<std::string>(),
+ ("output_sizes", po::value<std::string>()->notifier(process_output_sizes),
"The output buffer size in JSON 1D array\n"
"If not given, the model's output sizes are used\n"
"e.g. '[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd tensor to 80.\n")
- ("num_runs,r", po::value<int>()->default_value(1), "The number of runs")
- ("warmup_runs,w", po::value<int>()->default_value(0), "The number of warmup runs")
- ("run_delay,t", po::value<int>()->default_value(-1), "Delay time(ms) between runs (as default no delay")
- ("gpumem_poll,g", po::value<bool>()->default_value(false), "Check gpu memory polling separately")
- ("mem_poll,m", po::value<bool>()->default_value(false), "Check memory polling")
- ("write_report,p", po::value<bool>()->default_value(false),
+ ("num_runs,r", po::value<int>()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs")
+ ("warmup_runs,w", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs")
+ ("run_delay,t", po::value<int>()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay")
+ ("gpumem_poll,g", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately")
+ ("mem_poll,m", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _mem_poll = v; }), "Check memory polling")
+ ("write_report,p", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }),
"Write report\n"
"{exec}-{nnpkg}-{backend}.csv will be generated.\n"
"e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n"
"{nnpkg} name may be changed to realpath if you use symbolic-link.")
- ("shape_prepare", po::value<std::string>()->default_value("[]"),
+ ("shape_prepare", po::value<std::string>()->default_value("[]")->notifier(process_shape_prepare),
"set shape of specified tensor before compilation\n"
"e.g. '[0, [1, 2], 2, []]' to set 0th tensor to [1, 2] and 2nd tensor to [].\n")
- ("shape_run", po::value<std::string>()->default_value("[]"),
+ ("shape_run", po::value<std::string>()->default_value("[]")->notifier(process_shape_run),
"set shape of specified tensor right before running\n"
"e.g. '[1, [1, 2]]` to set 1st tensor to [1, 2].\n")
- ("verbose_level,v", po::value<int>()->default_value(0), "Verbose level\n"
+ ("verbose_level,v", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }),
+ "Verbose level\n"
"0: prints the only result. Messages btw run don't print\n"
"1: prints result and message btw run\n"
"2: prints all of messages to print\n")
@@ -180,158 +251,23 @@ void Args::Parse(const int argc, char **argv)
return;
}
- po::notify(vm);
try
{
-#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
- if (vm.count("dump"))
- {
- _dump_filename = vm["dump"].as<std::string>();
- }
-
- if (vm.count("load"))
- {
- _load_filename = vm["load"].as<std::string>();
- }
-#endif
-
- if (vm.count("nnpackage"))
- {
- _package_filename = vm["nnpackage"].as<std::string>();
-
- if (_package_filename.empty())
- {
- // TODO Print usage instead of the below message
- std::cerr << "Please specify nnpackage file. Run with `--help` for usage."
- << "\n";
-
- exit(1);
- }
- else
- {
- if (access(_package_filename.c_str(), F_OK) == -1)
- {
- std::cerr << "nnpackage not found: " << _package_filename << "\n";
- }
- }
- }
-
- if (vm.count("output_sizes"))
- {
- auto output_sizes_json_str = vm["output_sizes"].as<std::string>();
-
- Json::Value root;
- Json::Reader reader;
- if (!reader.parse(output_sizes_json_str, root, false))
- {
- std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n";
- exit(1);
- }
-
- auto arg_map = argArrayToMap(root);
- for (auto &pair : arg_map)
- {
- uint32_t key = pair.first;
- Json::Value &val_json = pair.second;
- if (!val_json.isUInt())
- {
- std::cerr << "All the values in `output_sizes` must be unsigned integers\n";
- exit(1);
- }
- uint32_t val = val_json.asUInt();
- _output_sizes[key] = val;
- }
- }
-
- if (vm.count("num_runs"))
- {
- _num_runs = vm["num_runs"].as<int>();
- }
-
- if (vm.count("warmup_runs"))
- {
- _warmup_runs = vm["warmup_runs"].as<int>();
- }
-
- if (vm.count("run_delay"))
- {
- _run_delay = vm["run_delay"].as<int>();
- }
-
- if (vm.count("gpumem_poll"))
- {
- _gpumem_poll = vm["gpumem_poll"].as<bool>();
- }
-
- if (vm.count("mem_poll"))
- {
- _mem_poll = vm["mem_poll"].as<bool>();
- // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP
- if (_mem_poll && _warmup_runs == 0)
- {
- _warmup_runs = 1;
- }
- }
-
- if (vm.count("write_report"))
- {
- _write_report = vm["write_report"].as<bool>();
- }
-
- if (vm.count("verbose_level"))
- {
- _verbose_level = vm["verbose_level"].as<int>();
- }
+ po::notify(vm);
}
catch (const std::bad_cast &e)
{
- std::cerr << "error by bad cast" << e.what() << '\n';
+ std::cerr << "Bad cast error - " << e.what() << '\n';
exit(1);
}
- if (vm.count("shape_prepare"))
- {
- std::string shape_str;
- try
- {
- shape_str = vm["shape_prepare"].as<std::string>();
- }
- catch (const std::bad_cast &e)
- {
- std::cerr << "error by bad cast with '--shape_prepare' option" << e.what() << '\n';
- exit(1);
- }
- try
- {
- handleShapeParam(_shape_prepare, shape_str);
- }
- catch (const std::exception &e)
- {
- std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl;
- exit(1);
- }
- }
-
- if (vm.count("shape_run"))
+ // This must run after `notify`, as `_warmup_runs` must already have been processed.
+ if (vm.count("mem_poll"))
{
- std::string shape_str;
- try
- {
- shape_str = vm["shape_run"].as<std::string>();
- }
- catch (const std::bad_cast &e)
+ // Memory polling runs during WARMUP instead of EXECUTE to avoid overhead
+ if (_mem_poll && _warmup_runs == 0)
{
- std::cerr << "error by bad cast with '--shape_run' option" << e.what() << '\n';
- exit(1);
- }
- try
- {
- handleShapeParam(_shape_run, shape_str);
- }
- catch (const std::exception &e)
- {
- std::cerr << "error with '--shape_run' option: " << shape_str << std::endl;
- exit(1);
+ _warmup_runs = 1;
}
}
}
diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc
index 34c075c1a..09ace4798 100644
--- a/tests/tools/nnpackage_run/src/h5formatter.cc
+++ b/tests/tools/nnpackage_run/src/h5formatter.cc
@@ -145,6 +145,7 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector<Allocatio
data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT64);
break;
}
+ case NNFW_TYPE_TENSOR_UINT8:
case NNFW_TYPE_TENSOR_QUANT8_ASYMM:
{
H5::DataSet data_set =
@@ -159,13 +160,6 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector<Allocatio
data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT8);
break;
}
- case NNFW_TYPE_TENSOR_UINT8:
- {
- H5::DataSet data_set =
- value_group.createDataSet(std::to_string(i), H5::PredType::STD_U8BE, data_space);
- data_set.write(outputs[i].data(), H5::PredType::NATIVE_UINT8);
- break;
- }
default:
throw std::runtime_error("nnpkg_run can dump f32, i32, qasymm8, bool and uint8.");
}
diff --git a/tests/tools/tflite_loader/CMakeLists.txt b/tests/tools/tflite_loader/CMakeLists.txt
index 5a9e3a8ff..0fe1c69de 100644
--- a/tests/tools/tflite_loader/CMakeLists.txt
+++ b/tests/tools/tflite_loader/CMakeLists.txt
@@ -17,7 +17,7 @@ add_executable(tflite_loader_test_tool ${SOURCES})
target_include_directories(tflite_loader_test_tool PRIVATE ${Boost_INCLUDE_DIRS})
target_link_libraries(tflite_loader_test_tool onert_core onert tflite_loader)
-target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_misc)
+target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite nnfw_lib_misc)
target_link_libraries(tflite_loader_test_tool ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY})
install(TARGETS tflite_loader_test_tool DESTINATION bin)
diff --git a/tests/tools/tflite_run/CMakeLists.txt b/tests/tools/tflite_run/CMakeLists.txt
index 19e7126b0..3f30d3e32 100644
--- a/tests/tools/tflite_run/CMakeLists.txt
+++ b/tests/tools/tflite_run/CMakeLists.txt
@@ -13,7 +13,7 @@ add_executable(tflite_run ${TFLITE_RUN_SRCS})
target_include_directories(tflite_run PRIVATE src)
target_include_directories(tflite_run PRIVATE ${Boost_INCLUDE_DIRS})
-target_link_libraries(tflite_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite)
+target_link_libraries(tflite_run nnfw_lib_tflite)
target_link_libraries(tflite_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries(tflite_run nnfw_lib_benchmark)
diff --git a/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh b/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
index cf3e54406..bbc5b3e6c 100755
--- a/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
+++ b/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
@@ -62,6 +62,7 @@ tflite
"
model_type=""
+tf_intf_version=""
for ext in $supported_model_types; do
[ -e "$indir/$tcname"."$ext" ] && model_type=$ext
@@ -73,7 +74,9 @@ if [[ "$model_type" == "" ]]; then
fi
if [[ "$model_type" == "pb" ]]; then
- $tf2nnpkg --info "$indir/$tcname".info --graphdef "$indir/$tcname"."$model_type" -o "$outdir"
+ [ -f "$indir/$tcname"."v2" ] && tf_intf_version="--v2"
+ $tf2nnpkg --info "$indir/$tcname".info --graphdef "$indir/$tcname"."$model_type" \
+ "$tf_intf_version" -o "$outdir"
else
$model2nnpkg -o "$outdir" "$indir/$tcname"."$model_type"
fi
diff --git a/tools/tflitefile_tool/select_operator.py b/tools/tflitefile_tool/select_operator.py
index 1ad44a389..333ca32f6 100755
--- a/tools/tflitefile_tool/select_operator.py
+++ b/tools/tflitefile_tool/select_operator.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
#
@@ -1180,23 +1180,6 @@ def GenerateModel(args, new_builder, sample_model, operator_list, new_input_tens
return tflite.Model.ModelEnd(new_builder)
-def Finish(new_builder, new_model):
- # Cusrom implementation: identifier
- # Python API don't support identifier input yet
- # Reference: Finish(self, rootTable)) in builder.py, Finish(uoffset_t root, const char *file_identifier, bool size_prefix) in flatbuffers.h
- new_builder.Prep(new_builder.minalign,
- flatbuffers.number_types.UOffsetTFlags.bytewidth)
-
- new_builder.PrependByte(0x33)
- new_builder.PrependByte(0x4c)
- new_builder.PrependByte(0x46)
- new_builder.PrependByte(0x54)
-
- new_builder.PrependUOffsetTRelative(new_model)
- new_builder.finished = True
- return new_builder.Head()
-
-
def main(args):
input_model_file = args.input_model
oplist_file = args.opcode_list
@@ -1343,7 +1326,7 @@ def main(args):
new_input_tensors, new_output_tensors, used_tensors_dic,
used_buffers_dic, used_opcodes_dic, used_subgraphs_dic)
- Finish(new_builder, new_model)
+ new_builder.Finish(new_model, file_identifier=b'TFL3')
new_buf = new_builder.Output()
output_model_file.write(new_buf)
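
The one-line replacement above works because recent FlatBuffers Python releases let `Builder.Finish` write the 4-byte file identifier itself; the deleted helper achieved the same effect by prepending the identifier bytes ('T', 'F', 'L', '3') by hand. A minimal sketch, assuming a flatbuffers version whose `Builder.Finish` accepts the `file_identifier` keyword (the table contents are placeholders):

    # Serialize a trivial table and stamp the "TFL3" identifier via Finish().
    import flatbuffers

    builder = flatbuffers.Builder(1024)
    name = builder.CreateString("example")         # stand-in for real model data
    builder.StartObject(1)
    builder.PrependUOffsetTRelativeSlot(0, name, 0)
    root = builder.EndObject()

    builder.Finish(root, file_identifier=b"TFL3")  # replaces the hand-rolled Finish()
    buf = builder.Output()
    assert buf[4:8] == b"TFL3"                     # identifier follows the root offset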
diff --git a/tools/tflkit/README.md b/tools/tflkit/README.md
index a0c40c6fa..9e1883436 100644
--- a/tools/tflkit/README.md
+++ b/tools/tflkit/README.md
@@ -1,4 +1,4 @@
-# tflkit
+# tflkit
## Purpose
@@ -114,11 +114,11 @@ Number of all operators : 126 (total instrs: 11,484,469
### TensorFlow
-TensorFlow provides some kinds of converting guideline. In Python, the [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/contrib/lite/TFLiteConverter) class will help you to convert a TensorFlow GraphDef or SavedModel into `output_format` using TOCO. The `output_format` can be `TFLITE` or `GRAPHVIZ_DOT` format. The default `output_format` is `TFLITE`. And there is a Python command line interface for running TOCO, and its name is [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py). This converts a TensorFlow GraphDef or SavedModel into `TFLITE` or `GRAPHVIZ_DOT` format like [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/contrib/lite/TFLiteConverter). These two way also supports to convert a TensorFlow Keras model into `output_format`. Both functions are implemented using a tool called TOCO.
+TensorFlow provides several ways to convert a model. In Python, the [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/lite/TFLiteConverter) class helps you convert a TensorFlow GraphDef or SavedModel into `output_format` using TOCO. The `output_format` can be `TFLITE` or `GRAPHVIZ_DOT`; the default is `TFLITE`. There is also a Python command line interface for running TOCO, called [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py), which converts a TensorFlow GraphDef or SavedModel into `TFLITE` or `GRAPHVIZ_DOT` format just like [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/lite/TFLiteConverter). Both ways also support converting a TensorFlow Keras model into `output_format`, and both are implemented on top of a tool called TOCO.
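
As a rough illustration of the GraphDef-to-`TFLITE` path described above (not tflkit-specific), a frozen GraphDef can be converted with the TF 1.x-style converter API; the file names and input/output array names below are placeholders:

    # Rough sketch of converting a frozen GraphDef to a TFLITE file.
    import tensorflow as tf

    converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(
        graph_def_file="model.pb",      # placeholder paths and array names
        input_arrays=["input"],
        output_arrays=["output"])
    tflite_model = converter.convert()

    with open("model.tflite", "wb") as f:
        f.write(tflite_model)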
### with tflkit
-The tflkit uses the [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py) python command line interface to convert a TensorFlow model into TfLite model. It only supports to convert a TensorFlow GraphDef file into `TFLITE` format file. This tool supports the creation of individual `TFLITE` files for different input shapes. When converting to multiple `TFLITE` files, it needs to put a string called `NAME` in `TFLITE_PATH`. The string `NAME` will be replaced by what is listed in teh `NAME` environment. This tool requires an information file as a parameter. There is an [example file](info/convert.template) for a convert information. The `--tensorflow_path` and `--tensorflow_version` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The tflkit uses the [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py) Python command line interface to convert a TensorFlow model into a TF Lite model. It only supports converting a TensorFlow GraphDef file into a `TFLITE` format file. This tool supports the creation of individual `TFLITE` files for different input shapes. When converting to multiple `TFLITE` files, a string called `NAME` must be put in `TFLITE_PATH`; the string `NAME` will be replaced by what is listed in the `NAME` environment. This tool requires an information file as a parameter. There is an [example file](convert.template) for the convert information. The `--tensorflow_path` and `--tensorflow_version` options can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Convert information:
* GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
@@ -176,7 +176,7 @@ The input and output file of this tool is a TensorFlow GraphDef file.
### with tflkit
-The [optimize_for_inference.sh](optimize_for_inference.sh) file invokes the TensorFlow [optimize tool](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference.py). This tool requires a optimize information file as a parameter. Here is an [example file](info/optimize.template) for this tool. The information file needs `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The [optimize_for_inference.sh](optimize_for_inference.sh) file invokes the TensorFlow [optimize tool](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference.py). This tool requires an optimize information file as a parameter. Here is an [example file](optimize.template) for this tool. The information file needs the `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to determine the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Optimize information:
* GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
@@ -207,7 +207,7 @@ The trained TensorFlow model can be trasformed by some variants to deploy it in
### with tflkit
-The [transform_graph.sh](transform_graph.sh) file supports to transform a TensorFlow GraphDef using various transform options. This tool requires a transform information file as a parameter and the transform options are described in the information file. There is an [example file](info/transform.template) for this tool. The information file needs `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The [transform_graph.sh](transform_graph.sh) file supports transforming a TensorFlow GraphDef with various transform options. This tool requires a transform information file as a parameter; the transform options are described in the information file. There is an [example file](transform.template) for this tool. The information file needs the `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to determine the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Transform information:
* GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
@@ -270,7 +270,7 @@ The [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorfl
### with tflkit
-The tflkit provides the simple way to create a frozen graph using [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py) tool. This tool requires an information file as a parameter. There is an [example file](info/freeze.info) for a freeze tool. Either `SAVED_MODEL` or `META_GRAPH` must be declared. And `META_GRAPH` is always used with `CKPT_PATH`. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The tflkit provides a simple way to create a frozen graph using the [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py) tool. This tool requires an information file as a parameter. There is an [example file](freeze.template) for the freeze tool. Either `SAVED_MODEL` or `META_GRAPH` must be declared, and `META_GRAPH` is always used together with `CKPT_PATH`. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Freeze information:
* SAVED_MODEL : Full directory path with TensorFlow `SavedModel` file and variables.
diff --git a/tools/update_version/update-version b/tools/update_version/update-version
index 41693278b..1b77c10cd 100644
--- a/tools/update_version/update-version
+++ b/tools/update_version/update-version
@@ -40,11 +40,12 @@ fi
version=$1
-sed -i "s/^release = .*/release = \'$version\'/" ${nnfw_root}/docs/conf.py
-sed -i "s/^Version: .*/Version: $version/" ${nnfw_root}/packaging/nnfw.spec
+perl -pi -e "s/^release = .*/release = \'$version\'/" ${nnfw_root}/docs/conf.py
-IFS=. read M m p <<< $version
+perl -pi -e "s/^Version: .*/Version: $version/" ${nnfw_root}/packaging/nnfw.spec
+
+IFS=. read M m p <<< "$version"
hex=$(printf '0x%08x' $(( (($M << 24)) | (($m << 8)) | $p )))
-sed -i "s/^#define NNFW_VERSION.*/#define NNFW_VERSION $hex/" ${nnfw_root}/runtime/onert/api/include/nnfw_version.h
+perl -pi -e "s/^#define NNFW_VERSION.*/#define NNFW_VERSION $hex/" ${nnfw_root}/runtime/onert/api/include/nnfw_version.h
-sed -i "s/versionName .*$/versionName \"$version\"/" ${nnfw_root}/runtime/contrib/android/api/build.gradle
+perl -pi -e "s/versionName .*$/versionName \"$version\"/" ${nnfw_root}/runtime/contrib/android/api/build.gradle