summaryrefslogtreecommitdiff
path: root/libs
diff options
context:
space:
mode:
Diffstat (limited to 'libs')
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h148
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h79
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h78
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h106
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h81
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h58
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h62
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h59
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h61
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h58
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h113
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h57
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h61
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h129
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h55
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h81
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h59
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h60
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h73
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h78
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h73
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h104
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h78
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h69
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h58
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h59
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h (renamed from libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h)92
-rw-r--r--libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h478
-rw-r--r--libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h113
-rw-r--r--libs/ARMComputeEx/arm_compute/core/TypesEx.h100
-rw-r--r--libs/ARMComputeEx/arm_compute/core/UtilsEx.h37
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h63
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h114
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h62
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h45
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h41
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h27
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h42
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h44
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h54
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h38
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h37
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h59
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h39
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h77
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h40
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h47
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h51
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h34
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h81
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h87
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h73
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h56
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h44
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h40
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h69
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h58
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h60
-rw-r--r--libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h83
-rw-r--r--libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp360
-rw-r--r--libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp123
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl89
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl94
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl74
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl30
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl70
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl84
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl56
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl86
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl93
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl69
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl84
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl57
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h565
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl26
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl88
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h38
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl48
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl86
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl72
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl26
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl45
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl26
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl74
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl88
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl60
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl152
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl69
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl163
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl69
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl75
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl104
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl63
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl26
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl26
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl26
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp211
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp159
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp216
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp117
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp173
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp17
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp212
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp109
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp114
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp77
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp21
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp177
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp89
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp166
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp185
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp149
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp126
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp54
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp129
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp181
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp198
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp238
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp113
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp170
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp (renamed from libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp)121
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp27
-rw-r--r--libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp294
-rw-r--r--libs/ARMComputeEx/src/core/UtilsEx.cpp34
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp35
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp120
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp46
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp28
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp39
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp1
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp40
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp28
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp29
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp28
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp4
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp29
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp28
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp50
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp39
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp28
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp36
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp3
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp121
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp123
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp51
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp29
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp28
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp39
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp307
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp30
-rw-r--r--libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp5
-rw-r--r--libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp74
-rw-r--r--libs/ARMComputeEx/src/runtime/topk_v2.h68
-rw-r--r--libs/CMakeLists.txt7
-rw-r--r--libs/cpp14/CMakeLists.txt2
-rw-r--r--libs/cpp14/include/cpp14/memory.h29
-rw-r--r--libs/misc/CMakeLists.txt13
-rw-r--r--libs/misc/examples/tensor_index_iterator.cpp (renamed from libs/util/examples/tensor_index_iterator.cpp)12
-rw-r--r--libs/misc/include/misc/EnvVar.h107
-rw-r--r--libs/misc/include/misc/benchmark.h87
-rw-r--r--libs/misc/include/misc/environment.h130
-rw-r--r--libs/misc/include/misc/feature/Index.h137
-rw-r--r--libs/misc/include/misc/feature/IndexIterator.h105
-rw-r--r--libs/misc/include/misc/feature/Object.h117
-rw-r--r--libs/misc/include/misc/feature/Reader.h69
-rw-r--r--libs/misc/include/misc/feature/Shape.h77
-rw-r--r--libs/misc/include/misc/feature/TextFormatter.h116
-rw-r--r--libs/misc/include/misc/fp32.h99
-rw-r--r--libs/misc/include/misc/kernel/IndexIterator.h102
-rw-r--r--libs/misc/include/misc/kernel/RandomObject.h77
-rw-r--r--libs/misc/include/misc/kernel/Reader.h60
-rw-r--r--libs/misc/include/misc/kernel/Shape.h68
-rw-r--r--libs/misc/include/misc/matrix/IndexIterator.h99
-rw-r--r--libs/misc/include/misc/matrix/Reader.h59
-rw-r--r--libs/misc/include/misc/matrix/Shape.h63
-rw-r--r--libs/misc/include/misc/tensor/Comparator.h95
-rw-r--r--libs/misc/include/misc/tensor/Diff.h70
-rw-r--r--libs/misc/include/misc/tensor/Index.h105
-rw-r--r--libs/misc/include/misc/tensor/IndexEnumerator.h131
-rw-r--r--libs/misc/include/misc/tensor/IndexFormatter.h75
-rw-r--r--libs/misc/include/misc/tensor/IndexIterator.h107
-rw-r--r--libs/misc/include/misc/tensor/NonIncreasingStride.h83
-rw-r--r--libs/misc/include/misc/tensor/Object.h100
-rw-r--r--libs/misc/include/misc/tensor/Reader.h58
-rw-r--r--libs/misc/include/misc/tensor/Shape.h152
-rw-r--r--libs/misc/include/misc/tensor/Zipper.h104
-rw-r--r--libs/misc/include/misc/vector.h52
-rw-r--r--libs/misc/include/misc/vector/Object.h92
-rw-r--r--libs/misc/include/misc/vector/Reader.h58
-rw-r--r--libs/misc/src/environment.cpp (renamed from libs/util/src/environment.cpp)10
-rw-r--r--libs/misc/src/tensor/Comparator.cpp (renamed from libs/util/src/tensor/Comparator.cpp)12
-rw-r--r--libs/misc/src/tensor/IndexFormatter.cpp (renamed from libs/util/src/tensor/IndexFormatter.cpp)6
-rw-r--r--libs/misc/src/tensor/NonIncreasingStride.cpp (renamed from libs/util/src/tensor/NonIncreasingStride.cpp)6
-rw-r--r--libs/misc/src/tensor/Shape.cpp (renamed from libs/util/src/tensor/Shape.cpp)6
-rw-r--r--libs/profiling/CMakeLists.txt5
-rw-r--r--libs/profiling/include/profiling/profile_buffer.h170
-rw-r--r--libs/profiling/include/profiling/profiler.h203
-rw-r--r--libs/profiling/include/profiling/profiling.h57
-rw-r--r--libs/profiling/include/profiling/time.h35
-rw-r--r--libs/profiling/src/profiling/time.cpp55
-rw-r--r--libs/support/CMakeLists.txt2
-rw-r--r--libs/support/nnapi/CMakeLists.txt6
-rw-r--r--libs/support/nnapi/src/Utils.cpp29
-rw-r--r--libs/support/tflite/CMakeLists.txt12
-rw-r--r--libs/support/tflite/src/TensorView.test.cpp53
-rw-r--r--libs/support/tflite/src/kernels/RSQRT.cpp83
-rw-r--r--libs/support/tflite/src/nnapi_delegate.cpp720
-rw-r--r--libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc41
-rw-r--r--libs/tflite/CMakeLists.txt12
-rw-r--r--libs/tflite/include/tflite/Assert.h45
-rw-r--r--libs/tflite/include/tflite/Diff.h199
-rw-r--r--libs/tflite/include/tflite/FeatureView.h106
-rw-r--r--libs/tflite/include/tflite/InputIndex.h60
-rw-r--r--libs/tflite/include/tflite/InterpreterSession.h99
-rw-r--r--libs/tflite/include/tflite/NNAPISession.h101
-rw-r--r--libs/tflite/include/tflite/OutputIndex.h60
-rw-r--r--libs/tflite/include/tflite/Quantization.h44
-rw-r--r--libs/tflite/include/tflite/Session.h69
-rw-r--r--libs/tflite/include/tflite/TensorLogger.h168
-rw-r--r--libs/tflite/include/tflite/TensorShapeUtils.h64
-rw-r--r--libs/tflite/include/tflite/TensorUtils.h54
-rw-r--r--libs/tflite/include/tflite/TensorView.h120
-rw-r--r--libs/tflite/include/tflite/ext/kernels/Abs.h (renamed from libs/support/nnapi/src/feature/Utils.cpp)34
-rw-r--r--libs/tflite/include/tflite/ext/kernels/CustomOps.h60
-rw-r--r--libs/tflite/include/tflite/ext/kernels/SquaredDifference.h76
-rw-r--r--libs/tflite/include/tflite/ext/kernels/TensorFlowMax.h75
-rw-r--r--libs/tflite/include/tflite/ext/kernels/TensorFlowSum.h41
-rw-r--r--libs/tflite/include/tflite/ext/kernels/register.h42
-rw-r--r--libs/tflite/include/tflite/ext/nnapi_delegate.h97
-rw-r--r--libs/tflite/include/tflite/interp/Builder.h53
-rw-r--r--libs/tflite/include/tflite/interp/FlatBufferBuilder.h64
-rw-r--r--libs/tflite/include/tflite/interp/FunctionBuilder.h67
-rw-r--r--libs/tflite/src/Diff.cpp (renamed from libs/support/tflite/src/Diff.cpp)218
-rw-r--r--libs/tflite/src/FeatureView.cpp (renamed from libs/support/tflite/src/FeatureView.cpp)11
-rw-r--r--libs/tflite/src/Quantization.cpp (renamed from libs/support/tflite/src/Quantization.cpp)2
-rw-r--r--libs/tflite/src/TensorShapeUtils.cpp (renamed from libs/support/tflite/src/TensorShapeUtils.cpp)11
-rw-r--r--libs/tflite/src/TensorView.test.cpp53
-rw-r--r--libs/tflite/src/ext/kernels/Abs.cpp103
-rw-r--r--libs/tflite/src/ext/kernels/SquaredDifference.cpp (renamed from libs/support/tflite/src/kernels/SquaredDifference.cpp)29
-rw-r--r--libs/tflite/src/ext/kernels/TensorFlowMax.cpp (renamed from libs/support/tflite/src/kernels/TensorFlowMax.cpp)85
-rw-r--r--libs/tflite/src/ext/kernels/TensorFlowSum.cpp400
-rw-r--r--libs/tflite/src/ext/kernels/register.cpp (renamed from libs/support/tflite/src/kernels/register.cpp)82
-rw-r--r--libs/tflite/src/ext/nnapi_delegate.cpp1209
-rw-r--r--libs/tflite/src/ext/nnapi_delegate_ex_AddOpsAndParams_lambda.inc106
-rw-r--r--libs/tflite/src/interp/FlatBufferBuilder.cpp (renamed from libs/support/tflite/src/interp/FlatBufferBuilder.cpp)12
-rw-r--r--libs/tflite/src/interp/FunctionBuilder.cpp (renamed from libs/support/tflite/src/interp/FunctionBuilder.cpp)8
-rw-r--r--libs/util/CMakeLists.txt23
-rw-r--r--libs/util/src/profiling/time.cc49
248 files changed, 17663 insertions, 4105 deletions
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
index 026487077..e4e752ef9 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -14,6 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLKernelLibraryEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines
+ * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL.
+ */
+
#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
@@ -27,58 +35,76 @@
namespace arm_compute
{
-/** CLKernelLibrary class */
+/**
+ * @brief Class to build OpenCL kernels added from nnfw
+ * */
class CLKernelLibraryEx
{
using StringSet = std::set<std::string>;
private:
- /** Default Constructor. */
+ /**
+ * @brief Construct a new CLKernelLibraryEx object
+ */
CLKernelLibraryEx();
public:
- /** Prevent instances of this class from being copied */
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
- /** Prevent instances of this class from being copied */
+
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
- /** Access the KernelLibrary singleton.
- * @return The KernelLibrary instance.
+
+ /**
+ * @brief Get the KernelLibrary singleton.
+ * @return The KernelLibrary instance
*/
static CLKernelLibraryEx &get();
- /** Initialises the kernel library.
- *
- * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded.
- * @param[in] context (Optional) CL context used to create programs.
- * @param[in] device (Optional) CL device for which the programs are created.
- */
- void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(),
- cl::Device device = cl::Device::getDefault())
+
+ /**
+ * @brief Initialise the kernel library.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @param[in] context CL context used to create programs.
+ * @param[in] device CL device for which the programs are created.
+ * @return N/A
+ */
+ void init(std::string kernel_path, cl::Context context, cl::Device device)
{
_kernel_path = std::move(kernel_path);
_context = std::move(context);
_device = std::move(device);
}
- /** Sets the path that the kernels reside in.
- *
- * @param[in] kernel_path Path of the kernel.
+
+ /**
+ * @brief Set the path that the kernels reside in.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @return N/A
*/
void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; };
- /** Gets the path that the kernels reside in.
+
+ /**
+ * @brief Get the path that the kernels reside in.
+ * @return the path of kernel files
*/
std::string get_kernel_path() { return _kernel_path; };
- /** Gets the source of the selected program.
- *
+
+ /**
+ * @brief Get the source of the selected program.
* @param[in] program_name Program name.
- *
* @return Source of the selected program.
*/
std::string get_program_source(const std::string &program_name);
- /** Sets the CL context used to create programs.
- *
+
+ /**
+ * @brief Set the CL context used to create programs.
* @note Setting the context also resets the device to the
* first one available in the new context.
- *
* @param[in] context A CL context.
+ * @return N/A
*/
void set_context(cl::Context context)
{
@@ -102,42 +128,56 @@ public:
}
}
- /** Accessor for the associated CL context.
- *
+ /**
+ * @brief Return associated CL context.
* @return A CL context.
*/
cl::Context &context() { return _context; }
- /** Sets the CL device for which the programs are created.
- *
+ /**
+ * @brief Set the CL device for which the programs are created.
* @param[in] device A CL device.
+ * @return N/A
*/
void set_device(cl::Device device) { _device = std::move(device); }
- /** Return the device version
- *
+ /**
+ * @brief Gets the CL device for which the programs are created.
+ * @return A CL device.
+ */
+ cl::Device &get_device() { return _device; }
+
+ /**
+ * @brief Return the device version
* @return The content of CL_DEVICE_VERSION
*/
std::string get_device_version();
- /** Creates a kernel from the kernel library.
- *
+
+ /**
+ * @brief Create a kernel from the kernel library.
* @param[in] kernel_name Kernel name.
* @param[in] build_options_set Kernel build options as a set.
- *
* @return The created kernel.
*/
Kernel create_kernel(const std::string &kernel_name,
const StringSet &build_options_set = {}) const;
- /** Find the maximum number of local work items in a workgroup can be supported for the kernel.
- *
+
+ /**
+ * @brief Find the maximum number of local work items in a workgroup can be supported for the
+ * kernel.
+ * @param[in] kernel kernel object
*/
+
size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
- /** Return the default NDRange for the device.
- *
+ /**
+ * @brief Return the default NDRange for the device.
+ * @return default NDRangeof the device
*/
cl::NDRange default_ndrange() const;
- /** Clear the library's cache of binary programs
+ /**
+ * @brief Clear the library's cache of binary programs
+ * @return N/A
*/
void clear_programs_cache()
{
@@ -145,29 +185,45 @@ public:
_built_programs_map.clear();
}
- /** Access the cache of built OpenCL programs */
+ /**
+ * @brief Access the cache of built OpenCL programs
+ * @return program map data structure of which key is name of kernel and value is
+ * kerel source name. (*.cl)
+ */
const std::map<std::string, cl::Program> &get_built_programs() const
{
return _built_programs_map;
}
- /** Add a new built program to the cache
- *
+ /**
+ * @brief Add a new built program to the cache
* @param[in] built_program_name Name of the program
* @param[in] program Built program to add to the cache
+ * @return N/A
*/
void add_built_program(const std::string &built_program_name, cl::Program program);
+ /**
+ * @brief Returns true if FP16 is supported by the CL device
+ * @return true if the CL device supports FP16
+ */
+ bool fp16_supported() const;
+
+ /**
+ * @brief Returns true if int64_base_atomics extension is supported by the CL device
+ * @return true if the CL device supports int64_base_atomics extension
+ */
+ bool int64_base_atomics_supported() const;
+
private:
- /** Load program and its dependencies.
- *
+ /**
+ * @brief Load program and its dependencies.
* @param[in] program_name Name of the program to load.
*/
const Program &load_program(const std::string &program_name) const;
- /** Concatenates contents of a set into a single string.
- *
+ /**
+ * @brief Concatenates contents of a set into a single string.
* @param[in] s Input set to concatenate.
- *
* @return Concatenated string.
*/
std::string stringify_set(const StringSet &s) const;
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h
new file mode 100644
index 000000000..dbda354d6
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OPENCLEX_H__
+#define __ARM_COMPUTE_OPENCLEX_H__
+
+#include <string>
+#include <utility>
+
+/* Configure the Khronos C++ wrapper to target OpenCL 1.2: */
+#ifndef ARM_COMPUTE_NO_EXCEPTIONS
+#define CL_HPP_ENABLE_EXCEPTIONS
+#endif // ARM_COMPUTE_NO_EXCEPTIONS
+#define CL_HPP_CL_1_2_DEFAULT_BUILD
+#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_HPP_MINIMUM_OPENCL_VERSION 110
+#include <CL/cl2.hpp>
+
+namespace arm_compute
+{
+/** Class for loading OpenCL symbols. */
+class CLSymbolsEx final
+{
+private:
+ CLSymbolsEx() = default;
+ void load_symbols(void *handle);
+
+public:
+ /** Get the static instance of CLSymbols.
+ *
+ * @return The static instance of CLSymbols.
+ */
+ static CLSymbolsEx &get();
+ /** Load symbols from the given OpenCL library path.
+ *
+ * @param[in] library Path to the OpenCL library.
+ *
+ * @return True if loading the library is successful.
+ */
+ bool load(const std::string &library);
+ /** Load symbols from any of the default OpenCL library names.
+ *
+ * @return True if loading any library is successful.
+ */
+ bool load_default();
+
+#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr
+
+ DECLARE_FUNCTION_PTR(clGetEventInfo);
+ DECLARE_FUNCTION_PTR(clSetEventCallback);
+
+#undef DECLARE_FUNCTION_PTR
+
+private:
+ std::pair<bool, bool> _loaded{false, false};
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_OPENCLEX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h
new file mode 100644
index 000000000..080cc47ef
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__
+#define __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the activation layer kernel. */
+class CLActivationLayerExKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLActivationLayerExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLActivationLayerExKernel(const CLActivationLayerExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLActivationLayerExKernel &operator=(const CLActivationLayerExKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLActivationLayerExKernel(CLActivationLayerExKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLActivationLayerExKernel &operator=(CLActivationLayerExKernel &&) = default;
+ /** Default destructor */
+ ~CLActivationLayerExKernel() = default;
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr, the activation function will be performed in-place
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will
+ * store the result
+ * of the activation function. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLActivationLayerKernel
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor
+ * will store the result
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h
new file mode 100644
index 000000000..b91a26159
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgMinMaxKernel.h
+ * @brief This file defines CLArgMinMaxKernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__
+#define __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the arg min/max kernel.
+ */
+class CLArgMinMaxKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Default constructor.
+ */
+ CLArgMinMaxKernel();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied
+ */
+ CLArgMinMaxKernel(const CLArgMinMaxKernel &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied
+ * @return Reference of this instance
+ */
+ CLArgMinMaxKernel &operator=(const CLArgMinMaxKernel &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved
+ */
+ CLArgMinMaxKernel(CLArgMinMaxKernel &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved
+ * @return Reference of this instance
+ */
+ CLArgMinMaxKernel &operator=(CLArgMinMaxKernel &&) = default;
+ /**
+ * @brief Initialise the kernel's input, output and border mode.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[out] output The output tensor, Data types supported: same as @p input.
+ * @param[in] argminmax_axis Axis to argminmax
+ * @param[in] op Arg operation (min or max) to perform
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t argminmax_axis,
+ ArgOperation op);
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLArgMinMaxKernel
+ * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] output The output tensor info, Data types supported: same as @p input.
+ * @param[in] argminmax_axis Axis to argminmax, @param[in] op Arg operation to perform
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t argminmax_axis, ArgOperation op);
+
+ /**
+ * @brief Run CLArgMinMaxKernel op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ /**
+ * @brief Run CLArgMinMaxKernel op on CPU
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run_on_cpu(cl::CommandQueue &queue);
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ uint32_t _argminmax_axis; /**< Axis the arg operation is applied along */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h
new file mode 100644
index 000000000..9a765f310
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__
+#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the arithmetic subtraction kernel (support broadcasting)
+ *
+ * Arithmetic subtraction is computed by:
+ * @f[ output(x,y) = input1(x,y) - input2(x,y) @f]
+ */
+class CLArithmeticSubtractionExKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLArithmeticSubtractionExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtractionExKernel(const CLArithmeticSubtractionExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtractionExKernel &operator=(const CLArithmeticSubtractionExKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLArithmeticSubtractionExKernel(CLArithmeticSubtractionExKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLArithmeticSubtractionExKernel &operator=(CLArithmeticSubtractionExKernel &&) = default;
+ /** Default destructor */
+ ~CLArithmeticSubtractionExKernel() = default;
+
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArithmeticSubtractionExKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h
new file mode 100644
index 000000000..1387897c9
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform BATCH_TO_SPACE_ND operation */
+class CLBatchToSpaceNDKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBatchToSpaceNDKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchToSpaceNDKernel(const CLBatchToSpaceNDKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchToSpaceNDKernel &operator=(const CLBatchToSpaceNDKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBatchToSpaceNDKernel(CLBatchToSpaceNDKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBatchToSpaceNDKernel &operator=(CLBatchToSpaceNDKernel &&) = default;
+ /** Default destructor */
+ ~CLBatchToSpaceNDKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Pointer to block size values for the spatial dimensions. */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t *block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
new file mode 100644
index 000000000..ab33d9d3a
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op */
+class CLBinaryLogicalOpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBinaryLogicalOpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input1 Source tensor1.
+ * @param[in] input2 Source tensor2.
+ * @param[out] output Output tensor.
+ * @param[in] op Binary logical operation to perform. */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
index 6bd33bf8f..4c2feb903 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
@@ -14,6 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLCastKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLCastKernel class
+ */
+
#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
#define __ARM_COMPUTE_CLCASTKERNEL_H__
@@ -23,30 +30,62 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel to perform a cast operation */
+/**
+ * @brief Class to define OpenCL kernel for cast operation
+ */
class CLCastKernel : public ICLKernel
{
public:
- /** Default constructor */
+ /**
+ * @brief Construct CLCastKernel object
+ */
CLCastKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLCastKernel(const CLCastKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLCastKernel &operator=(const CLCastKernel &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Construct CLCastKernel object using default move constructor
+ * @param[in] CLCastKernel object to move
+ */
CLCastKernel(CLCastKernel &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param[in] CLCastKernel object to move
+ */
CLCastKernel &operator=(CLCastKernel &&) = default;
- /** Default destructor */
+
+ /**
+ * @brief Destruct this CLCastKernel object
+ */
~CLCastKernel() = default;
- /** Initialise the kernel's input and output.
- *
+
+ /**
+ * @brief Initialise the kernel's input and output.
* @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
* @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @return N/A
*/
void configure(const ICLTensor *input, ICLTensor *output);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h
new file mode 100644
index 000000000..f5f455993
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__
+#define __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to apply a comparison operation to the values of two input tensors */
+class CLComparisonOpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLComparisonOpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLComparisonOpKernel(const CLComparisonOpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLComparisonOpKernel &operator=(const CLComparisonOpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLComparisonOpKernel(CLComparisonOpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLComparisonOpKernel &operator=(CLComparisonOpKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input1 Source tensor1.
+ * @param[in] input2 Source tensor2.
+ * @param[out] output Output tensor.
+ * @param[in] op Comparison operation to perform. */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
new file mode 100644
index 000000000..60ec7a82a
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a depth-to-space operation */
+class CLDepthToSpaceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLDepthToSpaceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default;
+ /** Default destructor */
+ ~CLDepthToSpaceKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size for the spatial dimensions. */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
new file mode 100644
index 000000000..da075db69
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLEmbeddingLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform EmbeddingLookup operation with opencl kernel
+*/
+class CLEmbeddingLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object
+ * */
+ CLEmbeddingLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLEmbeddingLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+ * @param[in] input Source tensor.
+ * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] lookups Lookups are 1D tensor that values are indices into the first
+ * dimension of input.
+ * Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLEmbeddingLookupKernel
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] output The output tensor info, Data types supported: same as @p input.
+ * @param[in] lookups Lookups info. Data types supported: S32.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ const ICLTensor *_lookups; /**< Lookups tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h
new file mode 100644
index 000000000..a6ea539f8
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLEXPKERNEL_H__
+#define __ARM_COMPUTE_CLEXPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform an exponential operation */
+class CLExpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLExpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLExpKernel(const CLExpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLExpKernel &operator=(const CLExpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLExpKernel(CLExpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLExpKernel &operator=(CLExpKernel &&) = default;
+ /** Default destructor */
+ ~CLExpKernel() = default;
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: F32.
+ * @param[out] output Destination tensor. Data type supported: F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEXPKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
index a51441aca..7e35a80b0 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
@@ -14,52 +14,85 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLGatherKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLGatherKernel class
+ */
+
#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__
#define __ARM_COMPUTE_CLGATHERKERNEL_H__
#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
namespace arm_compute
{
class ICLTensor;
-/** Interface for the gather kernel.
- *
+/**
+ * @brief Class to define an interface for the gather kernel.
*/
class CLGatherKernel : public ICLKernel
{
public:
- /** Default constructor.*/
+ /**
+ * @brief Construct CLGatherKernel object
+ * */
CLGatherKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLGatherKernel(const CLGatherKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLGatherKernel &operator=(const CLGatherKernel &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Construct CLGatherKernel object by using default move constructor
+ * @param[in] CLGatherKernel object to move
+ */
CLGatherKernel(CLGatherKernel &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLGatherKernel object to move
+ */
CLGatherKernel &operator=(CLGatherKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
+
+ /**
+ * @brief Initialise the kernel's input, output and border mode.
* @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
* @param[in] input2 An input tensor. Data types supported: S32.
* @param[out] output The output tensor, Data types supported: same as @p input1.
+ * @return N/A
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
* CLGatherKernel
- *
* @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
* @param[in] input2 An input tensor. Data types supported: S32.
* @param[out] output The output tensor, Data types supported: same as @p input1.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
new file mode 100644
index 000000000..c3fc15637
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLHashtableLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform HashtableLookup operation with opencl kernel
+*/
+class CLHashtableLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLHashtableLookupKernel object
+ * */
+ CLHashtableLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLHashtableLookupKernel object by using default move constructor
+ * @param[in] CLHashtableLookupKernel object to move
+ * */
+ CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLHashtableLookupKernel object to move
+ * */
+ CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLHashtableLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of
+ * input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLHashtableLookupKernel
+ * @param[in] lookups The lookups tensor info. Data types supported: S32.
+ * @param[in] keys The keys tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output The output tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup
+ * hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_lookups; /** Lookups tensor */
+ const ICLTensor *_keys; /** Keys tensor */
+ const ICLTensor *_input; /** Source tensor */
+ ICLTensor *_output; /** Destination tensor */
+ ICLTensor *_hits; /** Hits tensor */
+ std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /** Lookup indices tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
new file mode 100644
index 000000000..ccbea147e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
+#define __ARM_COMPUTE_CLNEGKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a negation operation on tensor*/
+class CLNegKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLNegKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel(const CLNegKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel &operator=(const CLNegKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLNegKernel(CLNegKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLNegKernel &operator=(CLNegKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h
new file mode 100644
index 000000000..181a6226a
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class CLNormalizationLayerExKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLNormalizationLayerExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerExKernel(const CLNormalizationLayerExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerExKernel &operator=(const CLNormalizationLayerExKernel &) = delete;
+ /** Default Move Constructor. */
+ CLNormalizationLayerExKernel(CLNormalizationLayerExKernel &&) = default;
+ /** Default move assignment operator */
+ CLNormalizationLayerExKernel &operator=(CLNormalizationLayerExKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported:
+ * F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data types supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLNormalizationLayerKernel
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported:
+ * F16/F32.
+ * @param[in] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data types supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization
+ * size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ BorderSize _border_size;
+ bool _is_in_map;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
new file mode 100644
index 000000000..eff1b8bd5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
+#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to calculate PReLU*/
+class CLPReLUKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPReLUKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPReLUKernel(const CLPReLUKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPReLUKernel(CLPReLUKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input Source tensor1.
+ * @param[in] alpha Source tensor2.
+ * @param[out] output Output tensor.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_alpha;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h
new file mode 100644
index 000000000..cbaa2adee
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h
@@ -0,0 +1,60 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform PAD operation */
+class CLPadLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPadLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel(const CLPadLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel(CLPadLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLPadLayerKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] pad_size Padding Size tensor. Data types supported : S32
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ ICLTensor *_pad_size; /**< Padding Size tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYERKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h
new file mode 100644
index 000000000..3434deee8
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__
+#define __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform tensor permutation.
+ *
+ * Permutes given a permutation vector
+ */
+class CLPermuteExKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPermuteExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPermuteExKernel(const CLPermuteExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPermuteExKernel &operator=(const CLPermuteExKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPermuteExKernel(CLPermuteExKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPermuteExKernel &operator=(CLPermuteExKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to permute. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPermuteKernel
+ *
+ * @param[in] input First tensor input info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: same as @p input.
+ * @param[in] perm Permutation vector
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ PermutationVector _perm;
+};
+} // arm_compute
+#endif /*__ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
index cd2b255bc..d579f5d8f 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
@@ -14,68 +14,106 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLPixelWiseDivisionKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLPixelWiseDivisionKernel class
+ */
+
#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
namespace arm_compute
{
class ICLTensor;
-/** Interface for the pixelwise division kernel.
- *
+/**
+ * @brief Interface for the pixelwise division kernel.
*/
class CLPixelWiseDivisionKernel : public ICLKernel
{
public:
- /** Default constructor.*/
+ /**
+ * @brief Construct a CLPixelWiseDivisionKernel object
+ */
CLPixelWiseDivisionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Construct a CLPixelWiseDivisionKernel object by using move constructor
+ * @param[in] CLPixelWiseDivisionKernel object to move
+ */
CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param[in] CLPixelWiseDivisionKernel object to move
+ */
CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+
+ /**
+ * @brief Initialise the kernel's input, output and border mode.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32.
* @param[in] input2 An input tensor. Data types supported: same as @p input1.
* @param[out] output The output tensor, Data types supported: same as @p input1. Note:
- * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after division.
* Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
* even.
+ * @return N/A
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
- /** Static function to check if given info will lead to a valid configuration of @ref
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
* CLPixelWiseDivisionKernel
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32.
* @param[in] input2 An input tensor info. Data types supported: same as @p input1.
* @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after division.
* Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
RoundingPolicy rounding_policy);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
+
+ /**
+ * @brief The size of the border for that kernel
+ * @return The width in number of elements of the border.
+ */
BorderSize border_size() const override;
private:
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
deleted file mode 100644
index a7d96cc5c..000000000
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
-#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the pixelwise division kernel.
- *
- */
-class CLReduceMaxKernel : public ICLKernel
-{
-public:
- /** Default constructor.*/
- CLReduceMaxKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLReduceMaxKernel(const CLReduceMaxKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReduceMaxKernel(CLReduceMaxKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] axis Axis to reduce
- * @param[out] output The output tensor, Data types supported: same as @p input1. Note:
- * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
- */
- void configure(const ICLTensor *input, int32_t axis, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLReduceMaxKernel
- *
- * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] axis Axis to reduce
- * @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- void run_on_cpu(cl::CommandQueue &queue);
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- int32_t _axis;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
new file mode 100644
index 000000000..a26a4a7fc
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperationKernel.h
+ * @brief This file defines CLReduceOperationKernel class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the reduce operation kernel
+ */
+class CLReduceOperationKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Default constructor
+ */
+ CLReduceOperationKernel();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Default destructor
+ */
+ ~CLReduceOperationKernel() = default;
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+ ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperationKernel.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReduceOperation op);
+
+  /**
+ * @brief Run CLReduceOperationKernel op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue CLQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ uint32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
deleted file mode 100644
index de9df3381..000000000
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
-#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel */
-class CLReductionMeanKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLReductionMeanKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReductionMeanKernel(const CLReductionMeanKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReductionMeanKernel(CLReductionMeanKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default;
- /** Default destructor */
- ~CLReductionMeanKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
- */
- void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLReductionMeanKernel.
- *
- * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
- * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
- * input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- std::vector<uint32_t> _reduction_axis;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
new file mode 100644
index 000000000..68534f1ab
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation */
+class CLSpaceToBatchNDKernel final : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToBatchNDKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToBatchNDKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Block size tensor. Data types supported: S32.
+ * @param[in] padding_size Padding size tensor. Data types supported: S32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+ ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ const ICLTensor *_block_size; /**< Block size tensor */
+ const ICLTensor *_padding_size; /**< Padding size tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
new file mode 100644
index 000000000..be845a549
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform spaceTodepth operation */
+class CLSpaceToDepthKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToDepthKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToDepthKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h
new file mode 100644
index 000000000..a4c44e35d
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__
+#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to return squared difference value of two tensors (x-y)^2*/
+class CLSquaredDifferenceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSquaredDifferenceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSquaredDifferenceKernel(const CLSquaredDifferenceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSquaredDifferenceKernel &operator=(const CLSquaredDifferenceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSquaredDifferenceKernel(CLSquaredDifferenceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSquaredDifferenceKernel &operator=(CLSquaredDifferenceKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input1 Source tensor1.
+ * @param[in] input2 Source tensor2.
+ * @param[out] output Output tensor.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h
index 248ae6635..6368c380e 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h
@@ -14,36 +14,64 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
-#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
+
+/**
+ * @file CLStridedSliceExKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLStridedSliceExKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__
#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
namespace arm_compute
{
class ICLTensor;
-/** Interface for the kernel to extract a strided slice of a tensor */
-class CLStridedSliceKernel : public ICLKernel
+/**
+* @brief Class to define an interface for the kernel to extract a strided slice of a tensor
+*/
+class CLStridedSliceExKernel : public ICLKernel
{
public:
- /** Default constructor */
- CLStridedSliceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
- /** Default destructor */
- ~CLStridedSliceKernel() = default;
- /** Set the input and output of the kernel
- *
+ /**
+ * @brief Construct a CLStridedSliceExKernel object
+ * */
+ CLStridedSliceExKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLStridedSliceExKernel(const CLStridedSliceExKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLStridedSliceExKernel &operator=(const CLStridedSliceExKernel &) = delete;
+
+ /**
+ * @brief Construct a CLStridedSliceExKernel object by using default move constructor
+ * @param[in] CLStridedSliceExKernel object to move
+ * */
+ CLStridedSliceExKernel(CLStridedSliceExKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLStridedSliceExKernel object to move
+ * */
+ CLStridedSliceExKernel &operator=(CLStridedSliceExKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLStridedSliceExKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
* @param[in] input Source tensor. Data type supported:
- * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] beginData The begin tensor. Data types supported: S32.
* The number of dimensions must be 1.
@@ -57,17 +85,17 @@ public:
* @param[in] beginMask Mask for begin
* @param[in] endMask Mask for end
* @param[in] shrinkAxisMask Mask for shrink axis.
- *
+ * @return N/A
*/
void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
int32_t shrinkAxisMask);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLStridedSliceKernel
- *
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLStridedSliceExKernel
* @param[in] input The input tensor info. Data types supported:
- * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
* @param[in] output The output tensor info, Data types supported: same as @p input1.
* @param[in] begin The begin tensor info. Data types supported: S32.
* The number of dimensions must be 1.
@@ -81,7 +109,6 @@ public:
* @param[in] beginMask Mask for begin
* @param[in] endMask Mask for end
* @param[in] shrinkAxisMask Mask for shrink axis.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output,
@@ -89,7 +116,16 @@ public:
const ITensorInfo *stride, int32_t beginMask, int32_t endMask,
int32_t shrinkAxisMask);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -103,4 +139,4 @@ private:
int32_t _shrinkAxisMask; /** Shrink axis mask */
};
} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
index 5c567f38e..eb2bad254 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -14,14 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLTopKV2Kernel.h
+ * @brief This file defines classes for TopKV2Kernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLKernel.h"
-#include <array>
-
// these parameters can be changed
#define _ITEMS 16 // number of items in a group
#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS
@@ -33,24 +37,59 @@ namespace arm_compute
{
class ICLTensor;
+/**
+ * @brief Class to define CLTopKV2Single
+ */
class CLTopKV2Single : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2Single();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ */
CLTopKV2Single(const CLTopKV2Single &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ * @return Reference of this instance
+ */
CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ */
CLTopKV2Single(CLTopKV2Single &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ * @return Reference of this instance
+ */
CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+ * @param[in] topk_values Values of the top k predictions
+ * @param[in] topk_indices Indices of the top k predictions
+ * @param[in] indices Indices
+ * @param[in] temp_stack Temp stack
+ * @param[in] k K of the top k predictions
+ * @param[in] n Number times to quick-sort
+ * return N/A
+ */
void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n);
- // Inherited methods overridden:
+ /*
+ * @brief Run CLTopKV2Single op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -59,52 +98,121 @@ private:
ICLTensor *_topk_indices;
};
+/**
+ * @brief Class to define CLTopKV2Init
+ */
class CLTopKV2Init : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2Init();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ */
CLTopKV2Init(const CLTopKV2Init &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ * @return Reference of this instance
+ */
CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ */
CLTopKV2Init(CLTopKV2Init &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ * @return Reference of this instance
+ */
CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+ * @param[in] in_key_buf Buffer of input key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[in] n Number times to quick-sort
+ * return N/A
+ */
void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n);
- // Inherited methods overridden:
+ /*
+ * @brief Run CLTopKV2Init op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
ICLTensor *_input;
};
+/**
+ * @brief Class to define CLRadixSortHistogram
+ */
class CLRadixSortHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ */
CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ */
CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * @param[in] n Integer number size to sort
+ * return N/A
+ */
void configure(cl::Buffer *hist_buf, int bits, int n);
+ /**
+ * @brief Set pass
+ * @param[in] pass Passes made of in radix sort algorithm
+ * @param[in] in_key_buf Buffer of input key
+ * return N/A
+ */
void setPass(int pass, cl::Buffer *in_key_buf)
{
_pass = pass;
_in_key_buf = in_key_buf;
}
- // Inherited methods overridden:
+ /*
+ * @brief Run CLRadixSortHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -112,82 +220,210 @@ private:
cl::Buffer *_in_key_buf;
};
+/**
+ * @brief Class to define CLRadixSortScanHistogram
+ */
class CLRadixSortScanHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortScanHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ */
CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ */
CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+ * return N/A
+ */
void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
- // Inherited methods overridden:
+ /*
+ * @brief Run CLRadixSortScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
};
+/**
+ * @brief Class to define CLRadixSortGlobalScanHistogram
+ */
class CLRadixSortGlobalScanHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortGlobalScanHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ */
CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ */
CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * return N/A
+ */
void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits);
- // Inherited methods overridden:
+ /*
+ * @brief Run CLRadixSortGlobalScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
};
+/**
+ * @brief Class to define CLRadixSortPasteHistogram
+ */
class CLRadixSortPasteHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortPasteHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ */
CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ */
CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+ * return N/A
+ */
void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
- // Inherited methods overridden:
+ /*
+ * @brief Run CLRadixSortPasteHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
};
+/**
+ * @brief Class to define CLRadixSortReorder
+ */
class CLRadixSortReorder : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortReorder();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ */
CLRadixSortReorder(const CLRadixSortReorder &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ * @return Reference of this instance
+ */
CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ */
CLRadixSortReorder(CLRadixSortReorder &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ * @return Reference of this instance
+ */
CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * @param[in] n Integer number size to sort
+ * return N/A
+ */
void configure(cl::Buffer *hist_buf, int bits, int n);
+ /**
+ * @brief Set pass
+   * @param[in] pass Pass index in the radix sort algorithm
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+   * @return N/A
+ */
void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
cl::Buffer *out_ind_buf)
{
@@ -197,7 +433,12 @@ public:
_in_ind_buf = in_ind_buf;
_out_ind_buf = out_ind_buf;
}
- // Inherited methods overridden:
+  /**
+ * @brief Run CLRadixSortReorder op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -208,47 +449,115 @@ private:
cl::Buffer *_out_ind_buf;
};
+/**
+ * @brief Class to define CLTopKV2FindFirstNegative
+ */
class CLTopKV2FindFirstNegative : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2FindFirstNegative();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ */
CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ * @return Reference of this instance
+ */
CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ */
CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ * @return Reference of this instance
+ */
CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+ * @param[in] n Number times to find
+   * @return N/A
+ */
void configure(cl::Buffer *first_negative_idx_buf, int n);
+ /**
+ * @brief Set output buffer
+ * @param[out] out_key_buf Buffer of output key
+   * @return N/A
+ */
void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; }
- // Inherited methods overridden:
+  /**
+ * @brief Run CLTopKV2FindFirstNegative op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
cl::Buffer *_out_key_buf;
};
+/**
+ * @brief Class to define CLTopKV2ReorderNegatives
+ */
class CLTopKV2ReorderNegatives : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2ReorderNegatives();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ */
CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ * @return Reference of this instance
+ */
CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ */
CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ * @return Reference of this instance
+ */
CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+ * @param[in] n Number times to find
+   * @return N/A
+ */
void configure(cl::Buffer *first_negative_idx_buf, int n);
+ /**
+ * @brief Set buffers
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+   * @return N/A
+ */
void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
cl::Buffer *out_ind_buf)
{
@@ -258,7 +567,12 @@ public:
_out_ind_buf = out_ind_buf;
}
- // Inherited methods overridden:
+  /**
+ * @brief Run CLTopKV2ReorderNegatives op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -268,25 +582,63 @@ private:
cl::Buffer *_out_ind_buf;
};
+/**
+ * @brief Class to define CLTopKV2Store
+ */
class CLTopKV2Store : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2Store();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ */
CLTopKV2Store(const CLTopKV2Store &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ * @return Reference of this instance
+ */
CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ */
CLTopKV2Store(CLTopKV2Store &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ * @return Reference of this instance
+ */
CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] values Values tensor to store
+ * @param[out] indices Indices tensor to be used for store
+ * @param[in] k K of the top k predictions
+ * @param[in] n Number times to store
+   * @return N/A
+ */
void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
+ /**
+ * @brief Set buffers
+ * @param[out] out_key_buf Buffer of output key
+ * @param[out] out_ind_buf Buffer of output index
+   * @return N/A
+ */
void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
- // Inherited methods overridden:
+  /**
+ * @brief Run CLTopKV2Store op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
diff --git a/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h
new file mode 100644
index 000000000..f7bf72985
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class NENormalizationLayerExKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NENormalizationLayerKernel"; }
+ /** Default constructor */
+ NENormalizationLayerExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerExKernel(const NENormalizationLayerExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerExKernel &operator=(const NENormalizationLayerExKernel &) = delete;
+ /** Default Move Constructor. */
+ NENormalizationLayerExKernel(NENormalizationLayerExKernel &&) = default;
+ /** Default move assignment operator */
+ NENormalizationLayerExKernel &operator=(NENormalizationLayerExKernel &&) = default;
+ /** Default destructor */
+ ~NENormalizationLayerExKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types
+ * supported: FP16/F32.
+ * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a
+ * single input with dimensions [width, height, IFM],
+ * Data type supported: same as @p input
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data type supported: same as @p input
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(const ITensor *input, const ITensor *input_squared, ITensor *output,
+ NormalizationLayerInfo norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NENormalizationLayerKernel
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types
+ * supported: FP16/F32.
+ * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a
+ * single input with dimensions [width, height, IFM],
+ * Data type supported: same as @p input
+ * @param[in] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data type supported: same as @p input
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared,
+ const ITensorInfo *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Function to perform normalization depending on the given template
+ * dimension. The second template parameter specifies whether the
+ * normalization has to be 1D or 2D.
+ *
+ * @note Only supported normalizations are:
+ * - 1D over X or Z
+ * - 2D over X and Y
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ template <DataType dt, unsigned int dim, bool do_2D_norm>
+ void normalize_float(const Window &window);
+
+ /** Common signature for all the specialised normalization functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using NormalizationFunctionEx = void (NENormalizationLayerExKernel::*)(const Window &window);
+
+private:
+ NormalizationFunctionEx _func;
+ const ITensor *_input;
+ const ITensor *_input_squared;
+ ITensor *_output;
+ NormalizationLayerInfo _norm_info;
+ BorderSize _border_size;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/TypesEx.h b/libs/ARMComputeEx/arm_compute/core/TypesEx.h
new file mode 100644
index 000000000..8381f1cc6
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/TypesEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_TYPESEX_H__
+#define __ARM_COMPUTE_TYPESEX_H__
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+
+/** Available ArgIndex operations **/
+enum class ArgOperation
+{
+ MAX,
+ MIN,
+};
+
+/** Available reduce operations */
+enum class ReduceOperation
+{
+ MAX, /**< Max */
+ MEAN, /**< Mean */
+ SUM, /**< Sum */
+ MIN, /**< Min */
+};
+
+/** Available binary logical operations */
+enum class BinaryLogicalOperation
+{
+ AND, /**< AND */
+ OR, /**< OR */
+};
+
+enum class ComparisonOperation
+{
+ EQUAL, /**< EQUAL */
+ NOT_EQUAL, /**< NOT_EQUAL */
+};
+
+/** Activation Layer Information class */
+class ActivationLayerInfoEx
+{
+public:
+ /** Available activation functions */
+ enum class ActivationFunction
+ {
+    RSQRT /**< Inverse Square root ( \f$ f(x) = 1 / \sqrt{x} \f$ )*/
+ };
+
+ ActivationLayerInfoEx() = default;
+ /** Default Constructor
+ *
+ * @param[in] f The activation function to use.
+ * @param[in] a (Optional) The alpha parameter used by some activation functions
+ * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU,
+ * @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
+ * @param[in] b (Optional) The beta parameter used by some activation functions (@ref
+ * ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref
+ * ActivationFunction::TANH).
+ */
+ ActivationLayerInfoEx(ActivationFunction f, float a = 0.0f, float b = 0.0f)
+ : _act(f), _a(a), _b(b), _enabled(true)
+ {
+ }
+ /** Get the type of activation function */
+ ActivationFunction activation() const { return _act; }
+ /** Get the alpha value */
+ float a() const { return _a; }
+ /** Get the beta value */
+ float b() const { return _b; }
+ /** Check if initialised */
+ bool enabled() const { return _enabled; }
+
+private:
+ ActivationFunction _act = {ActivationLayerInfoEx::ActivationFunction::RSQRT};
+ float _a = {};
+ float _b = {};
+ bool _enabled = {false};
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TYPESEX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/UtilsEx.h b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h
new file mode 100644
index 000000000..8dd68a0c3
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_UTILSEX_H__
+#define __ARM_COMPUTE_UTILSEX_H__
+
+#include "arm_compute/core/TypesEx.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <sstream>
+#include <string>
+
+namespace arm_compute
+{
+/** Translates a given activation function to a string.
+ *
+ * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
+ *
+ * @return The string describing the activation function.
+ */
+const std::string &string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act);
+}
+#endif /*__ARM_COMPUTE_UTILSEX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h
new file mode 100644
index 000000000..7e578550f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLActivationLayerExKernel
+ *
+ * @note The function simulates an activation layer with the specified activation function.
+ */
+class CLActivationLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr or is equal to the input, the activation function will
+ * be performed in-place
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will
+ * store the result
+ * of the activation function. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] act_info Activation layer parameters.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLActivationLayer
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor
+ * will store the result
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info);
+};
+}
+#endif /* __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h
new file mode 100644
index 000000000..8044c58af
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgMinMax.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLArgMinMax class
+ */
+
+#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_H__
+#define __ARM_COMPUTE_CLARG_MIN_MAX_H__
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute CLArgMinMax operation
+ */
+class CLArgMinMax : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLArgMinMax object
+ */
+ CLArgMinMax();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgMinMax(const CLArgMinMax &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgMinMax &operator=(const CLArgMinMax &) = delete;
+
+  /**
+   * @brief Construct a new CLArgMinMax object by using move constructor
+   * @param[in] CLArgMinMax object to move
+   */
+  CLArgMinMax(CLArgMinMax &&) = default;
+
+ /**
+ * @brief Assign a CLArgMinMax object.
+ * @param[in] CLArgMinMax object to assign. This object will be moved.
+ */
+ CLArgMinMax &operator=(CLArgMinMax &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[out] output The result of the ArgMin/ArgMax operation. Data types supported: same as @p
+   * input.
+   * @param[in] argminmax_axis Axes to perform the operation on. Must be sorted with no duplicates.
+   * @param[in] op ArgOperation::MIN for ArgMin, ArgOperation::MAX for ArgMax.
+   * @return N/A
+   */
+ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> argminmax_axis,
+ ArgOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[in] argminmax_axis Axes to perform the operation on. Must be sorted with no duplicates.
+   * @param[out] output The result of the ArgMin/ArgMax operation. Data types supported: same as @p
+   * input.
+   * @param[in] op ArgOperation::MIN for ArgMin, ArgOperation::MAX for ArgMax.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis,
+ const ITensorInfo *output, ArgOperation op);
+
+ /**
+ * @brief Run the kernels contained in the function
+   * This operation works on CPU or GPU depending on the value of argminmax_MAX_RUN_ON_CPU macro
+ * in CLArgMinMax.cpp.
+ * If argminmax_MAX_RUN_ON_CPU == 1, CPU runs this operation.
+ * Otherwise GPU runs this operation.
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::vector<uint32_t> _argminmax_axis;
+ ArgOperation _arg_op;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLArgMinMaxKernel[]> _argminmax_kernels{nullptr};
+ size_t _num_of_kernels;
+};
+}
+#endif /* __ARM_COMPUTE_CLARG_MIN_MAX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
new file mode 100644
index 000000000..34e6c6334
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__
+#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLArithmeticSubtractionExKernel
+ *
+ * @note The tensor data type for the inputs must be U8/S16/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class CLArithmeticSubtractionEx : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and convertion policy.
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArithmeticSubtractionEx
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy);
+};
+}
+#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
new file mode 100644
index 000000000..d16a0762d
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBatchToSpaceNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the tensor of the output tensor's type.
+ */
+class CLBatchToSpaceND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size A pointer to an array of integer values specifying block sizes
+ * for spatial dimension.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
new file mode 100644
index 000000000..061e34f26
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLBinaryLogicalOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8.
+   * @param[in] input2 Source tensor2. Data types supported: U8, QASYMM8.
+ * @param[out] output Output tensor. Data types supported: U8, QASYMM8.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
index 63050067d..56b8408e2 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
@@ -14,30 +14,35 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLCast.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLCast class
+ */
+
#ifndef __ARM_COMPUTE_CLCAST_H__
#define __ARM_COMPUTE_CLCAST_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLCastKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the tensor of the output tensor's type.
+/**
+ * @brief Class to run @ref CLCastKernel.
+ * This converts the input tensor to the tensor of the output tensor's type.
*/
class CLCast : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * The input tensor is [in, out] because its TensorInfo might be modified
- * inside the kernel.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ /**
+ * @brief Initialise the kernel's input and output
+ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
*/
void configure(ICLTensor *input, ICLTensor *output);
};
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h
new file mode 100644
index 000000000..1b0d70e7f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_H__
+#define __ARM_COMPUTE_CLCOMPARISON_OP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLComparisonOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] input2 Source tensor2. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
new file mode 100644
index 000000000..d78a6ada4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLDepthToSpaceKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the tensor of the output tensor's type.
+ */
+class CLDepthToSpace : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size (integer only)
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..257772a89
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of
+ * input.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+};
+}
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h
new file mode 100644
index 000000000..2d0fc23a4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLEXP_H__
+#define __ARM_COMPUTE_CLEXP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLExpKernel */
+class CLExp : public ICLSimpleFunction
+{
+public:
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: F32.
+ * @param[out] output Destination tensor. Data type supported: F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLEXP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
index 3ae7afe14..f7fd3cda1 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
@@ -14,32 +14,43 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLGather.h
+ * @brief This file contains CLGather class
+ * @ingroup COM_AI_RUNTIME
+ */
+
#ifndef __ARM_COMPUTE_CLGATHER_H__
#define __ARM_COMPUTE_CLGATHER_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLGatherKernel. */
+/**
+ * @brief Class to run @ref CLGatherKernel.
+ */
class CLGather : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
- */
+ /**
+ * @brief Initialise the kernel's inputs, output and convertion policy.
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indexes tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ * @return N/A
+ */
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGather
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * of @ref CLGather
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indexes tensor. Data types supported: S32.
+ * @param[in] output The output tensor, Data types supported: same as @p input1.
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
new file mode 100644
index 000000000..65aa6cbd5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class CLHashtableLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of
+ * input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+  void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+};
+}
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
new file mode 100644
index 000000000..198a0fd4e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEG_H__
+#define __ARM_COMPUTE_CLNEG_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLNeg : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEG_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h
new file mode 100644
index 000000000..4077245d5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute a normalization layer. This function calls the following CL kernels:
+ *
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLNormalizationLayerExKernel
+ *
+ */
+class CLNormalizationLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLNormalizationLayerEx();
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types
+ * supported: F16/F32 (Written to by the border handler)
+ * @param[out] output Destination tensor. Dimensions, data type and number of channels must
+ * match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLNormalizationLayer
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported:
+ * F16/F32
+ * @param[in] output Destination tensor. Dimensions, data type and number of channels must
+ * match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization
+ * size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLNormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel to run */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle borders */
+};
+}
+#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
new file mode 100644
index 000000000..622a61b5e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_H__
+#define __ARM_COMPUTE_CLPRELU_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLPReLU : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+   * @param[in]  input  Source tensor. Data types supported:
+   *                    QASYMM8/F16/F32.
+   * @param[in]  alpha  Alpha tensor. Data types supported:
+   *                    QASYMM8/F16/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_H__*/
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
new file mode 100644
index 000000000..d6ea486d1
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
@@ -0,0 +1,47 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYEREX_H__
+#define __ARM_COMPUTE_CLPADLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPadLayerKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the tensor of the output tensor's type.
+ */
+class CLPadLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] pad_size Tensor for Padding values in NHWC format shape [n, 2],
+ * where n is the rank of tensor . Data types supported: S32
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYEREX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h
new file mode 100644
index 000000000..9a0cc213c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPERMUTEEX_H__
+#define __ARM_COMPUTE_CLPERMUTEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute an @ref CLPermuteKernel. */
+class CLPermuteEx : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input The input tensor to permute. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+   * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPermute.
+ *
+ * @param[in] input First tensor input info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: same as @p input.
+ * @param[in] perm Permutation vector
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm);
+};
+}
+#endif /*__ARM_COMPUTE_CLPERMUTEEX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
index c1383e21f..b142d3a2e 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
@@ -14,53 +14,61 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLPixelWiseDivision.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLPixelWiseDivision class
+ */
#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLPixelWiseDivisionKernel. */
+/**
+ * @brief Class to run @ref CLPixelWiseDivisionKernel.
+ */
class CLPixelWiseDivision : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ /**
+ * @brief Initialise the kernel's inputs, output and convertion policy.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
* The input tensor is [in, out] because its TensorInfo might be
* modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be
* modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output The output tensor, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or
- * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
* even.
+ * @return N/A
*/
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
- /** Static function to check if given info will lead to a valid configuration of @ref
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
* CLPixelWiseDivision
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
* @param[in] input2 An input tensor info. Data types supported: same as @p input1.
* @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
deleted file mode 100644
index 14b473f33..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
-#define __ARM_COMPUTE_CLREDUCE_MAX_H__
-
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLTopKV2Kernel
- */
-class CLReduceMax : public IFunction
-{
-public:
- /** Constructor */
- CLReduceMax();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReduceMax(const CLReduceMax &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReduceMax &operator=(const CLReduceMax &) = delete;
- /** Allow instances of this class to be moved */
- CLReduceMax(CLReduceMax &&) = default;
- /** Allow instances of this class to be moved */
- CLReduceMax &operator=(CLReduceMax &&) = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations
- * is limited to the given array size.
- *
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32.
- */
- void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLPixelWiseDivision
- *
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32. *
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- void run_on_cpu();
-
- int32_t _axis;
-
- ICLTensor *_input;
- ICLTensor *_output;
-
- std::unique_ptr<ICLKernel> _kernel;
-};
-}
-#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
new file mode 100644
index 000000000..e1a6f6ab4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLReduceOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform ReduceOperation
+ */
+class CLReduceOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new ReduceOperation object
+ */
+ CLReduceOperation();
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
+ ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperation.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, const ReduceOperation &op);
+
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::set<uint32_t> _axis;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
deleted file mode 100644
index 2081518c1..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__
-#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__
-
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Perform reduction operation.
- */
-class CLReductionMean : public IFunction
-{
-public:
- /** Default Constructor.
- */
- CLReductionMean();
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
- */
- void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLReductionMean.
- *
- * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
- * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
- * input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLReductionMeanKernel _reduction_mean_kernel;
- CLFillBorderKernel _fill_border_kernel;
-};
-}
-#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
new file mode 100644
index 000000000..7e2df8986
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToBatchNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32.
+ * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape
+ * block_shape, and interleaves these blocks with the "batch" dimension in the output.
+ */
+class CLSpaceToBatchND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Tensor of integer values specifying block sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[out] output Output tensor. Data types supported: same as @p input.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+ ICLTensor *output);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
new file mode 100644
index 000000000..17f762092
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToDepthKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges blocks of spatial data into the depth (channel) dimension.
+ */
+class CLSpaceToDepth : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in] block_size Block size; integer only
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
new file mode 100644
index 000000000..3610ba71c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__
+#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLSquaredDifference : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] input2 Source tensor2. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__*/
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
deleted file mode 100644
index f223a79be..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__
-#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLStridedSliceKernel */
-class CLStridedSlice : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] input First tensor input. Data type supported:
- * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-};
-
-class CLStridedSliceCPU : public IFunction
-{
-public:
- /** Initialise inputs and outputs
- *
- * @param[in] input First tensor input.
- * @param[out] output Output tensor.
- */
- void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData,
- ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-
- void run() override;
-
-private:
- void run_on_cpu();
-
- ICLTensor *_input;
- ICLTensor *_output;
- ICLTensor *_beginData;
- ICLTensor *_endData;
- ICLTensor *_stridesData;
- int32_t _beginMask;
- int32_t _endMask;
- int32_t _shrinkAxisMask;
-};
-}
-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
new file mode 100644
index 000000000..6b26a85c8
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLStridedSliceEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains the arm_compute::CLStridedSliceEx class
+ */
+
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLStridedSliceKernel
+ */
+class CLStridedSliceEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs and outputs
+ * @param[in] input Tensor input. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] beginData 'begin' vector of strided slice operation
+ * @param[in] endData 'end' vector of strided slice operation
+ * @param[in] stridesData 'strides' vector of strided slice operation
+ * @param[in] beginMask If the ith bit is set, begin[i] is ignored
+ * @param[in] endMask If the ith bit is set, end[i] is ignored
+ * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
+ * dimensionality by 1, taking on the value at index begin[i]
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+};
+}
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
index 06cd1ee9b..5327e016f 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -14,51 +14,79 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLTopKV2.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLTopKV2 class
+ */
#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
#define __ARM_COMPUTE_CLTOPK_V2_H__
#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLTopKV2Kernel
+/**
+ * @brief Class to execute TopKV2 operation.
*/
class CLTopKV2 : public IFunction
{
public:
- /** Constructor */
+ /**
+ * @brief Construct a new CLTopKV2 object
+ */
CLTopKV2();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLTopKV2(const CLTopKV2 &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLTopKV2 &operator=(const CLTopKV2 &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+   * @brief Construct a new CLTopKV2 object by using move constructor
+ * @param[in] CLTopKV2 object to move
+ */
CLTopKV2(CLTopKV2 &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Assign a CLTopKV2 object.
+ * @param[in] CLTopKV2 object to assign. This object will be moved.
+ */
CLTopKV2 &operator=(CLTopKV2 &&) = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations
- * is limited to the given array size.
- *
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
* @param[in] input Input image. Data types supported: U8/S16/F32.
* @param[in] k The value of `k`.
* @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
* input type is F32.
- * @param[out] indices indices related to top k values. Data types supported: S32 if input type
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
* is U8/S16, F32 if input type is F32.
+ * @return N/A
*/
void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
int total_bits = 32, int bits = 4);
- // Inherited methods overridden:
+ /**
+ * @brief Run the kernels contained in the function
+ * Depending on the value of the following environment variables it works differently:
+ * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE",
+ * quick sort on GPU is used.
+   * - If the value of environment variable "ACL_TOPKV2" == "GPU",
+ * radix sort on GPU is used.
+ * - For other value, TopKV2 runs on CPU
+ * @return N/A
+ */
void run() override;
private:
diff --git a/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h
new file mode 100644
index 000000000..fa7408ecd
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h"
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to compute a normalization layer. This function calls the following NEON kernels:
+ *
+ * -# @ref NEPixelWiseMultiplicationKernel
+ * -# @ref NEFillBorderKernel
+ * -# @ref NENormalizationLayerKernelEx
+ *
+ */
+class NENormalizationLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data type supported:
+ * F16/F32
+ * @param[out] output Destination with the same dimensions, data type and number of channels of
+ * @p input
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NENormalizationLayer
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data type supported:
+ * F16/F32
+ * @param[in] output Destination with the same dimensions, data type and number of channels of
+ * @p input
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization
+ * size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group; /**< Function memory group */
+ NENormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel */
+ NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
+ NEFillBorderKernel _border_handler; /**< Kernel to handle borders */
+ Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
+};
+}
+#endif /* __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ */
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index d535c5da4..05ecdeb22 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -38,255 +38,37 @@
using namespace arm_compute;
const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
- {"absdiff", "absdiff.cl"},
- {"accumulate", "accumulate.cl"},
- {"accumulate_squared", "accumulate.cl"},
- {"accumulate_weighted", "accumulate.cl"},
- {"activation_layer", "activation_layer.cl"},
- {"activation_layer_qa8", "activation_layer_qa8.cl"},
- {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"},
- {"arithmetic_add", "arithmetic_op.cl"},
- {"arithmetic_sub", "arithmetic_op.cl"},
+ // ARMComputeEx kernels
+ {"activation_layer_ex", "activation_layer_ex.cl"},
+ {"arg_op", "arg_operation.cl"},
+ {"arithmetic_sub_ex", "arithmetic_op_ex.cl"},
{"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
- {"batchnormalization_layer_nchw", "batchnormalization_layer.cl"},
- {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"},
- {"bitwise_or", "bitwise_op.cl"},
- {"bitwise_and", "bitwise_op.cl"},
- {"bitwise_xor", "bitwise_op.cl"},
- {"bitwise_not", "bitwise_op.cl"},
+ {"batch_to_space_nd", "batch_to_space_nd.cl"},
+ {"binary_logical_op", "binary_logical_op.cl"},
{"cast", "cast.cl"},
{"cast_qasymm_in", "cast.cl"},
{"cast_qasymm_out", "cast.cl"},
- {"channel_combine_NV", "channel_combine.cl"},
- {"channel_combine_RGB888", "channel_combine.cl"},
- {"channel_combine_RGBA8888", "channel_combine.cl"},
- {"channel_combine_UYVY422", "channel_combine.cl"},
- {"channel_combine_YUYV422", "channel_combine.cl"},
- {"channel_shuffle_nchw", "channel_shuffle.cl"},
- {"channel_extract_NV12", "channel_extract.cl"},
- {"channel_extract_NV21", "channel_extract.cl"},
- {"channel_extract_RGB888", "channel_extract.cl"},
- {"channel_extract_RGBA8888", "channel_extract.cl"},
- {"channel_extract_UYVY422", "channel_extract.cl"},
- {"channel_extract_YUYV422", "channel_extract.cl"},
- {"combine_gradients_L1", "canny.cl"},
- {"combine_gradients_L2", "canny.cl"},
- {"concatenate_depth", "concatenate.cl"},
- {"concatenate_width", "concatenate.cl"},
- {"convolution_rectangle", "convolution_rectangle.cl"},
- {"col2im", "col2im.cl"},
- {"convert_depth_down", "depth_convert.cl"},
- {"convert_depth_up", "depth_convert.cl"},
- {"convert_fc_weights", "convert_fc_weights.cl"},
- {"convolution3x3_static", "convolution3x3.cl"},
- {"convolution5x5_static", "convolution5x5.cl"},
- {"convolution7x7_static", "convolution7x7.cl"},
- {"convolution9x9_static", "convolution9x9.cl"},
- {"convolution_separable1x5_static", "convolution5x5.cl"},
- {"convolution_separable5x1_static", "convolution5x5.cl"},
- {"convolution_separable1x7_static", "convolution7x7.cl"},
- {"convolution_separable7x1_static", "convolution7x7.cl"},
- {"convolution_separable1x9_static", "convolution9x9.cl"},
- {"convolution_separable9x1_static", "convolution9x9.cl"},
- {"copy_tensor", "copy_tensor.cl"},
- {"copy_plane", "channel_extract.cl"},
- {"copy_planes_3p", "channel_combine.cl"},
- {"copy_to_keypoint", "fast_corners.cl"},
- {"deconvolution_upsample", "deconvolution_layer.cl"},
- {"depthwise_convolution_3x3", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"},
- {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"},
- {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"},
- {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"},
- {"depthwise_im2col", "depthwise_convolution.cl"},
- {"depthwise_vector_to_tensor", "depthwise_convolution.cl"},
- {"depthwise_weights_reshape", "depthwise_convolution.cl"},
- {"dequantization_layer", "dequantization_layer.cl"},
- {"derivative", "derivative.cl"},
- {"dilate", "dilate.cl"},
- {"direct_convolution1x1", "direct_convolution1x1.cl"},
- {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"},
- {"direct_convolution3x3", "direct_convolution3x3.cl"},
- {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"},
- {"direct_convolution5x5", "direct_convolution5x5.cl"},
- {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"},
- {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
- {"erode", "erode.cl"},
- {"fast_corners", "fast_corners.cl"},
- {"fill_image_borders_constant", "fill_border.cl"},
- {"fill_image_borders_replicate", "fill_border.cl"},
- {"finalize", "optical_flow_pyramid_lk.cl"},
- {"floor_layer", "floor.cl"},
+ {"comparison_op", "comparison_op.cl"},
+ {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
+ {"depth_to_space", "depth_to_space.cl"},
+ {"embedding_lookup", "embedding_lookup.cl"},
+ {"exp_layer", "exp.cl"},
{"gather", "gather.cl"},
{"gather_1d", "gather.cl"},
{"gather_1d_out", "gather.cl"},
- {"gaussian1x5_sub_x", "gaussian_pyramid.cl"},
- {"gaussian5x1_sub_y", "gaussian_pyramid.cl"},
- {"gemm_accumulate_biases", "gemm.cl"},
- {"gemm_interleave4x4", "gemm.cl"},
- {"gemm_ma_f16", "gemm.cl"},
- {"gemm_ma_f32", "gemm.cl"},
- {"gemm_ma_qs8", "gemm.cl"},
- {"gemm_ma_qs16", "gemm.cl"},
- {"gemm_mv", "gemv.cl"},
- {"gemm_mv_quantized", "gemv.cl"},
- {"gemm_mm_interleaved_transposed_f16", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_f32", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"},
- {"gemm_mm_floating_point", "gemm.cl"},
- {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"},
- {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"},
- {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"},
- {"gemm_mm_qs8", "gemm.cl"},
- {"gemm_mm_qs16", "gemm.cl"},
- {"gemm_lc_vm_f32", "gemm.cl"},
- {"gemm_transpose1xW", "gemm.cl"},
- {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"},
- {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"},
- {"gemmlowp_mm_bifrost", "gemmlowp.cl"},
- {"gemmlowp_mm_midgard", "gemmlowp.cl"},
- {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"},
- {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"},
- {"gemmlowp_offset_contribution", "gemmlowp.cl"},
- {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"},
- {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"},
- {"harris_score_3x3", "harris_corners.cl"},
- {"harris_score_5x5", "harris_corners.cl"},
- {"harris_score_7x7", "harris_corners.cl"},
- {"hist_border_kernel", "histogram.cl"},
- {"hist_border_kernel_fixed", "histogram.cl"},
- {"hist_local_kernel", "histogram.cl"},
- {"hist_local_kernel_fixed", "histogram.cl"},
- {"hog_block_normalization", "hog.cl"},
- {"hog_detector", "hog.cl"},
- {"hog_orientation_binning", "hog.cl"},
- {"hysteresis", "canny.cl"},
- {"im2col1x1_stridex1_dchw", "im2col.cl"},
- {"im2col3x3_dchw", "im2col.cl"},
- {"im2col5x5_dchw", "im2col.cl"},
- {"im2col11x11_padx0_pady0_dchw", "im2col.cl"},
- {"im2col_generic_dchw", "im2col.cl"},
- {"im2col_generic_padx0_pady0_dchw", "im2col.cl"},
- {"im2col_reduced_dchw", "im2col.cl"},
- {"init_level", "optical_flow_pyramid_lk.cl"},
- {"init_level_max", "optical_flow_pyramid_lk.cl"},
- {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"},
- {"integral_horizontal", "integral_image.cl"},
- {"integral_vertical", "integral_image.cl"},
- {"IYUV_to_NV12_bt709", "color_convert.cl"},
- {"IYUV_to_RGB888_bt709", "color_convert.cl"},
- {"IYUV_to_RGBA8888_bt709", "color_convert.cl"},
- {"IYUV_to_YUV444_bt709", "color_convert.cl"},
- {"l2_normalize", "l2_normalize.cl"},
- {"lktracker_stage0", "optical_flow_pyramid_lk.cl"},
- {"lktracker_stage1", "optical_flow_pyramid_lk.cl"},
- {"magnitude_phase", "magnitude_phase.cl"},
- {"mean_stddev_accumulate", "mean_stddev.cl"},
- {"minmax", "minmaxloc.cl"},
- {"minmax_border", "minmaxloc.cl"},
- {"minmax_layer", "minmax_layer.cl"},
- {"minmaxloc", "minmaxloc.cl"},
- {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"},
- {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"},
- {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"},
- {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"},
- {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"},
- {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"},
- {"non_max_suppression", "nonmax.cl"},
- {"normalization_layer_cross_map", "normalization_layer.cl"},
- {"normalization_layer_in_map", "normalization_layer.cl"},
- {"NV12_to_IYUV_bt709", "color_convert.cl"},
- {"NV12_to_RGB888_bt709", "color_convert.cl"},
- {"NV12_to_RGBA8888_bt709", "color_convert.cl"},
- {"NV12_to_YUV444_bt709", "color_convert.cl"},
- {"NV21_to_IYUV_bt709", "color_convert.cl"},
- {"NV21_to_RGB888_bt709", "color_convert.cl"},
- {"NV21_to_RGBA8888_bt709", "color_convert.cl"},
- {"NV21_to_YUV444_bt709", "color_convert.cl"},
- {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
- {"permute_201", "permute.cl"},
- {"permute_120", "permute.cl"},
- {"permute_3201", "permute.cl"},
- {"pixelwise_mul_float", "pixelwise_mul_float.cl"},
- {"pixelwise_mul_int", "pixelwise_mul_int.cl"},
+ {"hashtable_lookup", "hashtable_lookup.cl"},
+ {"neg_tensor", "neg_tensor.cl"},
+ {"pad", "pad.cl"},
+ {"permute_generic", "permute_ex.cl"},
{"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
{"pixelwise_div_float", "pixelwise_div_float.cl"},
{"pixelwise_div_int", "pixelwise_div_int.cl"},
- {"pooling_layer_2", "pooling_layer.cl"},
- {"pooling_layer_3", "pooling_layer.cl"},
- {"pooling_layer_optimized_3", "pooling_layer.cl"},
- {"pooling_layer_7", "pooling_layer.cl"},
- {"pooling_layer_MxN_nchw", "pooling_layer.cl"},
- {"pooling_layer_MxN_nhwc", "pooling_layer.cl"},
- {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"},
- {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
- {"quantization_layer", "quantization_layer.cl"},
- {"reduce_max", "reduce_max.cl"},
- {"reduction_operation", "reduction_operation.cl"},
- {"reduction_mean", "reduction_mean.cl"},
- {"remap_nearest_neighbour", "remap.cl"},
- {"remap_bilinear", "remap.cl"},
- {"reshape_layer", "reshape_layer.cl"},
- {"reshape_to_columns", "convolution_layer.cl"},
- {"RGB888_to_IYUV_bt709", "color_convert.cl"},
- {"RGB888_to_NV12_bt709", "color_convert.cl"},
- {"RGB888_to_RGBA8888_bt709", "color_convert.cl"},
- {"RGB888_to_YUV444_bt709", "color_convert.cl"},
- {"RGBA8888_to_IYUV_bt709", "color_convert.cl"},
- {"RGBA8888_to_NV12_bt709", "color_convert.cl"},
- {"RGBA8888_to_RGB888_bt709", "color_convert.cl"},
- {"RGBA8888_to_YUV444_bt709", "color_convert.cl"},
- {"roi_pooling_layer", "roi_pooling_layer.cl"},
- {"scale_nearest_neighbour", "scale.cl"},
- {"scale_bilinear", "scale.cl"},
- {"scharr3x3", "scharr_filter.cl"},
- {"sobel3x3", "sobel_filter.cl"},
- {"sobel_separable5x1", "sobel_filter.cl"},
- {"sobel_separable1x5", "sobel_filter.cl"},
- {"sobel_separable7x1", "sobel_filter.cl"},
- {"sobel_separable1x7", "sobel_filter.cl"},
- {"softmax_layer_norm", "softmax_layer.cl"},
- {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"},
- {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"},
- {"strided_slice", "strided_slice.cl"},
- {"suppress_non_maximum", "canny.cl"},
- {"tablelookup_U8", "tablelookup.cl"},
- {"tablelookup_S16", "tablelookup.cl"},
- {"threshold_binary", "threshold.cl"},
- {"threshold_range", "threshold.cl"},
- {"transpose", "transpose.cl"},
- {"UYVY422_to_IYUV_bt709", "color_convert.cl"},
- {"UYVY422_to_NV12_bt709", "color_convert.cl"},
- {"UYVY422_to_RGB888_bt709", "color_convert.cl"},
- {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"},
- {"warp_affine_nearest_neighbour", "warp_affine.cl"},
- {"warp_affine_bilinear", "warp_affine.cl"},
- {"warp_perspective_nearest_neighbour", "warp_perspective.cl"},
- {"warp_perspective_bilinear", "warp_perspective.cl"},
- {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"},
- {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"},
- {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"},
- {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"},
- {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"},
- {"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"},
- {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"},
- {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"},
- {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"},
- {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"},
- {"YUYV422_to_IYUV_bt709", "color_convert.cl"},
- {"YUYV422_to_NV12_bt709", "color_convert.cl"},
- {"YUYV422_to_RGB888_bt709", "color_convert.cl"},
- {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"prelu", "prelu.cl"},
+ {"prelu_qasymm8", "prelu_quantized.cl"},
+ {"reduce_min_max", "reduce_operation.cl"},
+ {"reduce_sum_mean", "reduce_operation.cl"},
+ {"squared_difference", "squared_difference.cl"},
+ {"strided_slice_ex", "strided_slice_ex.cl"},
{"topkv2_init", "topkv2.cl"},
{"topkv2_find_first_negative", "topkv2.cl"},
{"topkv2_reorder_negatives", "topkv2.cl"},
@@ -296,23 +78,62 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"radixsort_pastehistograms", "topkv2_radixsort.cl"},
{"radixsort_reorder", "topkv2_radixsort.cl"},
{"topkv2_quicksort", "topkv2_quicksort.cl"},
+ {"space_to_batch_4d_nchw", "space_to_batch.cl"},
+ {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
+ {"space_to_depth", "space_to_depth.cl"},
};
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
+ "activation_layer_ex.cl",
+#include "./cl_kernels/activation_layer_ex.clembed"
+ },
+ {
+ "arg_operation.cl",
+#include "./cl_kernels/arg_operation.clembed"
+ },
+ {
+ "arithmetic_op_ex.cl",
+#include "./cl_kernels/arithmetic_op_ex.clembed"
+ },
+ {
+ "batch_to_space_nd.cl",
+#include "./cl_kernels/batch_to_space_nd.clembed"
+ },
+ {
"cast.cl",
#include "./cl_kernels/cast.clembed"
},
{
- "fixed_point.h",
-#include "./cl_kernels/fixed_point.hembed"
+ "comparison_op.cl",
+#include "./cl_kernels/comparison_op.clembed"
+ },
+ {
+ "comparison_op_quantized.cl",
+#include "./cl_kernels/comparison_op_quantized.clembed"
+ },
+ {
+ "embedding_lookup.cl",
+#include "./cl_kernels/embedding_lookup.clembed"
+ },
+ {
+ "depth_to_space.cl",
+#include "./cl_kernels/depth_to_space.clembed"
+ },
+ {
+ "exp.cl",
+#include "./cl_kernels/exp.clembed"
},
{
"gather.cl",
#include "./cl_kernels/gather.clembed"
},
{
+ "hashtable_lookup.cl",
+#include "./cl_kernels/hashtable_lookup.clembed"
+ },
+ {
"helpers.h",
#include "./cl_kernels/helpers.hembed"
},
@@ -321,6 +142,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/helpers_asymm.hembed"
},
{
+ "binary_logical_op.cl",
+#include "./cl_kernels/binary_logical_op.clembed"
+ },
+ {
+ "neg_tensor.cl",
+#include "./cl_kernels/neg_tensor.clembed"
+ },
+ {
+ "pad.cl",
+#include "./cl_kernels/pad.clembed"
+ },
+ {
"pixelwise_div_float.cl",
#include "./cl_kernels/pixelwise_div_float.clembed"
},
@@ -329,16 +162,32 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/pixelwise_div_int.clembed"
},
{
- "reduce_max.cl",
-#include "./cl_kernels/reduce_max.clembed"
+ "prelu.cl",
+#include "./cl_kernels/prelu.clembed"
+ },
+ {
+ "prelu_quantized.cl",
+#include "./cl_kernels/prelu_quantized.clembed"
+ },
+ {
+ "reduce_operation.cl",
+#include "./cl_kernels/reduce_operation.clembed"
+ },
+ {
+ "space_to_batch.cl",
+#include "./cl_kernels/space_to_batch.clembed"
},
{
- "reduction_mean.cl",
-#include "./cl_kernels/reduction_mean.clembed"
+ "space_to_depth.cl",
+#include "./cl_kernels/space_to_depth.clembed"
},
{
- "strided_slice.cl",
-#include "./cl_kernels/strided_slice.clembed"
+ "squared_difference.cl",
+#include "./cl_kernels/squared_difference.clembed"
+ },
+ {
+ "strided_slice_ex.cl",
+#include "./cl_kernels/strided_slice_ex.clembed"
},
{
"topkv2.cl",
@@ -352,6 +201,11 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
"topkv2_quicksort.cl",
#include "./cl_kernels/topkv2_quicksort.clembed"
},
+ {
+ "permute_ex.cl",
+#include "./cl_kernels/permute_ex.clembed"
+ },
+
#endif /* EMBEDDED_KERNELS */
};
@@ -359,7 +213,7 @@ CLKernelLibraryEx::CLKernelLibraryEx()
: _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
{
opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
- // CLKernelLibrary is built
+ // CLKernelLibraryEx is built
}
CLKernelLibraryEx &CLKernelLibraryEx::get()
@@ -380,7 +234,7 @@ Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
}
std::string concat_str;
- if (fp16_supported(_device))
+ if (fp16_supported())
{
concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
}
@@ -434,6 +288,13 @@ void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
_built_programs_map.emplace(built_program_name, program);
}
+bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); }
+
+bool CLKernelLibraryEx::int64_base_atomics_supported() const
+{
+ return device_supports_extension(_device, "cl_khr_int64_base_atomics");
+}
+
const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
{
const auto program_it = _programs_map.find(program_name);
@@ -525,6 +386,7 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con
cl::NDRange CLKernelLibraryEx::default_ndrange() const
{
+ // GPUTarget _target = get_target_from_device(_device);
cl::Device device = cl::Device::getDefault();
GPUTarget _target = get_target_from_device(device);
cl::NDRange default_range;
diff --git a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp
new file mode 100644
index 000000000..cbda169fb
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/OpenCLEx.h"
+
+#include <dlfcn.h>
+#include <iostream>
+
+namespace arm_compute
+{
+CLSymbolsEx &CLSymbolsEx::get()
+{
+ static CLSymbolsEx symbols;
+ return symbols;
+}
+
+bool CLSymbolsEx::load_default()
+{
+ static const std::vector<std::string> libraries{"libOpenCL.so", "libGLES_mali.so", "libmali.so"};
+
+ if (_loaded.first)
+ {
+ return _loaded.second;
+ }
+
+ // Indicate that default loading has been tried
+ _loaded.first = true;
+
+ for (const auto &lib : libraries)
+ {
+ if (load(lib))
+ {
+ return true;
+ }
+ }
+
+ std::cerr << "Couldn't find any OpenCL library.\n";
+ return false;
+}
+
+bool CLSymbolsEx::load(const std::string &library)
+{
+ void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
+
+ if (handle == nullptr)
+ {
+ std::cerr << "Can't load " << library << ": " << dlerror() << "\n";
+ // Set status of loading to failed
+ _loaded.second = false;
+ return false;
+ }
+
+#define LOAD_FUNCTION_PTR(func_name, handle) \
+ func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name));
+
+ LOAD_FUNCTION_PTR(clGetEventInfo, handle);
+ LOAD_FUNCTION_PTR(clSetEventCallback, handle);
+
+#undef LOAD_FUNCTION_PTR
+
+ // Don't call dlclose(handle) or all the symbols will be unloaded !
+
+ // Disable default loading and set status to successful
+ _loaded = std::make_pair(true, true);
+
+ return true;
+}
+
+} // namespace arm_compute
+
+cl_int clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size,
+ void *param_value, size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbolsEx::get().load_default();
+ auto func = arm_compute::CLSymbolsEx::get().clGetEventInfo_ptr;
+ if (func != nullptr)
+ {
+ return func(event, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clSetEventCallback(cl_event event, cl_int command_exec_callback_type,
+ void(CL_CALLBACK *pfn_ev_notify)(cl_event ev, cl_int ev_cmd_exec_status,
+ void *user_data),
+ void *user_data)
+{
+ arm_compute::CLSymbolsEx::get().load_default();
+ auto func = arm_compute::CLSymbolsEx::get().clSetEventCallback_ptr;
+ if (func != nullptr)
+ {
+ return func(event, command_exec_callback_type, pfn_ev_notify, user_data);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
new file mode 100644
index 000000000..f54c7bde3
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#define CONST_ONE 1.f
+#define DIV_OP(a, b) ((a) / (b))
+#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a)))
+
+// Inverse Square-root Activation
+inline TYPE rsqrt_op(TYPE x)
+{
+ return RSQRT_OP(x);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#if defined(ACT)
+
+/** This performs an activation function floating point inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer_ex(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load data
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+ // Perform activation
+ data = ACTIVATION_OP(ACT, data);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+#endif /* defined(ACT) */
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
new file mode 100644
index 000000000..9a6921d7c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform arg_max/arg_min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: U32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis through which reduction occurs for max value index
+ * @param[in] dim Dimension across the axis to be reduced.
+ */
+
+__kernel void arg_op(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int axis,
+ const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] =
+ {
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ DATA_TYPE tval = value;
+ int idx = 0;
+ for(int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+ #if OP_CODE == 1 // ArgMax
+ value = max(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+ #elif OP_CODE == 2 //ArgMin
+ value = min(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+ #else
+ return;
+
+ #endif
+
+ if(tval!=value)
+ {
+ idx = indices[axis];
+ tval = value;
+ }
+ }
+
+ *((__global uint *)out.ptr) = idx;
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
new file mode 100644
index 000000000..2ed698951
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** This function subtracts one tensors from another.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arithmetic_sub_ex(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate and store result
+ vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
index 0c0a9ede6..5cd0a4309 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -2,32 +2,20 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers_asymm.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif /* FIXED_POINT_POSITION */
-
#ifdef SATURATE
#define ADD(x, y) add_sat((x), (y))
#define SUB(x, y) sub_sat((x), (y))
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
new file mode 100644
index 000000000..ad6a48a02
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
+/** Perform batch to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ *
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nd(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ out_index[0] = get_global_id(0);//W
+ out_index[1] = get_global_id(1);//H
+ out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+ out_index[3] = get_global_id(2) / DEPTH_OUT;//N
+
+ in_index[0] = out_index[0]/BLOCK_SIZE1;
+ in_index[1] = out_index[1]/BLOCK_SIZE0;
+ in_index[2] = out_index[2];
+ in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
new file mode 100644
index 000000000..bea61f53e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OP_CODE) && defined(DATA_TYPE)
+/** returns truth value of the two input tensors for BINARY LOGICAL OP.
+ * where BINARY LOGICAL OP can be AND, OR.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(
+ TENSOR3D_DECLARATION(input1),
+ TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ #if OP_CODE == 1 // LOGICAL AND
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+
+ #elif OP_CODE == 2 // LOGICAL OR
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+
+ #else // OP NOT SUPPORTED
+  return;
+
+ #endif
+}
+#endif //if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
index 113804cca..3d4675e5d 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -2,38 +2,34 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
-#ifndef SCALE_IN
-#define SCALE_IN 1.0f
+#ifndef SCALE
+#define SCALE 1.0f
+#endif
+#ifndef OFFSET
+#define OFFSET 0
#endif
-#ifndef OFFSET_IN
-#define OFFSET_IN 0
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
#endif
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
/** Perform a cast operation on an input tensor.
*
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
@@ -65,9 +61,9 @@ __kernel void cast(
0, (__global DATA_TYPE_OUT *)output.ptr);
}
-
/** Perform a cast operation on an QASYMM8 input tensor.
- *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
@@ -96,8 +92,8 @@ __kernel void cast_qasymm_in(
VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
@@ -108,7 +104,8 @@ __kernel void cast_qasymm_in(
/** Perform a cast operation on an QASYMM8 output tensor.
- *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
@@ -137,8 +134,8 @@ __kernel void cast_qasymm_out(
VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
@@ -146,3 +143,4 @@ __kernel void cast_qasymm_out(
VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
0, (__global DATA_TYPE_OUT *)output.ptr);
}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
new file mode 100644
index 000000000..765072556
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
+/** Returns truth value of comparison operators.
+ * Comparison operators may be equal, not_equal etc.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT,
+ * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[in] input2_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void comparison_op(
+ TENSOR3D_DECLARATION(input1),
+ TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ #if OP_CODE == 1 //EQUAL
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr);
+
+ #elif OP_CODE == 2 //NOT_EQUAL
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
new file mode 100644
index 000000000..1eb305f7b
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) (x) - (y)
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+
+/** Returns the truth value of a comparison.
+ * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void comparison_op_qasymm8(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+ VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+  #if OP_CODE == 1 //EQUAL QUANTIZED
+  VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+  #elif OP_CODE == 2 //NOT EQUAL QUANTIZED
+ VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+}
+#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
new file mode 100644
index 000000000..fef2243e7
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+/** Perform depth to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr                            Pointer to the destination image. Supported data types: same as @p input_ptr
+ *
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void depth_to_space(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ out_index[0] = get_global_id(0);//W
+ out_index[1] = get_global_id(1);//H
+ out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+ out_index[3] = get_global_id(2) / DEPTH_OUT;//B
+
+ in_index[0] = out_index[0]/BLOCK_SIZE;
+ in_index[1] = out_index[1]/BLOCK_SIZE;
+ in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+ in_index[3] = out_index[3];
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..348458fe9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+ //lookup ids for based on the tensor dimensions
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+ :get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+ :get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+ :get_global_id(2)%DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ :get_global_id(2) / DEPTH_OUT;
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+ + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
new file mode 100644
index 000000000..69d94f30a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Perform an exponential operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void exp_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
deleted file mode 100644
index 7807533e2..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
+++ /dev/null
@@ -1,565 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_FIXED_POINT_H
-#define ARM_COMPUTE_FIXED_POINT_H
-
-#define TYPE_ALIAS(type, alias) \
- typedef type alias; \
- typedef type alias##x##1; \
- typedef type##2 alias##x##2; \
- typedef type##3 alias##x##3; \
- typedef type##4 alias##x##4; \
- typedef type##8 alias##x##8; \
- typedef type##16 alias##x##16;
-
-TYPE_ALIAS(char, qs8)
-TYPE_ALIAS(short, qs16)
-TYPE_ALIAS(int, qs32)
-
-#define qs8_MIN ((char)CHAR_MIN)
-#define qs8_MAX ((char)CHAR_MAX)
-#define qs16_MIN ((short)SHRT_MIN)
-#define qs16_MAX ((short)SHRT_MAX)
-#define qs32_MIN ((int)INT_MIN)
-#define qs32_MAX ((int)INT_MAX)
-
-#define qu8_MIN ((uchar)0)
-#define qu8_MAX ((uchar)UCHAR_MAX)
-#define qu16_MIN ((ushort)0)
-#define qu16_MAX ((ushort)USHRT_MAX)
-#define qu32_MIN ((uint)0)
-#define qu32_MAX ((uint)UINT_MAX)
-
-#define qs8_TYPE char
-#define qs8x1_TYPE char
-#define qs8x2_TYPE char2
-#define qs8x3_TYPE char3
-#define qs8x4_TYPE char4
-#define qs8x8_TYPE char8
-#define qs8x16_TYPE char16
-
-#define qs16_TYPE short
-#define qs16x1_TYPE short
-#define qs16x2_TYPE short2
-#define qs16x3_TYPE short3
-#define qs16x4_TYPE short4
-#define qs16x8_TYPE short8
-#define qs16x16_TYPE short16
-
-#define qs32_TYPE int
-#define qs32x1_TYPE int
-#define qs32x2_TYPE int2
-#define qs32x3_TYPE int3
-#define qs32x4_TYPE int4
-#define qs32x8_TYPE int8
-#define qs32x16_TYPE int16
-
-/* All internal constants are represented in the maximum supported fixed point format (QS16),
- * thus we define an additional shift parameter required to convert the constant
- * from the maximum supported format to the require one.
- */
-#define qs8_SHIFT 8
-#define qs16_SHIFT 0
-
-#undef VEC_DATA_TYPE_STR
-#undef VEC_DATA_TYPE
-#undef CONVERT_STR
-#undef CONVERT
-#undef CONVERT_SAT_STR
-#undef CONVERT_SAT
-
-#define VEC_DATA_TYPE_STR(type, size) type##x##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
-#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
-#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
-#define CONVERT(x, type) CONVERT_STR(x, type)
-
-#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
-#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
-#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-/** Computes saturating absolute value of fixed point vector.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point absolute value.
- */
-#define ABSQ_SAT_IMPL(type) \
- inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); }
-
-ABSQ_SAT_IMPL(qs8x16)
-ABSQ_SAT_IMPL(qs16x8)
-
-#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
-#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
-
-/** Computes max of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point maximum.
- */
-#define MAXQ_IMPL(type) \
- inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); }
-
-MAXQ_IMPL(qs8x1)
-MAXQ_IMPL(qs8x2)
-MAXQ_IMPL(qs8x4)
-MAXQ_IMPL(qs8x8)
-MAXQ_IMPL(qs8x16)
-MAXQ_IMPL(qs16x1)
-MAXQ_IMPL(qs16x2)
-MAXQ_IMPL(qs16x4)
-MAXQ_IMPL(qs16x8)
-MAXQ_IMPL(qs16x16)
-
-#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
-#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated addition of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point addition. The result is saturated in case of overflow
- */
-#define ADDQ_SAT_IMPL(type) \
- inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); }
-
-ADDQ_SAT_IMPL(qs8x1)
-ADDQ_SAT_IMPL(qs8x2)
-ADDQ_SAT_IMPL(qs8x4)
-ADDQ_SAT_IMPL(qs8x8)
-ADDQ_SAT_IMPL(qs8x16)
-ADDQ_SAT_IMPL(qs16x1)
-ADDQ_SAT_IMPL(qs16x2)
-ADDQ_SAT_IMPL(qs16x4)
-ADDQ_SAT_IMPL(qs16x8)
-ADDQ_SAT_IMPL(qs16x16)
-ADDQ_SAT_IMPL(qs32x1)
-ADDQ_SAT_IMPL(qs32x2)
-ADDQ_SAT_IMPL(qs32x4)
-ADDQ_SAT_IMPL(qs32x8)
-ADDQ_SAT_IMPL(qs32x16)
-
-#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
-#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated subtraction of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point subtraction. The result is saturated in case of overflow
- */
-#define SUBQ_SAT_IMPL(type) \
- inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); }
-
-SUBQ_SAT_IMPL(qs8x1)
-SUBQ_SAT_IMPL(qs8x2)
-SUBQ_SAT_IMPL(qs8x4)
-SUBQ_SAT_IMPL(qs8x8)
-SUBQ_SAT_IMPL(qs8x16)
-SUBQ_SAT_IMPL(qs16x1)
-SUBQ_SAT_IMPL(qs16x2)
-SUBQ_SAT_IMPL(qs16x4)
-SUBQ_SAT_IMPL(qs16x8)
-SUBQ_SAT_IMPL(qs16x16)
-
-#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
-#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/* Multiply of two fixed point numbers
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication.
- */
-#define MULQ_IMPL(type, itype) \
- inline type mul_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype round_val = (itype)(1 << (fixed_point_position - 1)); \
- itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
- return CONVERT((res >> (itype)fixed_point_position), type); \
- }
-
-MULQ_IMPL(qs8x8, qs16x8)
-MULQ_IMPL(qs16x8, qs32x8)
-MULQ_IMPL(qs8x16, qs16x16)
-MULQ_IMPL(qs16x16, qs32x16)
-
-#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
-#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
-
-/* Saturate multiply of two fixed point numbers
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication. The result is saturated in case of overflow
- */
-#define MULQ_SAT_IMPL(type, itype) \
- inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype round_val = (itype)(1 << (fixed_point_position - 1)); \
- itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
- return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
- }
-
-MULQ_SAT_IMPL(qs8x1, qs16x1)
-MULQ_SAT_IMPL(qs8x2, qs16x2)
-MULQ_SAT_IMPL(qs8x3, qs16x3)
-MULQ_SAT_IMPL(qs8x4, qs16x4)
-MULQ_SAT_IMPL(qs8x8, qs16x8)
-MULQ_SAT_IMPL(qs8x16, qs16x16)
-MULQ_SAT_IMPL(qs16x1, qs32x1)
-MULQ_SAT_IMPL(qs16x2, qs32x2)
-MULQ_SAT_IMPL(qs16x3, qs32x3)
-MULQ_SAT_IMPL(qs16x4, qs32x4)
-MULQ_SAT_IMPL(qs16x8, qs32x8)
-MULQ_SAT_IMPL(qs16x16, qs32x16)
-
-#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \
- mul_sat_##type##x##size((a), (b), (position))
-#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \
- MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate multiply-accumulate
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate. The result is saturated in case of
- * overflow
- */
-#define MLAQ_SAT_IMPL(type, itype) \
- type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
- { \
- itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
- (itype)(1 << (fixed_point_position - 1))); \
- return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
- }
-
-MLAQ_SAT_IMPL(qs8x8, qs16x8)
-MLAQ_SAT_IMPL(qs8x16, qs16x16)
-MLAQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
- mla_sat_##type##x##size((a), (b), (c), (position))
-#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \
- MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate multiply-accumulate long
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate long. The result is saturated in case
- * of overflow
- */
-#define MLALQ_SAT_IMPL(type, itype) \
- itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
- { \
- itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
- (itype)(1 << (fixed_point_position - 1))); \
- return add_sat(VopA, res >> (itype)fixed_point_position); \
- }
-
-MLALQ_SAT_IMPL(qs8x8, qs16x8)
-MLALQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
- mlal_sat_##type##x##size((a), (b), (c), (position))
-#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \
- MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate division of two fixed point vectors
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point division. The result is saturated in case of overflow
- */
-#define DIVQ_SAT_IMPL(stype, type, itype) \
- inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype conv_a = CONVERT((VopA), itype); \
- itype denominator = CONVERT((VopB), itype); \
- itype numerator = conv_a << (itype)(fixed_point_position); \
- itype res = select((itype)(numerator / denominator), \
- select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \
- (itype)(denominator == (itype)0)); \
- return CONVERT_SAT((res), type); \
- }
-
-DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
-DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
-DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
-DIVQ_SAT_IMPL(qs8, qs8, qs16)
-DIVQ_SAT_IMPL(qs16, qs16, qs32)
-
-#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
-#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
-
-#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \
- div_sat_##type##x##size((a), (b), (position))
-#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \
- DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate exponential of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the exponential function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point exponential. The result is saturated in case of overflow
- */
-#define EXPQ_IMPL(stype, type, size) \
- inline type exp_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \
- type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \
- type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \
- type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \
- type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \
- type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \
- type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \
- type dec_m = m >> (type)fixed_point_position; \
- type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \
- fixed_point_position); \
- alpha = CONVERT(abs_diff(VopA, alpha), type); \
- type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
- return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \
- clz(sum) > dec_m); /* Saturate result if needed */ \
- }
-
-EXPQ_IMPL(qs8, qs8x2, 2)
-EXPQ_IMPL(qs8, qs8x4, 4)
-EXPQ_IMPL(qs8, qs8x8, 8)
-EXPQ_IMPL(qs8, qs8x16, 16)
-EXPQ_IMPL(qs16, qs16x2, 2)
-EXPQ_IMPL(qs16, qs16x4, 4)
-EXPQ_IMPL(qs16, qs16x8, 8)
-EXPQ_IMPL(qs16, qs16x16, 16)
-
-#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
-#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate logarithm of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the logarithm function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point logarithm. The result is saturated in case of overflow
- */
-#define LOGQ_IMPL(stype, type, size) \
- inline type log_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */ \
- type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \
- type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \
- type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \
- type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \
- type inter_a = \
- select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \
- VopA < const_one); \
- type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \
- inter_a = inter_a >> shift_val; \
- inter_a = sub_sat(inter_a, const_one); \
- type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \
- sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \
- sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \
- sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \
- sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \
- size, fixed_point_position); \
- return select(select(sum, -sum, VopA < const_one), (type)0, \
- VopA < (type)0); /* Saturate result if needed */ \
- }
-
-LOGQ_IMPL(qs8, qs8x16, 16)
-LOGQ_IMPL(qs16, qs16x8, 8)
-LOGQ_IMPL(qs16, qs16x16, 16)
-
-#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
-#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate inverse square root of a fixed point vector
- *
- * @note Implemented approach uses Newton's method to approximate the inverse square root function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point inverse square root. The result is saturated in case of
- * overflow
- */
-#define INVSQRTQ_IMPL(stype, type, size) \
- inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_three = (type)(3 << (fixed_point_position)); \
- type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
- type temp = select((type)(VopA >> shift_value), \
- select((type)stype##_MAX, (type)(VopA << (-shift_value)), \
- (type)(clz(VopA) > (-shift_value))), \
- (type)(shift_value < (type)0)); \
- type x = temp; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \
- { \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- } \
- type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
- return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \
- (type)(clz(x) > shift_value2)), \
- (type)(shift_value < (type)0)); /* Saturate result if needed */ \
- }
-
-INVSQRTQ_IMPL(qs8, qs8x1, 1)
-INVSQRTQ_IMPL(qs16, qs16x1, 1)
-INVSQRTQ_IMPL(qs8, qs8x16, 16)
-INVSQRTQ_IMPL(qs16, qs16x8, 8)
-
-#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
-#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate hyperbolic tangent of a fixed point vector
- *
- * tanh(x) = (e^2x - 1)/(e^2x + 1)
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of
- * overflow
- */
-#define TANHQ_IMPL(stype, type, size) \
- inline type tanh_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type const_two = (type)(2 << (fixed_point_position)); \
- type exp2x = \
- EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \
- stype, size, fixed_point_position); \
- type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
- type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
- return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \
- }
-
-TANHQ_IMPL(qs8, qs8x16, 16)
-TANHQ_IMPL(qs16, qs16x8, 8)
-
-#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
-#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
-
-#define floatx16 float16
-#define float16_TYPE float16
-
-#define CONVERTQ_DOWN_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
- { \
- return CONVERT(a * (1 << fixed_point_position) + \
- select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
- out_type); \
- }
-
-CONVERTQ_DOWN_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_IMPL(float16, qs16x16)
-
-#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
- { \
- return CONVERT_SAT(a * (1 << fixed_point_position) + \
- select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
- out_type); \
- }
-
-CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
-
-#define CONVERTQ_UP_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
- { \
- return CONVERT(a, out_type) / (1 << fixed_point_position); \
- }
-
-CONVERTQ_UP_IMPL(qs8x16, float16)
-CONVERTQ_UP_IMPL(qs16x16, float16)
-
-#define SQCVT_SAT_IMPL(type) \
- inline type sqcvt_##type##_sat(float a, int fixed_point_position) \
- { \
- return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
- }
-
-SQCVT_SAT_IMPL(qs8)
-SQCVT_SAT_IMPL(qs16)
-
-#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
-#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
-
-#endif // ARM_COMPUTE_FIXED_POINT_H
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
index 25e20f5f2..6b767d6c9 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
new file mode 100644
index 000000000..ed7409852
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform hashtable_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+ :get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+ :get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+ :get_global_id(2)%DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ :get_global_id(2) / DEPTH_OUT;
+
+ if (lup_id[NUM_DIMS-1] < 0)
+ {
+ VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+ return;
+ }
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+ + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 8143d2398..0e123ae0a 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -24,15 +24,23 @@
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-#if defined(cl_arm_printf)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+ defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf)
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#define EXPAND(x) x
@@ -175,7 +183,7 @@ typedef struct Tensor4D
*
* @return An image object
*/
-Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
uint stride_x, uint step_x)
{
Vector vector = {
@@ -201,7 +209,7 @@ Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
*
* @return An image object
*/
-Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
uint stride_x, uint step_x, uint stride_y, uint step_y)
{
Image img = {.ptr = ptr,
@@ -230,7 +238,7 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
*
* @return A 3D tensor object
*/
-Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
uint offset_first_element_in_bytes,
uint stride_x, uint step_x, uint stride_y,
uint step_y, uint stride_z, uint step_z)
@@ -261,7 +269,7 @@ Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
*
* @return A 3D tensor object
*/
-Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
uint offset_first_element_in_bytes, uint stride_x,
uint step_x, uint stride_y, uint step_y, uint stride_z,
uint step_z)
@@ -276,7 +284,7 @@ Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
return tensor;
}
-Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
uint offset_first_element_in_bytes, uint stride_x,
uint step_x, uint stride_y, uint step_y, uint stride_z,
uint step_z, uint stride_w, uint step_w, uint mod_size)
@@ -299,7 +307,7 @@ Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
* @param[in] vec Pointer to the starting position of the buffer
* @param[in] x Relative X position
*/
-__global inline const uchar *vector_offset(const Vector *vec, int x)
+inline __global const uchar *vector_offset(const Vector *vec, int x)
{
return vec->ptr + x * vec->stride_x;
}
@@ -310,7 +318,7 @@ __global inline const uchar *vector_offset(const Vector *vec, int x)
* @param[in] x Relative X position
* @param[in] y Relative Y position
*/
-__global inline uchar *offset(const Image *img, int x, int y)
+inline __global uchar *offset(const Image *img, int x, int y)
{
return img->ptr + x * img->stride_x + y * img->stride_y;
}
@@ -322,7 +330,7 @@ __global inline uchar *offset(const Image *img, int x, int y)
* @param[in] y Relative Y position
* @param[in] z Relative Z position
*/
-__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}
@@ -335,7 +343,7 @@ __global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int
* @param[in] z Relative Z position
* @param[in] w Relative W position
*/
-__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
w * tensor->stride_w;
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
new file mode 100644
index 000000000..e3aa463db
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Performs a negation of input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void neg_tensor(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
new file mode 100644
index 000000000..ecf4696e9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
 * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ *
+ * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for
+ * batch), Top(for height), Left(for width) and Front(for depth) are
+ * required. Supported data type: S32
+ */
+
+__kernel void pad(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int4 pad_values)
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int index[4]={0};
+
+ index[0] = get_global_id(0);//W
+ index[1] = get_global_id(1);//H
+ index[2] = get_global_id(2) % DEPTH_OUT;//C
+ index[3] = get_global_id(2) / DEPTH_OUT;//N
+
+ if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
+ index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
+ index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
+ index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
+ {
+ *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+ }
+ else
+ {
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)
+ tensor4D_offset(&in, index[0] - pad_values.x,
+ index[1] - pad_values.y,
+ index[2] - pad_values.z,
+ index[3] - pad_values.w));
+ }
+ }
+
+#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
new file mode 100644
index 000000000..7cc8b0354
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/** Perform a Generic permute operation on an input tensor of Shape DCHW.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
 * @attention Permutation vector is passed as a preprocessor argument using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2
+ *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute_generic(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ int out_index[4];
+ int in_index[4];
+ in_index[0] = get_global_id(0);//W
+ in_index[1] = get_global_id(1);//H
+ in_index[2] = get_global_id(2) % DEPTH_IN;//C
+ in_index[3] = get_global_id(2) / DEPTH_IN;//B
+ out_index[0] = in_index[P1];
+ out_index[1] = in_index[P2];
+ out_index[2] = in_index[P3];
+ out_index[3] = in_index[P4];
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
index 512c62023..aa05121b1 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
index 82edf3b1d..fdfb78003 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
@@ -2,40 +2,20 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-
-#include "fixed_point.h"
-
-#if defined(SATURATE)
-#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#else // SATURATE
-#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#endif // SATURATE
-
-#else // FIXED_POINT_POSITION
-
#if defined(SATURATE)
#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
#else // SATURATE
@@ -45,17 +25,14 @@
#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
-#endif // FIXED_POINT_POSITION
-
/** Performs a pixelwise division with integer scale of integer inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
* @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES.
* e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
@@ -79,7 +56,7 @@
* @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
+ * @param[in] scale Integer scaling factor. Supported data types: S32
*/
__kernel void pixelwise_div_int(
TENSOR3D_DECLARATION(in1),
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
index ddc9d5a27..ab1307e64 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers_asymm.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
new file mode 100644
index 000000000..68da2ba32
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32
+ * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ?
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) :
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
new file mode 100644
index 000000000..7e97b7ed6
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) (x) - (y)
+
+#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take uchar data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data types : QASYMM8
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8
+ * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu_qasymm8(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+ VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+ const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
+ const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+ const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global uchar *)output.ptr);
+}
+
+#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
deleted file mode 100644
index dfa3b85f4..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(WIDTH)
-/** Perform reduce max
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[out] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[out] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduce_max(VECTOR_DECLARATION(input),
- VECTOR_DECLARATION(output))
-{
- Vector input = CONVERT_TO_VECTOR_STRUCT(input);
- Vector output = CONVERT_TO_VECTOR_STRUCT(output);
-
- __global float *input_addr = (__global float *)(input.ptr);
- __global float *output_addr = (__global float *)(output.ptr);
-
- float max_value = *input_addr;
- for(int x = 1; x < WIDTH; x++)
- {
- float value = *(input_addr + x);
- max_value = max(value, max_value);
- }
-
- // Store max
- *output_addr = max_value;
-}
-#endif // defined(WIDTH)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
new file mode 100644
index 000000000..8bef49363
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform reduce max/min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis through which reduction occurs
+ * @param[in] dim Dimension across the axis to be reduced.
+ */
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int axis,
+ const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ // Output coordinates of this work-item (x, y, z, w); the z and w
+ // coordinates are packed together into global id 2.
+ int indices[4] =
+ {
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ // Seed the reduction with element 0 along `axis`, then fold in
+ // elements 1..dim-1 with max (OP_CODE 1) or min (OP_CODE 2).
+ DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ for(int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+ #if OP_CODE == 1 // REDUCE_MAX
+ value = max(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+ #elif OP_CODE == 2 // REDUCE_MIN
+ value = min(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+ #else // OP NOT SUPPORTED
+ // Unsupported OP_CODE: leave the output untouched for this work-item.
+ return;
+
+ #endif
+ }
+
+ *((__global DATA_TYPE *)out.ptr) = value;
+}
+
+/** Perform reduce sum/mean
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
+ * -DOP_CODE = number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis through which reduction occurs
+ * @param[in] dim Dimension across the axis to be reduced.
+ */
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int axis,
+ const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ // Output coordinates of this work-item (x, y, z, w); z and w share
+ // global id 2.
+ int indices[4] =
+ {
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ // Accumulate every element along the reduced axis.
+ // NOTE(review): the accumulator is DATA_TYPE itself, so for narrow
+ // integer types the sum may wrap and the mean is truncated by the
+ // integer division below — confirm this matches the host-side contract.
+ DATA_TYPE sum_value = (DATA_TYPE)0;
+ for(int i = 0; i < dim; ++i)
+ {
+ indices[axis] = i;
+ sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ }
+
+ #if OP_CODE == 3 // REDUCE_SUM
+ *((__global DATA_TYPE *)out.ptr) = sum_value;
+
+ #elif OP_CODE == 4 // REDUCE_MEAN
+ *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+
+ #else // OP NOT SUPPORTED
+ // Unsupported OP_CODE: leave the output untouched for this work-item.
+ return;
+
+ #endif
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
deleted file mode 100644
index 1a96eea61..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-inline DATA_TYPE sum_8(__global const DATA_TYPE *input)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in = vload8(0, input);
- in.s0123 += in.s4567;
- in.s01 += in.s23;
- return ((in.s0 + in.s1));
-}
-
-/** This function calculates the sum and sum of squares of a given input image.
- *
- * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] local_sum Local sum of all elements
- * @param[in] height Height of the input image
- * @param[in] divider Divider to calculate mean
- */
-__kernel void reduction_mean(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- __local DATA_TYPE *local_sums,
- int height,
- int divider)
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- float8 tmp_sum = 0;
- // Calculate partial sum
-
- for(int i = 0; i < height; i++)
- {
- local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i));
- }
- ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider;
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 000000000..a0fc2d5a9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+/** Perform space to batch with input of 4D and NCHW format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(block_size),
+ IMAGE_DECLARATION(padding_size))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ // block_size is a 2-element S32 vector; element 0 is the block width,
+ // element 1 the block height.
+ int block_size_x = *((__global int *)(block_size_ptr));
+ int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+ // Which cell of the block this output batch corresponds to: output batches
+ // are grouped as (shift_y * block_size_x + shift_x) blocks of BATCH_IN.
+ int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+ int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+ // Map the output coordinate back to input (x, y, c, b), subtracting the
+ // leading pad. NOTE(review): assumes padding_size row 0 holds the X pads
+ // and row 1 the Y pads, first column = pad-before — confirm against the
+ // host-side CLSpaceToBatchNDKernel setup.
+ int in_index[4] = {0, };
+ in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+ in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+ in_index[2] = get_global_id(2) % DEPTH_OUT;
+ in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+ // Coordinates that land in the padded border produce the pad value.
+ if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+ {
+ *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+ }
+ else
+ {
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+ }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(block_size),
+ IMAGE_DECLARATION(padding_size))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+ // block_size is a 2-element S32 vector; element 0 is the block width,
+ // element 1 the block height.
+ int block_size_x = *((__global int *)(block_size_ptr));
+ int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+ // Cell of the block this output batch corresponds to (see NCHW variant).
+ int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+ int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+ // NHWC layout: index 0 = channel (processed VEC_SIZE at a time),
+ // 1 = width, 2 = height, 3 = batch. The leading pads are subtracted
+ // from the spatial coordinates.
+ int in_index[4] = {0, };
+ in_index[0] = get_global_id(0) * VEC_SIZE;
+ in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+ in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+ in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+ // Spatial coordinates in the padded border produce the pad value for the
+ // whole channel vector.
+ if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+ {
+ VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+ }
+ else
+ {
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)out.ptr);
+ }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
new file mode 100644
index 000000000..f6977045a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_depth(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ // Input coordinates of this work-item; channel and batch are packed
+ // together in global id 2.
+ in_index[0] = get_global_id(0);//W
+ in_index[1] = get_global_id(1);//H
+ in_index[2] = get_global_id(2) % DEPTH_IN;//C
+ in_index[3] = get_global_id(2) / DEPTH_IN;//B
+
+ // Each BLOCK_SIZE x BLOCK_SIZE spatial patch collapses into the channel
+ // dimension: out C = in C + (y-in-block * BLOCK_SIZE + x-in-block) * DEPTH_IN,
+ // while spatial dims shrink by BLOCK_SIZE. Batch is unchanged.
+ out_index[0] = in_index[0]/BLOCK_SIZE;
+ out_index[1] = in_index[1]/BLOCK_SIZE;
+ out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+ out_index[3] = in_index[3];
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
new file mode 100644
index 000000000..3e1a5c97f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns true value of squared_difference of two tensors.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void squared_difference(
+    TENSOR3D_DECLARATION(input1),
+    TENSOR3D_DECLARATION(input2),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+    Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Element-wise difference of VEC_SIZE elements per work-item.
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr);
+
+    // (a - b)^2, computed component-wise on the vector.
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    sq_diff = diff * diff;
+
+    VSTORE(VEC_SIZE)
+    (sq_diff, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
deleted file mode 100644
index c5ff82f9e..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-
-inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w)
-{
- int stride_x = vector->stride_x;
- int stride_y = stride_x * dim_x;
- int stride_z = stride_y * dim_y;
- int stride_w = stride_z * dim_z;
- Tensor4D tensor =
- {
- .ptr = vector->ptr,
- .offset_first_element_in_bytes = vector->offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z,
- .stride_w = stride_w,
- };
- return tensor;
-}
-
-/** Extracts a strided slice up to 4-dimensions
- *
- * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
- * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32
- * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32
- * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32
- * @param[in] strides The stride of Y dimension of input tensor to be sliced. Supported data types: S32
- */
-__kernel void strided_slice(VECTOR_DECLARATION(input),
- VECTOR_DECLARATION(output),
- const int4 dims_in,
- const int4 dims_out,
- const int4 starts,
- const int4 strides)
-{
- // TODO: Should be change to CONVERT_TO_TENSOR4D_STRUCT in order to reduce inference of the offset
- Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
- Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-
- // Implemenation
- // Infer a Tensor4D from output Vector and output's dimensions info
- // Infer a Tensor4D from input Vector and input's dimensions info
- // Infer indices of output as 4D from the offset of output vector
- // Infer indices of input as 4D from indices of output
- // out(offset of output vector) = in(offset of input)
-
- Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w);
- Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w);
-
- // Must be output_step_x == output_stride_x == an element's size
- const int offset_out = get_global_id(0) * output_stride_x;
- int4 indices_out =
- {
- get_global_id(0) % dims_out.x,
- (offset_out / tensor_out.stride_y) % dims_out.y,
- (offset_out / tensor_out.stride_z) % dims_out.z,
- (offset_out / tensor_out.stride_w) % dims_out.w,
- };
-
- int4 indices_in =
- {
- starts.x + (strides.x * indices_out.x),
- starts.y + (strides.y * indices_out.y),
- starts.z + (strides.z * indices_out.z),
- starts.w + (strides.w * indices_out.w),
- };
-
- *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
new file mode 100644
index 000000000..b39c55b96
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
+/** Extracts a strided slice up to 4-dimensions
+ *
+ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  starts                               Start index of the slice for each of the 4 dimensions. Supported data types: S32
+ * @param[in]  strides                              Stride (step) of the slice for each of the 4 dimensions. Supported data types: S32
+ */
+__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input),
+                               TENSOR4D_DECLARATION(output),
+                               const int4 starts,
+                               const int4 strides)
+{
+  // Input coordinates are computed manually below, so its struct is built with
+  // depth 0; the output struct follows the work-item ids directly.
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+  // Map each output element (x, y, z, w) back to its source element:
+  // in = starts + strides * out. z and w are unpacked from global id 2.
+  int4 indices_in =
+  {
+    starts.x + (strides.x * get_global_id(0)),
+    starts.y + (strides.y * get_global_id(1)),
+    starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)),
+    starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)),
+  };
+  *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
+}
+#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
index 0b0cf8218..d97f23a47 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
index deadf8412..0292fab04 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
index cac0c071e..c2c2d89a4 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
// reference:
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
new file mode 100644
index 000000000..1fdd2f98f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/UtilsEx.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Validates input/output tensor infos for the extended activation kernel.
+// Output checks only run once the output has been configured (non-empty).
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ActivationLayerInfoEx &act_info)
+{
+  // act_info only influences code generation (build options), not validity.
+  ARM_COMPUTE_UNUSED(act_info);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::F16, DataType::F32);
+
+  // Checks performed when output is configured
+  if ((output != nullptr) && (output->total_size() != 0))
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  return Status{};
+}
+
+// Computes the execution window for the kernel and adjusts tensor padding.
+// A null output means in-place execution; only the input window is padded then.
+// Returns an error Status when the required padding could not be satisfied.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  if (output != nullptr)
+  {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input);
+  }
+
+  // 16 bytes are processed per work-item, i.e. 16 / element_size elements.
+  const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+  bool window_changed = false;
+
+  if (output != nullptr)
+  {
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+  }
+  else
+  {
+    // In-place: only the input access window participates in padding updates.
+    window_changed = update_window_and_padding(
+        win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+  }
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+// Default-construct in the "unconfigured" state; configure() sets all members.
+CLActivationLayerExKernel::CLActivationLayerExKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+// Builds the OpenCL kernel for the extended activation function and configures
+// its execution window. A null (or aliasing) output selects in-place mode.
+void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
+                                          ActivationLayerInfoEx act_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+  _run_in_place = (output == nullptr) || (output == input);
+
+  if (output != nullptr)
+  {
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+  }
+
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
+
+  // 16 bytes per work-item, so the vector width depends on the element size.
+  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+  const DataType dt = input->info()->data_type();
+  float a_const = act_info.a();
+  float b_const = act_info.b();
+  int a_const_int = 0;
+  int b_const_int = 0;
+
+  // Create quantized version of constants a, b if needed
+  if (is_data_type_quantized(dt))
+  {
+    a_const_int =
+        input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
+    b_const_int =
+        input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
+  }
+
+  // Set build options
+  std::set<std::string> build_opts;
+  build_opts.emplace(
+      ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized(dt))
+  {
+    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+
+    const int o1 = input->info()->quantization_info().offset;
+    // Quantized value of 0 corresponds to the offset o1
+    build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
+
+    // Set scale and offset of the input and output if they have different quantization info
+    if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
+    {
+      const float s1 = input->info()->quantization_info().scale;
+      const float s2 = output->info()->quantization_info().scale;
+      const int o2 = output->info()->quantization_info().offset;
+
+      // Requantization constants are only emitted when the params differ.
+      if (o1 != o2 || s1 != s2)
+      {
+        build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+        build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+        build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+      }
+    }
+  }
+  else
+  {
+    build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+    build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+  }
+
+  build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
+
+  // Create kernel
+  std::string kernel_name = std::string("activation_layer_ex");
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Make sure _kernel is initialized before calling the parent's configure
+  _input = input;
+  _output = output;
+
+  // Configure kernel window
+  auto win_config =
+      validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Set config_id for enabling LWS tuning
+  _config_id = "activation_layer_ex_";
+  _config_id += lower_string(string_from_data_type(dt));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+// Static validation: mirrors configure() without touching any real tensors.
+Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ActivationLayerInfoEx &act_info)
+{
+  // In-place when no distinct output is provided (or output aliases input).
+  const bool run_in_place = (output == nullptr) || (output == input);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
+  // Window validation runs on clones so the caller's tensor infos stay untouched.
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_and_configure_window(input->clone().get(),
+                                    (run_in_place) ? nullptr : output->clone().get())
+          .first);
+
+  return Status{};
+}
+
+// Enqueues the kernel once per 3D slice of the (possibly collapsed) window.
+void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Collapse Z with higher dimensions when possible to reduce dispatch count.
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = collapsed.first_slice_window_3D();
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice);
+    // In-place mode reuses the input tensor; no output argument is bound.
+    if (!_run_in_place)
+    {
+      add_3D_tensor_argument(idx, _output, slice);
+    }
+    enqueue(queue, *this, slice, lws_hint());
+  } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
new file mode 100644
index 000000000..c1a2ad0be
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Shape of the arg-min/max result: identical to the input except that the
+// reduced axis collapses to a single element.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
+{
+  TensorShape result{input_shape};
+  result.set(argminmax_axis, 1);
+  return result;
+}
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validates input/output tensor infos and the requested reduction axis.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const uint32_t argminmax_axis, ArgOperation op)
+{
+  // 'op' does not affect shape/type validity; configure() checks it separately.
+  ARM_COMPUTE_UNUSED(op);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+                                                       DataType::U8);
+  // Use the RETURN_* variant so this Status-returning validator reports the
+  // mismatch instead of aborting.
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match argminmax_axis");
+
+  // RETURN_ERROR_ON_MSG fires when the condition is TRUE, so the guard must
+  // express the INVALID case. The original condition was inverted and rejected
+  // every valid axis. argminmax_axis is unsigned, so ">= 0" is always true and
+  // only the upper bound needs checking.
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      argminmax_axis >= num_dimensions,
+      "argminmax_axis must be greater than or equal to 0 and less than (input's rank).");
+  return Status{};
+}
+
+} // namespace
+
+CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
+
+// Builds the generic "arg_op" OpenCL kernel for the requested min/max
+// reduction and configures its window over the inferred output shape.
+void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  // BUG FIX: validate_arguments() takes four parameters; the 'op' argument was
+  // missing, which does not compile.
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis, op));
+
+  _input = input;
+  _output = output;
+  _argminmax_axis = argminmax_axis;
+
+  // Kernel-side output shape: input shape with the reduced axis collapsed to 1.
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
+
+  // Select the operation code passed to the generic "arg_op" kernel.
+  std::string kernel_name = "arg_op";
+  int op_code = 0;
+  if (op == ArgOperation::MAX)
+  {
+    op_code = 1;
+  }
+  else if (op == ArgOperation::MIN)
+  {
+    op_code = 2;
+  }
+  else
+  {
+    throw std::runtime_error("Operation not supported, yet");
+  }
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window over the inferred (reduced) output shape
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Static validation: mirrors configure() without touching any real tensors.
+Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const uint32_t argminmax_axis, ArgOperation op)
+{
+  // BUG FIX: a Status-returning validate() must report null pointers via a
+  // Status, not abort; use the RETURN_* macro variant.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
+
+  return Status{};
+}
+
+// Enqueues the kernel per collapsed 4D slice. The output TensorInfo's shape is
+// temporarily swapped to the reduced shape while binding arguments, then
+// restored before returning.
+void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  // Extra scalar kernel arguments: the reduced axis and its extent.
+  _kernel.setArg<cl_int>(idx++, _argminmax_axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice (zeroed dimensions: the kernel reads the full input)
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  // NOTE(review): presumably the reduced shape is needed so the 4D tensor
+  // argument strides match the kernel's view of the output — confirm.
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
new file mode 100644
index 000000000..1c505b4d5
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validate data types and broadcast compatibility of the subtraction operands.
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output, ConvertPolicy policy)
+{
+  // The convert policy only selects wrap/saturate code generation; it cannot
+  // make an otherwise valid configuration invalid.
+  ARM_COMPUTE_UNUSED(policy);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+
+  // An empty broadcast shape means the two input shapes are incompatible.
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        output->data_type() == DataType::U8 &&
+            (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+        "Output can only be U8 if both inputs are U8");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+
+  return Status{};
+}
+
+// Auto-initialize the output info (if empty) and compute the execution window,
+// reporting an error when the required padding cannot be obtained.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
+                                                        ITensorInfo *output)
+{
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output, out_shape);
+
+    // Output format follows the "wider" of the two input formats.
+    if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output, Format::S16);
+    }
+    else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output, Format::F16);
+    }
+    else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output, Format::F32);
+    }
+  }
+
+  // Broadcast dimensions of size one so every tensor is traversed with the same window.
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+// Default-construct with unset tensor pointers; configure() must run before run().
+CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Validate the operands, build the CL program options and configure the window.
+void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                                ICLTensor *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input1->info(), input2->info(), output->info(), policy));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+  // Set kernel build options
+  // Floating-point outputs always use WRAP: saturation only applies to integers.
+  std::set<std::string> build_opts;
+  build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts));
+
+  ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
+                                                 const ITensorInfo *input2,
+                                                 const ITensorInfo *output, ConvertPolicy policy)
+{
+  // validate() must report failures through the returned Status rather than
+  // aborting, so use the RETURN variant of the null-pointer check.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+  // Run the window configuration on clones so the caller's infos stay untouched.
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
+                                                            input2->clone().get(),
+                                                            output->clone().get())
+                                  .first);
+
+  return Status{};
+}
+
+// Enqueue the subtraction kernel over 3D slices, honouring broadcasting.
+void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  // Collapsing DimZ and above is only legal when neither input is broadcast
+  // there, i.e. their extents match for every dimension >= DimZ.
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  // Keep the input shapes consistent with the (possibly) collapsed window.
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // The input slices are advanced for their side effect only; the output
+    // slice alone drives loop termination.
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Right-hand border needed when one input is broadcast along X: replicate at
+// most one vector's worth of elements.
+BorderSize CLArithmeticSubtractionExKernel::border_size() const
+{
+  const unsigned int smallest_width =
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int replicate_size = _output->info()->dimension(0) - smallest_width;
+  const unsigned int right_border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicate_size);
+  return BorderSize(0, right_border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
new file mode 100644
index 000000000..b0016d23c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Validate data types, block sizes, and the relationship between input and
+// output dimensions for the batch-to-space reshape.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t *block_size)
+{
+  // Use RETURN variants: this helper reports through a Status, not an abort.
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  // NOTE: each macro below returns an error when its condition is TRUE, so the
+  // conditions must describe the FAILURE case. The previous code had every one
+  // of them inverted and rejected all valid configurations.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] < 1 || block_size[1] < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                  "Input Depth should be equal to Output Depth");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(3) * block_size[0] * block_size[1] != input->dimension(3),
+      "Input batch should be equal to (output batch * block size[0] *block size[1])");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) % block_size[1] != 0) ||
+                                      (output->dimension(1) % block_size[0] != 0),
+                                  "Output height and width should be divisible by block size[0] "
+                                  "and block_size[1] respectively");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != input->dimension(0) * block_size[1]) ||
+                                      (output->dimension(1) != input->dimension(1) * block_size[0]),
+                                  "Output height and width should be equal to "
+                                  "input_height*blocksize[0] and input_width*blocksize[1] "
+                                  "respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+// Default-construct with unset tensor pointers; configure() must run before run().
+CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
+
+// Validate the operands, bake block sizes into the CL program, and set up the window.
+void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                       const int32_t *block_size)
+{
+
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  // Block sizes and output batch/depth become compile-time constants of the kernel.
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]));
+  build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]));
+  build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3)));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  // The whole output tensor is valid; no border handling is needed.
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the batch-to-space kernel once per collapsed 4D output slice.
+void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  // The kernel is driven by the collapsed output window; the input window is
+  // degenerate (all dimensions zeroed) so the whole input tensor is visible.
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..3d2f2c702
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validate operand data types and broadcast compatibility for logical ops.
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+                           const ITensorInfo *output)
+{
+  // An empty broadcast shape means the two input shapes are incompatible.
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+// Default-construct with unset tensor pointers; configure() must run before run().
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Validate the operands, build the CL program for the requested logical op,
+// and configure the broadcast-aware execution window.
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                        ICLTensor *output, BinaryLogicalOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "binary_logical_op";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+  // Map the requested operation to the OP_CODE understood by the CL source.
+  int op_code = 0;
+  switch (op)
+  {
+    case BinaryLogicalOperation::AND:
+      op_code = 1;
+      break;
+    case BinaryLogicalOperation::OR:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error("Operation not supported, yet");
+  }
+
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Broadcast dimensions of size one so every tensor uses the same window.
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  // The previous code computed window_changed but never acted on it; fail fast
+  // instead of silently running with insufficient padding.
+  ARM_COMPUTE_ERROR_ON_MSG(window_changed, "Insufficient Padding!");
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the logical-op kernel over 3D slices, honouring broadcasting.
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  // Collapsing DimZ and above is only legal when neither input is broadcast
+  // there, i.e. their extents match for every dimension >= DimZ.
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  // Keep the input shapes consistent with the (possibly) collapsed window.
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // The input slices are advanced for their side effect only; the output
+    // slice alone drives loop termination.
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Right-hand border needed when one input is broadcast along X: replicate at
+// most one vector's worth of elements.
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+  const unsigned int smallest_width =
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int replicate_size = _output->info()->dimension(0) - smallest_width;
+  const unsigned int right_border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicate_size);
+  return BorderSize(0, right_border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
index b019e8c33..bf7ebae3f 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -17,15 +17,8 @@
#include "arm_compute/core/CL/kernels/CLCastKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
using namespace arm_compute;
@@ -60,8 +53,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
{
const float scale_in = input->info()->quantization_info().scale;
const int offset_in = input->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
_kernel = static_cast<cl::Kernel>(
CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
@@ -70,8 +63,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
{
const float scale_in = output->info()->quantization_info().scale;
const int offset_in = output->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
_kernel = static_cast<cl::Kernel>(
CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
@@ -88,7 +81,7 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->info()->valid_region());
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
new file mode 100644
index 000000000..5af5b16ea
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validate operand data types and broadcast compatibility for comparison ops.
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output)
+{
+  // An empty broadcast shape means the two input shapes are incompatible.
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    // NOTE(review): the output is restricted to QASYMM8 here — presumably the
+    // comparison result is encoded as an 8-bit value; confirm against the CL kernel.
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+// Default-construct with unset tensor pointers; configure() must run before run().
+CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Validate the operands, build the CL program for the requested comparison
+// (with a quantized variant when the operands' quantization differs), and
+// configure the broadcast-aware execution window.
+void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                     ICLTensor *output, const ComparisonOperation &op)
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "comparison_op";
+  int op_code = 0;
+
+  // Map the requested operation to the OP_CODE understood by the CL source.
+  switch (op)
+  {
+    case ComparisonOperation::EQUAL:
+      op_code = 1;
+      break;
+    case ComparisonOperation::NOT_EQUAL:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error(" Operation not supported, yet");
+  }
+
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
+  build_opts.emplace(
+      ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  // When the quantized operands use different scales/offsets the kernel must
+  // dequantize before comparing, so select the qasymm8 variant.
+  if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
+      ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
+       (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
+  {
+    build_opts.emplace("-DOFFSET_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().offset));
+    build_opts.emplace("-DOFFSET_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().offset));
+    build_opts.emplace("-DSCALE_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().scale));
+    kernel_name += "_qasymm8";
+  }
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    // Output format follows the "wider" of the two input formats.
+    if (input1->info()->data_type() == DataType::S16 ||
+        input2->info()->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if (input1->info()->data_type() == DataType::F16 &&
+             input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  // Broadcast dimensions of size one so every tensor uses the same window.
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  // NOTE(review): window_changed is computed but never checked — other kernels
+  // in this file raise "Insufficient Padding!" here; confirm this is intentional.
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the comparison kernel over 3D slices, honouring broadcasting.
+void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  // Collapsing DimZ and above is only legal when neither input is broadcast
+  // there, i.e. their extents match for every dimension >= DimZ.
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  // Keep the input shapes consistent with the (possibly) collapsed window.
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // The input slices are advanced for their side effect only; the output
+    // slice alone drives loop termination.
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Right-hand border needed when one input is broadcast along X: replicate at
+// most one vector's worth of elements.
+BorderSize CLComparisonOpKernel::border_size() const
+{
+  const unsigned int smallest_width =
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int replicate_size = _output->info()->dimension(0) - smallest_width;
+  const unsigned int right_border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicate_size);
+  return BorderSize(0, right_border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..c386e3312
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  // ARM_COMPUTE_RETURN_ERROR_ON_MSG raises the error when the condition is TRUE, so each
+  // check below must express the INVALID case (the original had every condition inverted).
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+                                  "Output width should be equal to (Input width * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+                                  "Output height should be equal to (Input height * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+                                  "Input depth should be divisible by (block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(2) != input->dimension(2) / (block_size * block_size),
+      "Output depth should be equal to (Input depth / (block size * block size))");
+
+  return Status{};
+}
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t block_size)
+{
+
+ _input = input;
+ _output = output;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..0862b78bf
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *lookups)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  // Input must be 2D..4D; the original '< 2 && > 4' condition could never be true.
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+  return Status{};
+}
+
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "embedding_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_in);
+ add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
new file mode 100644
index 000000000..b1ee21bdc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLExpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Auto initialize output
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLExpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
index 23efafa6a..ae2801e2b 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
@@ -17,26 +17,14 @@
#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
using namespace arm_compute;
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+constexpr unsigned int num_elems_processed_per_iteration = 1;
Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output)
@@ -46,6 +34,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
return Status{};
}
@@ -57,8 +46,7 @@ CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(n
void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
_input1 = input1;
_input2 = input2;
@@ -89,11 +77,10 @@ void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 1;
Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
new file mode 100644
index 000000000..cd7b21c6d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                         const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *hits)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's shape was not set");
+
+  // ARM_COMPUTE_ERROR_ON fires when the condition is TRUE, so flag the MISMATCH cases.
+  // The original used '==' joined by '&&' (erroring on valid shapes) and called the
+  // non-existent ITensorInfo::dimensions(); the accessor is dimension().
+  ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+                       output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+  ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+  ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+
+  return Status{};
+}
+
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+  // validate() takes (lookups, keys, input, output, hits); the original passed only three
+  // arguments in the wrong order, validating the wrong tensors.
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+  _lookups = lookups;
+  _keys = keys;
+  _input = input;
+  _output = output;
+  _hits = hits;
+
+  // Scratch tensor holding, per lookup, the resolved row index into 'input' (-1 on miss);
+  // filled on the host in run() before the kernel is enqueued.
+  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+  _lookup_indices->allocator()->init(
+      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+  _lookup_indices->allocator()->allocate();
+
+  // Set kernel build options
+  std::stringstream kernel_name;
+  std::set<std::string> build_opts;
+  kernel_name << "hashtable_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+}
+
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const_cast<ICLTensor *>(_lookups)->map(queue);
+ const_cast<ICLTensor *>(_keys)->map(queue);
+ _hits->map(queue);
+ _lookup_indices->map(queue);
+
+ // Set values of hits
+ const int32_t *lookups_buf =
+ reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+ const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+ uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+ int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
+ std::map<int32_t, size_t> key_map;
+ const size_t keys_num = _keys->info()->dimension(0);
+ for (size_t key_index = 0; key_index < keys_num; key_index++)
+ {
+ key_map[keys_buf[key_index]] = key_index;
+ }
+
+ const size_t lookups_num = _lookups->info()->dimension(0);
+ for (size_t i = 0; i < lookups_num; ++i)
+ {
+ const auto lookup_value = lookups_buf[i];
+ const auto it = key_map.find(lookup_value);
+ if (it != key_map.end())
+ {
+#if defined(DEBUG)
+ if (it->second >= lookups_num)
+ ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(DEBUG)
+ lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+ hits_buf[i] = static_cast<uint8_t>(1);
+ }
+ else
+ {
+ lookup_indices_buf[i] = -1;
+ hits_buf[i] = static_cast<uint8_t>(0);
+ }
+ }
+
+ const_cast<ICLTensor *>(_lookups)->unmap(queue);
+ const_cast<ICLTensor *>(_keys)->unmap(queue);
+ _hits->unmap(queue);
+ _lookup_indices->unmap(queue);
+
+ Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, win);
+ add_4D_tensor_argument(idx, _output, win);
+ add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+ enqueue(queue, *this, win);
+ } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
new file mode 100644
index 000000000..80d99dd3b
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+                                                DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+                                                DataType::F16, DataType::F32);
+  // 'input'/'output' are already ITensorInfo pointers here; the original called ->info()
+  // on them, which does not exist on ITensorInfo and would not compile.
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  return Status{};
+}
+
+} // namespace
+
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
new file mode 100644
index 000000000..12bbe910f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ const unsigned int norm_size = norm_info.norm_size();
+ bool is_in_map = norm_info.is_in_map();
+
+ const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0;
+ const BorderSize border_size = BorderSize(0, border_width);
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration =
+ is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2))
+ : num_elems_processed_per_iteration;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside
+ // the kernel, avoiding padding
+ AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLNormalizationLayerExKernel::CLNormalizationLayerExKernel()
+ : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
+{
+}
+
+BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; }
+
+void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                             NormalizationLayerInfo norm_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Output tensor auto initialization if not yet initialized
+  auto_init_if_empty(*output->info(), *input->info()->clone());
+
+  // Perform validation step
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
+
+  _input = input;
+  _output = output;
+
+  // Record layout state used by border_size(), run() and the kernel-name selection below.
+  // The original never assigned _is_in_map/_border_size, so the in-map kernel variant
+  // could never be selected and border_size() always reported 0.
+  _is_in_map = norm_info.is_in_map();
+  const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+  _border_size = BorderSize(0, border_width);
+
+  const unsigned int num_elems_processed_per_iteration = 4;
+  const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+
+  // Set build options
+  CLBuildOptions build_opts;
+  build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.add_option(
+      ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+  build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+  build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+  build_opts.add_option(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size())));
+  build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+  build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
+
+  // Create kernel
+  std::string kernel_name =
+      _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Set config_id for enabling LWS tuning
+  _config_id = "normalization_layer_";
+  _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(
+      static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(norm_info.norm_size());
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+
+ return Status{};
+}
+
+void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const int collapsed_dimension = _is_in_map ? Window::DimZ : 4;
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ } while (window_collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
new file mode 100644
index 000000000..241f8ae4d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) // Validates PReLU operands: supported types and broadcast-compatible shapes.
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+                                                       DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
+                                                       DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QASYMM8); // BUGFIX: QASYMM8 output was rejected although configure() builds a dedicated qasymm8 kernel path.
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+
+void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) // Builds the (optionally quantized) PReLU OpenCL kernel and configures its window.
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info()));
+
+  _input = input;
+  _alpha = alpha;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "prelu";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized_asymmetric(input->info()->data_type())) // QASYMM8 path: quantization params are baked into the kernel as build options.
+  {
+    build_opts.emplace("-DOFF_IN1=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+    build_opts.emplace("-DOFF_IN2=" +
+                       support::cpp11::to_string(alpha->info()->quantization_info().offset));
+    build_opts.emplace("-DOFF_OUT=" +
+                       support::cpp11::to_string(output->info()->quantization_info().offset));
+    build_opts.emplace("-DSCALE_IN1=" +
+                       support::cpp11::to_string(input->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" +
+                       support::cpp11::to_string(alpha->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_OUT=" +
+                       support::cpp11::to_string(output->info()->quantization_info().scale));
+    kernel_name += "_qasymm8";
+  }
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input->info()->data_type() == DataType::F32 ||
+             alpha->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
+
+  AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) || // NOTE(review): '||' short-circuits, skipping the later updates once one reports a change — matches the upstream ACL broadcast-kernel pattern; confirm intended.
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access); // NOTE(review): window_changed is computed but never checked.
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) // Enqueues PReLU over 3D slices, broadcasting input/alpha windows where a dimension is 1.
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input->info()->tensor_shape();
+  const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  bool can_collapse = true; // Dimensions above DimZ may only be collapsed when both inputs agree on them.
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]); // Any mismatch above DimZ forbids collapsing.
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; // Shapes must match the collapsed window for broadcasting below.
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice_input1);
+    add_3D_tensor_argument(idx, _alpha, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    collapsed.slide_window_slice_3D(slice_input1); // Advance input slices in lockstep with the output slice;
+    collapsed.slide_window_slice_3D(slice_input2); // return values intentionally ignored — the loop terminates on the output slice.
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLPReLUKernel::border_size() const // Right-hand border needed when one operand is broadcast along X (smaller dimension 0).
+{
+  const unsigned int replicateSize =
+      _output->info()->dimension(0) -
+      std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); // Amount of X-range not covered by the narrower operand.
+  const unsigned int border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); // Never wider than one vector iteration minus one.
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
new file mode 100644
index 000000000..99b54c822
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info, // Static checks for the pad kernel operands.
+                          const ITensorInfo *pad_size_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, // BUGFIX: RETURN variant — a Status-returning validator must not assert.
+                                                       DataType::QASYMM8, DataType::S16,
+                                                       DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, // BUGFIX: RETURN variant, as above.
+                                                       DataType::QASYMM8, DataType::S16,
+                                                       DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32); // BUGFIX: RETURN variant, as above.
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() == 0 || // BUGFIX: condition was inverted and rejected every valid 1..4-D input.
+                                      input_info->num_dimensions() > 4,
+                                  "Pad kernel supports upto 4-D input tensor");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG( // BUGFIX: was '==', erroring exactly when the ranks matched, contradicting the message.
+      input_info->num_dimensions() != output_info->num_dimensions(),
+      "output tensor should have same number of dimensions as input tensor");
+
+  if (input_info->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
+                                        output_info->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {}
+
+void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size) // Builds the "pad" OpenCL kernel with input geometry baked into build options.
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
+
+  _input = input;
+  _output = output;
+  _pad_size = pad_size;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3))); // Input batch,
+  build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0))); // width,
+  build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1))); // height,
+  build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2))); // and depth.
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    build_opts.emplace("-DZERO_VALUE=" + // For quantized tensors the padding value is the zero-point offset, not literal 0.
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); // The whole output is written; no padding region.
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) // Reads the leading pad amounts from the pad_size tensor, then enqueues the kernel.
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  _pad_size->map(queue); // Blocking map so the host can read pad values below.
+
+  // Padding values only for up, top, left and front are required based on the rank of tensor
+  int rank = _pad_size->info()->dimension(1); // NOTE(review): assumes pad_size is laid out as [2 x rank] with rows ordered batch/height/width/depth for rank 4 — confirm against the op spec.
+
+  auto pad_batch_up =
+      (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0;
+  auto pad_height_top =
+      (rank >= 2)
+          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1}))
+          : 0;
+  auto pad_width_left = (rank >= 1)
+                            ? *reinterpret_cast<const int32_t *>(
+                                  _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1}))
+                            : 0;
+  auto pad_depth_front =
+      (rank >= 3)
+          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3}))
+          : 0;
+
+  _pad_size->unmap(queue);
+
+  // Pad_values which needs to be passed
+  const cl_int4 paddingValues = {
+      {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
+       static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); // Input window dimensions are zeroed; the kernel computes its own input coordinates.
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    _kernel.setArg<cl_int4>(idx++, paddingValues);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
new file mode 100644
index 000000000..aa094761c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm) // Applies `perm` to the input shape. NOTE(review): validate_arguments uses compute_permutation_output_shape instead — presumably equivalent; confirm and unify.
+{
+  TensorShape output_shape = input->tensor_shape();
+  permute(output_shape, perm);
+  return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, // Checks supported types and, when configured, the output shape/type.
+                          const PermutationVector &perm)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+  const TensorShape output_shape =
+      misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+
+  // Validate configured output
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+  return Status{};
+}
+} // namespace
+
+CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {}
+
+void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output, // Builds the generic permute kernel; the permutation is baked into build options.
+                                  const PermutationVector &perm)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
+
+  _input = input;
+  _output = output;
+  _perm = perm;
+
+  const TensorShape output_shape = get_output_shape(input->info(), perm);
+  // Output auto initialization if not yet initialized
+  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+  // Create kernel
+  std::set<std::string> build_opts;
+
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+  // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector
+  build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0]));
+  build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1]));
+  build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2]));
+  build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3])); // NOTE(review): perm[3] is read even for shorter permutation vectors — confirm PermutationVector default-fills missing dimensions.
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input->info(), Steps());
+
+  // The CLPermute doesn't need padding so update_window_and_padding() can be skipped
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output, // Static validation entry point: checks only, no state is modified.
+                                   const PermutationVector &perm)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
+
+  return Status{};
+}
+
+void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue) // Iterates 4D input slices; the kernel scatters into the permuted output positions.
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice
+  Window slice_out(slice_in);
+  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); // Output window dimensions are zeroed; the kernel computes permuted coordinates itself.
+  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_out.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_in); // Work size follows the input slice.
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
index a3e0163de..b985aa737 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
@@ -17,20 +17,8 @@
#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
using namespace arm_compute;
@@ -45,12 +33,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16, DataType::F16,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
const TensorShape &out_shape =
@@ -58,21 +44,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
"Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
-
- if (is_data_type_fixed_point(input1->data_type()))
- {
- // All data types must be all QS8 or all QS16
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1,
- "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
- }
// Validate in case of configured output
if (output->total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
output->data_type() == DataType::U8 &&
@@ -81,11 +57,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
"Wrong shape for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
- if (is_data_type_fixed_point(input1->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- }
}
return Status{};
@@ -191,14 +162,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
{
compute_type = "int";
}
- else if (input1->info()->data_type() == DataType::QS8)
- {
- compute_type = "qs8";
- }
- else if (input1->info()->data_type() == DataType::QS16)
- {
- compute_type = "qs16";
- }
else
{
compute_type = "ushort";
@@ -218,11 +181,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
: "-DSATURATE");
build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
: "-DROUND=_rte");
- if (is_data_type_fixed_point(input1->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" +
- support::cpp11::to_string(input1->info()->fixed_point_position()));
- }
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
@@ -245,7 +203,7 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
_kernel.setArg(idx++, scale);
}
- ICLKernel::configure(win_config.second);
+ ICLKernel::configure_internal(win_config.second);
}
Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
deleted file mode 100644
index 168b246bf..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
-{
- // We can handle for simple case only
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
- "Output same type allowed for input and output");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
- "Only support for output dimension 1");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
- "Only support for input dimension 2");
- }
-
- return Status{};
-}
-
-} // namespace
-
-CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
-
-void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
-
- _input = input;
- _output = output;
- _axis = axis;
-
- // Configure kernel window
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- Window win;
- win.set(0, Window::Dimension(0, cols, 1));
- win.set(1, Window::Dimension(0, rows, 1));
-
- // Construct kernel name
- std::string kernel_name = "reduce_max";
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- ICLKernel::configure(win);
-}
-
-Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
-
- return Status{};
-}
-
-void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window window_input = window;
- Window slice_input = window_input.first_slice_window_1D();
-
- do
- {
- Window slice_output = slice_input.shift_dimensions(1);
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, slice_input);
- add_1D_tensor_argument(idx, _output, slice_output);
- enqueue(queue, *this, slice_input);
-
- } while (window_input.slide_window_slice_1D(slice_input));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..f581780e1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+namespace
+{
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+// are the same.
+TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) // Fix: dropped meaningless top-level const on the by-value return (it inhibits move semantics).
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(axis, 1); // Reducing along `axis` collapses that dimension to 1.
+
+  return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, // Validates reduce operands: types, axis range, and output size.
+                          ReduceOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32, DataType::S32);
+  if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+                                    "Not support QASYMM8, yet");
+  }
+  // BUGFIX: removed assert-style MISMATCHING_DATA_TYPES check here; the RETURN variant above already covers configured outputs.
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG( // BUGFIX: condition was inverted (it rejected every valid axis); axis is unsigned, so only the upper bound needs checking.
+      axis >= num_dimensions,
+      "axis must be greater than or equal to 0 and less than (input's rank).");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match axis");
+
+  return Status{};
+}
+} // namespace
+
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, // Selects and builds the min/max or sum/mean kernel for the requested reduction.
+                                        const uint32_t axis, ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); // Window is configured against the inferred (axis-collapsed) shape, not the stored one.
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Construct kernel name
+  std::string kernel_name;
+  int op_code = 0; // Dispatched inside the CL kernel via -DOP_CODE.
+  if (op == ReduceOperation::MAX)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 1;
+  }
+  else if (op == ReduceOperation::MIN)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 2;
+  }
+  else if (op == ReduceOperation::SUM)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 3;
+  }
+  else if (op == ReduceOperation::MEAN)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 4;
+  }
+  else
+    throw std::runtime_error("Operation not supported, yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); // Valid region uses the inferred shape while the real output keeps its own shape.
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, // Static validation entry point: checks only, no state is modified.
+                                         const uint32_t axis, ReduceOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // BUGFIX: was ARM_COMPUTE_ERROR_ON_NULLPTR — validate() must return a Status, not assert.
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+  return Status{};
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue) // Enqueues one reduction pass; temporarily swaps the output shape to the inferred one.
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  _kernel.setArg<cl_int>(idx++, _axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_axis]); // Number of elements being reduced.
+
+  // Support dimensions up to 4
+  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); // Input window dimensions are zeroed; the kernel walks the reduced axis itself.
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
+  // of input and output are the same
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); // NOTE(review): mutate-and-restore makes run() non-reentrant — confirm single-threaded dispatch.
+
+  idx = 0;
+  add_4D_tensor_argument(idx, _input, slice_in);
+  add_4D_tensor_argument(idx, _output, slice_out);
+  enqueue(queue, *this, slice_out);
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
deleted file mode 100644
index 84a77122d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions,
- "Reduction axis greater than max number of dimensions");
-
- std::vector<uint32_t>::const_iterator it;
- bool axis_w = false;
- bool axis_h = false;
- for (it = axis.begin(); it != axis.end(); ++it)
- {
- if ((*it) == 0)
- {
- axis_w = true;
- }
- else if ((*it) == 1)
- {
- axis_h = true;
- }
- else
- {
- ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
- }
- }
- // TODO Other axises (currently, only axises for both width and height are supported.)
- if (!axis_w || !axis_h)
- {
- ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
- }
-
- if (output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- // Output tensor auto initialization if not yet initialized
- TensorShape output_shape{input->tensor_shape()};
- output_shape.set(0, 1);
- output_shape.set(1, 1);
- auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(),
- input->fixed_point_position());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
- const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
-
- Window win = calculate_max_window(
- *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
- num_elems_processed_per_iteration_y);
- AccessWindowHorizontal output_access(output, 0, 1);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, output->valid_region());
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
-
- return std::make_tuple(err, win);
-}
-} // namespace
-
-CLReductionMeanKernel::CLReductionMeanKernel()
- : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size()
-{
-}
-
-BorderSize CLReductionMeanKernel::border_size() const { return _border_size; }
-
-void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis));
-
- _input = input;
- _output = output;
- _reduction_axis = axis;
-
- constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
-
- // Set border size
- _border_size = BorderSize(
- ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) -
- input->info()->dimension(0));
-
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- // build_opts.emplace(("-DVEC_SIZE=" +
- // support::cpp11::to_string(num_elems_processed_per_iteration)));
- if (is_data_type_fixed_point(input->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" +
- support::cpp11::to_string(input->info()->fixed_point_position()));
- }
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure(std::get<1>(win_config));
-}
-
-Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
- validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
-
- return Status{};
-}
-
-void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- // Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Set local sums buffer
- // TODO work_group
- unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
-
- unsigned int idx = 2 * num_arguments_per_2D_tensor();
- _kernel.setArg(idx++, local_sum_size, nullptr);
- _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height
- _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) *
- _input->info()->dimension(1))); // divider
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644
index 000000000..6b0697e89
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validate tensors for CLSpaceToBatchNDKernel.
+// Supported: 4-D input/output (NCHW or NHWC), S32 block-size tensor with two
+// elements and S32 padding-size tensor of shape {2, 2}.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+                          const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+                                  "The number of dimensions of input should be equal to output");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+                                  "The input and output layouts are different!");
+
+  // TODO Support other cases
+  if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+  {
+    // NCHW: depth lives in dimension 2
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial block's size was wrong");
+  }
+  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+  {
+    // NHWC: depth lives in dimension 0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial block's size was wrong");
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
+  }
+
+  // FIX: the original condition used '&&' ("< 2 && > 4"), which can never be
+  // true; '||' expresses the intended range check.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4,
+                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
+
+  if (input->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {}
+
+// Select and build the space_to_batch_4d CL kernel for the input's layout
+// (NCHW or NHWC), set the compile-time defines, and configure the kernel
+// window over the output tensor.
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+                                       const ICLTensor *padding_size, ICLTensor *output)
+{
+
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+  _input = input;
+  _block_size = block_size;
+  _padding_size = padding_size;
+  _output = output;
+
+  // Set kernel build options
+  // TODO Support other cases
+  std::string kernel_name = "space_to_batch_4d";
+  std::set<std::string> build_opts;
+  Window win;
+
+  if (input->info()->data_layout() == DataLayout::NCHW)
+  {
+    kernel_name += "_nchw";
+    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+    // NCHW path needs no padding: one work item per output element.
+    win = calculate_max_window(*output->info(), Steps());
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+  }
+  else if (input->info()->data_layout() == DataLayout::NHWC)
+  {
+    // NHWC path vectorizes along the channel dimension, so it may require
+    // tensor padding.
+    kernel_name += "_nhwc";
+    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DVEC_SIZE=" +
+                       support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    input_access.set_valid_region(win, output->info()->valid_region());
+
+    if (window_changed)
+    {
+      // NOTE(review): ARM_COMPUTE_CREATE_ERROR only builds a Status, which is
+      // discarded here — configure() will not actually fail on insufficient
+      // padding. Consider ARM_COMPUTE_ERROR_THROW_ON; confirm intent.
+      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported layout");
+  }
+
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+  // Padded regions are filled with the quantization zero point (QASYMM8) or 0.
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    build_opts.emplace("-DZERO_VALUE=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the kernel. In DEBUG builds, first map the block-size and
+// padding-size tensors and cross-check them against the input/output shapes.
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+#if defined(DEBUG)
+  const_cast<ICLTensor *>(_block_size)->map(queue);
+  const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+  const size_t num_dimensions = _input->info()->num_dimensions();
+  const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
+  int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+  for (size_t i = 0; i < num_spacial_dimensions; ++i)
+  {
+    const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+    const int32_t padding_size_pre =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+    const int32_t padding_size_post =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+    ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+    // FIX: was '&&', which only fired when BOTH paddings were negative; each
+    // padding value must individually be >= 0.
+    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 || padding_size_post < 0,
+                             "Padding size should be greater than or equal to 0");
+
+    if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(i) !=
+              (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
+          "Dimension value of spatial block does not match output's dimension value");
+    }
+    else
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
+              (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
+               padding_size_pre + padding_size_post) /
+                  block_size,
+          "Dimension value of spatial block does not match output's dimension value");
+    }
+
+    batch_size *= block_size;
+  }
+  ARM_COMPUTE_ERROR_ON_MSG(
+      _output->info()->dimension(num_dimensions - 1) != batch_size,
+      "Output batch size should be equal to input batch size * (multiplication of all block size)");
+
+  const_cast<ICLTensor *>(_block_size)->unmap(queue);
+  const_cast<ICLTensor *>(_padding_size)->unmap(queue);
+#endif // defined(DEBUG)
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice: all dimensions zeroed so the whole input tensor is
+  // bound; the kernel indexes it directly.
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Set block size window
+  Window win_block = calculate_max_window(*_block_size->info(), Steps());
+
+  // Set padding size window
+  Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    add_1D_tensor_argument(idx, _block_size, win_block);
+    add_2D_tensor_argument(idx, _padding_size, win_padding);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
new file mode 100644
index 000000000..5d6329edc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Validate tensors for CLSpaceToDepthKernel.
+// FIX: every check below was inverted in the original — each macro fired on
+// the *valid* configuration (e.g. 'block_size >= 1' rejected legal block
+// sizes). The conditions now error on the invalid case. The two data-type
+// checks also used the aborting ERROR_ON variants inside a Status-returning
+// function; they now use the RETURN_ERROR variants.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+                                  "Input batch should be equal to Output batch");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input->dimension(2) * block_size * block_size != output->dimension(2),
+      "Output depth should be equal to (input depth * block size *block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) != 0 ||
+                                      (input->dimension(1) % block_size) != 0,
+                                  "Input height and width should be divisible by block size");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) ||
+                                      (output->dimension(1) != (input->dimension(1) / block_size)),
+                                  "Output height and width should be equal to "
+                                  "input_height/blocksize and input_width/blocksize respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+
+// Build the space_to_depth CL kernel with the block size and input depth baked
+// in as compile-time defines, and configure a window over the *input* tensor
+// (one work item per input element).
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input->info(), Steps());
+
+  // The whole output is valid; no padding is required.
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the kernel over the input window; the output slice has all
+// dimensions zeroed so the whole output tensor is bound and the kernel
+// computes destination coordinates itself.
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice
+  Window slice_out(slice_in);
+  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_out.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_in);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
new file mode 100644
index 000000000..260bc39f1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validate tensors for CLSquaredDifferenceKernel: both inputs must be F16/F32
+// and broadcast-compatible; a pre-configured output must match the broadcast
+// shape and be F16/F32 as well.
+Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
+
+  // broadcast_shape() returns an empty shape when the inputs are incompatible.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+// Default-construct with all tensor pointers cleared; configure() assigns them.
+CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Build the squared_difference CL kernel, auto-initialize the output from the
+// broadcast shape of the inputs, and configure broadcast-aware windows.
+void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                          ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input1->info()->data_type() == DataType::F16 &&
+        input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  // FIX: window_changed was computed but never checked, silently ignoring an
+  // insufficient-padding condition. Report it the same way the other kernels
+  // in this change do.
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  ARM_COMPUTE_ERROR_THROW_ON(err);
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the kernel slice by slice. Dimensions above Z are collapsed when
+// both inputs agree on them; input windows are broadcast where an input has
+// extent 1 so the smaller tensor is re-read across the larger one.
+void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  // Collapsing is only safe when every dimension from Z upward matches
+  // between the two inputs (otherwise broadcasting needs the dimensions kept
+  // separate).
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // The input slices advance in lock-step with the output slice; their
+    // return values are intentionally ignored (the output slice drives the
+    // loop).
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Right-hand border needed to replicate the narrower input up to the output
+// width, capped at one vector's worth of elements.
+BorderSize CLSquaredDifferenceKernel::border_size() const
+{
+  const unsigned int narrow_width =
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int replicate = _output->info()->dimension(0) - narrow_width;
+  const unsigned int right =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicate);
+  return BorderSize(0, right, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
index 80ffd423a..48146a43a 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
@@ -14,43 +14,30 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include <string>
-
-using namespace std;
using namespace arm_compute;
-static const int32_t maxDim = 4;
-
-CLStridedSliceKernel::CLStridedSliceKernel()
+CLStridedSliceExKernel::CLStridedSliceExKernel()
: _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
_stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
{
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *begin, const ITensorInfo *end,
- const ITensorInfo *strides, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
+Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *begin, const ITensorInfo *end,
+ const ITensorInfo *strides, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16,
- DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
@@ -153,15 +140,6 @@ inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
return stop;
}
-inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
-{
- int32_t offset = b * shape[2] * shape[1] * shape[0];
- offset += d * shape[1] * shape[0];
- offset += h * shape[0];
- offset += w;
- return offset;
-}
-
inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
{
int32_t ret = 0;
@@ -177,10 +155,10 @@ inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
return ret;
}
-void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
- ICLTensor *beginData, ICLTensor *endData,
- ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask)
+void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output,
+ ICLTensor *beginData, ICLTensor *endData,
+ ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
endData->info(), stridesData->info(), beginMask, endMask,
@@ -195,48 +173,31 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
_endMask = endMask;
_shrinkAxisMask = shrinkAxisMask;
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DELEMENT_DATA_TYPE=" +
get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
// Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts));
- // Create output's window without padding
- TensorShape collapsed = output->info()->tensor_shape();
- collapsed.collapse(4);
- TensorInfo info = *output->info();
- info.set_tensor_shape(collapsed);
- Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration));
-
- ICLKernel::configure(win);
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICLKernel::configure_internal(win);
}
-void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- // Create input window
- TensorShape collapsed = _input->info()->tensor_shape();
- collapsed.collapse(4);
- TensorInfo info = *_input->info();
- info.set_tensor_shape(collapsed);
- Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size()));
-
_beginData->map(queue);
_endData->map(queue);
_stridesData->map(queue);
- std::vector<int32_t> dimsIn;
- std::vector<int32_t> dimsOut;
std::vector<int32_t> starts;
- std::vector<int32_t> stops;
std::vector<int32_t> strides;
for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
@@ -246,22 +207,13 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
- stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n],
- reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape,
- n));
-
strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
- dimsIn.emplace_back(shape[n]);
- dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n]));
}
for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
{
starts.emplace_back(0);
- stops.emplace_back(1);
strides.emplace_back(1);
- dimsIn.emplace_back(1);
- dimsOut.emplace_back(1);
}
// TODO: Apply shrinkAxisMask
@@ -269,20 +221,7 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
_stridesData->unmap(queue);
_endData->unmap(queue);
- // Set parameters
- unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- const cl_int4 dimsInArg = {{
- static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]),
- static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, dimsInArg);
-
- const cl_int4 dimsOutArg = {{
- static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]),
- static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, dimsOutArg);
-
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
const cl_int4 startsArg = {{
static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
@@ -295,10 +234,20 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
}};
_kernel.setArg<cl_int4>(idx++, stridesArg);
- // TODO: Apply slicing output's window
- idx = 0;
- add_1D_tensor_argument(idx, _input, win_in);
- add_1D_tensor_argument(idx, _output, window);
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice (dimensions zeroed so the same input window is broadcast to every output slice)
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
- enqueue(queue, *this, window);
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
index d95b485b7..073c2f7bb 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -17,15 +17,8 @@
#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <climits>
-#include <cassert>
namespace arm_compute
{
@@ -59,7 +52,7 @@ void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTens
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, 1, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
@@ -102,7 +95,7 @@ void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffe
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
@@ -147,7 +140,7 @@ void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -192,7 +185,7 @@ void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -236,7 +229,7 @@ void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buf
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -275,7 +268,7 @@ void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -322,7 +315,7 @@ void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
@@ -365,7 +358,7 @@ void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, in
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
@@ -404,7 +397,7 @@ void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
@@ -449,7 +442,7 @@ void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, k, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
diff --git a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp
new file mode 100644
index 000000000..3b5782c25
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEMath.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared,
+ const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *input_squared,
+ ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+ const unsigned int num_elems_read_per_iteration =
+ num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const unsigned int num_rows =
+ (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+ const unsigned int border_width =
+ (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+ BorderSize border_size = BorderSize(0, border_width);
+ bool window_changed = false;
+
+ // Configure window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration,
+ num_rows);
+ AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0,
+ num_elems_read_per_iteration, num_rows);
+
+ if (output->total_size() != 0)
+ {
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed =
+ update_window_and_padding(win, input_access, input_squared_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, input_access, input_squared_access);
+ }
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NENormalizationLayerExKernel::NENormalizationLayerExKernel()
+ : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr),
+ _norm_info(NormType::IN_MAP_1D), _border_size()
+{
+}
+
+BorderSize NENormalizationLayerExKernel::border_size() const { return _border_size; }
+
+void NENormalizationLayerExKernel::configure(const ITensor *input, const ITensor *input_squared,
+ ITensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), input_squared->info(), output->info(), norm_info));
+
+ const unsigned int border_width =
+ (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+
+ _input = input;
+ _input_squared = input_squared;
+ _output = output;
+ _norm_info = norm_info;
+ _border_size = BorderSize(0, border_width);
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ switch (norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 2, false>;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case DataType::F16:
+ {
+ switch (norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 2, false>;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), input_squared->info(),
+ output->info(), norm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+template <DataType dt, unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerExKernel::normalize_float(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator input_squared(_input_squared, window);
+ Iterator output(_output, window);
+
+ const int dim_y = 1;
+ const int radius = _norm_info.norm_size();
+ const int total_size = _input->info()->dimension(dim) - 1;
+ const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+ // We account padding across X only and we iterate over rows
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
+ if (dt == DataType::F32)
+ {
+ const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+ const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
+ const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float32x4_t accu = vdupq_n_f32(0.f);
+ for (int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr =
+ input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(
+ input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+ const float32x4_t normalized_pixel = vmulq_f32(
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if (dt == DataType::F16)
+ {
+ const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff());
+ const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
+ const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float16x8_t accu = vdupq_n_f16(0.f);
+ for (int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr =
+ input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(
+ input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ const float16x8_t norm_f16 =
+ vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
+ const float16x8_t normalized_pixel = vmulq_f16(
+ vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+Status NENormalizationLayerExKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ input_squared->clone().get(),
+ output->clone().get(), norm_info)
+ .first);
+
+ return Status{};
+}
+
+void NENormalizationLayerExKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ // Run function
+ (this->*_func)(window);
+}
diff --git a/libs/ARMComputeEx/src/core/UtilsEx.cpp b/libs/ARMComputeEx/src/core/UtilsEx.cpp
new file mode 100644
index 000000000..b63093bbb
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/UtilsEx.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/UtilsEx.h"
+
+#include <cstdint>
+#include <fstream>
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+const std::string &
+arm_compute::string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act)
+{
+ static std::map<ActivationLayerInfoEx::ActivationFunction, const std::string> act_map = {
+ {ActivationLayerInfoEx::ActivationFunction::RSQRT, "RSQRT"},
+ };
+
+ return act_map[act];
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp
new file mode 100644
index 000000000..1e52fc429
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLActivationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+using namespace arm_compute;
+
+void CLActivationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ ActivationLayerInfoEx act_info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerExKernel>();
+ k->configure(input, output, act_info);
+ _kernel = std::move(k);
+}
+
+Status CLActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info)
+{
+ return CLActivationLayerExKernel::validate(input, output, act_info);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp
new file mode 100644
index 000000000..dff743e89
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgMinMax.h"
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgMinMax::CLArgMinMax()
+ : _input(nullptr), _output(nullptr), _argminmax_axis(), _interm_tensors(), _argminmax_kernels(),
+ _num_of_kernels()
+{
+}
+
+void CLArgMinMax::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+ ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+ _input = input;
+ _output = output;
+ _argminmax_axis = axis;
+ _arg_op = op;
+ // NOTE The argminmax_axis must have no duplication.
+ _num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _argminmax_kernels =
+ arm_compute::support::cpp14::make_unique<CLArgMinMaxKernel[]>(_num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(_argminmax_axis[i], 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+ // Set a vector that is ordered ICLTensors sequentially.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ArgMinMax on all kernels
+ for (size_t i = 0; i < _num_of_kernels; i++)
+ {
+ _argminmax_kernels[i].configure(tensors[i], tensors[i + 1], _argminmax_axis[i], op);
+ }
+}
+
+Status CLArgMinMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis,
+ const ITensorInfo *output, ArgOperation op)
+{
+ const size_t num_of_kernels = argminmax_axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(argminmax_axis[i], 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+ // Set a vector that is ordered ITensorInfo sequentially.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate argminmax only on all kernels
+ for (size_t i = 0; i < num_of_kernels; i++)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxKernel::validate(tensors[i], tensors[i + 1], argminmax_axis[i], op));
+ }
+
+ return Status{};
+}
+
+void CLArgMinMax::run()
+{
+ for (size_t i = 0; i < _num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_argminmax_kernels[i]);
+ }
+}
+
+} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp
new file mode 100644
index 000000000..3f403c80a
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+using namespace arm_compute;
+
+void CLArithmeticSubtractionEx::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionExKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLArithmeticSubtractionEx::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLArithmeticSubtractionExKernel::validate(input1, input2, output, policy);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp
new file mode 100644
index 000000000..26e3798cc
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h"
+
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+using namespace arm_compute;
+
+void CLBatchToSpaceND::configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBatchToSpaceNDKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
index e1059ab53..8e106737c 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -17,7 +17,6 @@
#include "arm_compute/runtime/CL/functions/CLCast.h"
#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-#include "support/ToolchainSupport.h"
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp
new file mode 100644
index 000000000..f6a745a25
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparisonOp.h"
+
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLComparisonOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
new file mode 100644
index 000000000..c2e4ca9ff
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+using namespace arm_compute;
+
+void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..2781784ca
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp
new file mode 100644
index 000000000..411fa8700
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLExp.h"
+
+#include "arm_compute/core/CL/kernels/CLExpKernel.h"
+
+using namespace arm_compute;
+
+void CLExp::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLExpKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
index 5552cbc6f..fb056fe45 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
@@ -16,11 +16,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLGather.h"
-#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..7180e9356
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..be35ea732
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp
new file mode 100644
index 000000000..276c4557a
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayerEx::CLNormalizationLayerEx() : _norm_kernel(), _border_handler() {}
+
+void CLNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+ // Configure normalization kernel
+ _norm_kernel.configure(input, output, norm_info);
+
+ // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+Status CLNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ return CLNormalizationLayerExKernel::validate(input, output, norm_info);
+}
+
+void CLNormalizationLayerEx::run()
+{
+ // Run border handler
+ CLScheduler::get().enqueue(_border_handler, false);
+
+ // Run normalization kernel
+ CLScheduler::get().enqueue(_norm_kernel);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
new file mode 100644
index 000000000..38adedd10
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPReLU.h"
+
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
new file mode 100644
index 000000000..5265b6c34
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
@@ -0,0 +1,28 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+using namespace arm_compute;
+
+void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>();
+ k->configure(input, output, pad_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp
new file mode 100644
index 000000000..fb363270d
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPermuteEx.h"
+
+#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
+
+using namespace arm_compute;
+
+void CLPermuteEx::configure(const ICLTensor *input, ICLTensor *output,
+ const PermutationVector &perm)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPermuteExKernel>();
+ k->configure(input, output, perm);
+ _kernel = std::move(k);
+}
+
+Status CLPermuteEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteExKernel::validate(input, output, perm));
+ return Status{};
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
index e1add5e90..dc0baa8dd 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
@@ -18,9 +18,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
deleted file mode 100644
index 3382058db..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/ToolchainSupport.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#include <vector>
-#include <algorithm>
-
-#include <utility>
-
-#define REDUCE_MAX_RUN_ON_CPU 1
-
-namespace arm_compute
-{
-
-CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {}
-
-void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
-{
- _axis = axis;
-
- _input = input;
- _output = output;
-
- auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
- k->configure(input, axis, output);
- _kernel = std::move(k);
-
- // We can handle for simple case only
- // Output rank: 1
- // Axis: one axis value, restrict to 1
- ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
- ARM_COMPUTE_ERROR_ON(axis != 1);
-}
-
-Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
-{
- return CLReduceMaxKernel::validate(input, axis, output);
-}
-
-void CLReduceMax::run()
-{
-#if REDUCE_MAX_RUN_ON_CPU
- run_on_cpu();
-
- arm_compute::CLScheduler::get().sync();
-#else
- arm_compute::CLScheduler::get().enqueue(*_kernel);
-#endif
-}
-
-void CLReduceMax::run_on_cpu()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- _input->map(q);
- _output->map(q);
-
- // Compute by CPU for simple case
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
-
- float *input_data = (float *)_input->buffer();
- float *output_data = (float *)_output->buffer();
-
- std::vector<float> container_max;
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- container_max.resize(rows);
-
- // Initialize as 1st element in row
- float *input_pointer = input_data;
- for (int i = 0; i < rows; i++)
- {
- container_max[i] = *input_pointer;
- input_pointer += cols;
- }
-
- // Update max value in row
- for (int i = 0; i < rows; i++)
- {
- float max_in_row = container_max[i];
- for (int j = 1; j < cols; j++)
- {
- if (max_in_row < input_data[i * cols + j])
- {
- max_in_row = input_data[i * cols + j];
- }
- }
- container_max[i] = max_in_row;
- }
-
- for (int i = 0; i < rows; i++)
- {
- output_data[i] = container_max[i];
- }
-
- _input->unmap(q);
- _output->unmap(q);
-}
-} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..2b8d82706
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation()
+ : _input(nullptr), _output(nullptr), _axis(), _interm_tensors(), _reduce_kernels()
+{
+}
+
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, const ReduceOperation &op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+ // Set a vector that is ordered ITensorInfo sequentially.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate ReduceOperation only on all kernels
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ }
+
+ return Status{};
+}
+
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const std::set<uint32_t> &axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op));
+
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+
+ // NOTE The axis must have no duplication.
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels =
+ arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+ // Set a vector that is ordered ICLTensors sequentially.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ReduceOperation on all kernels
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+ }
+}
+
+void CLReduceOperation::run()
+{
+ const size_t num_of_kernels = _axis.size();
+ for (size_t i = 0; i < num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_reduce_kernels[i]);
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
deleted file mode 100644
index ab724e752..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {}
-
-Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis));
- return Status{};
-}
-
-void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
-{
- _reduction_mean_kernel.configure(input, output, axis);
- _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT,
- PixelValue(0));
-}
-
-void CLReductionMean::run()
-{
- CLScheduler::get().enqueue(_fill_border_kernel);
- CLScheduler::get().enqueue(_reduction_mean_kernel);
-}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>();
+ k->configure(input, block_size, padding_size, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
new file mode 100644
index 000000000..0f455f96f
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp
new file mode 100644
index 000000000..dc6e4af44
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h"
+
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLSquaredDifference::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSquaredDifferenceKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
deleted file mode 100644
index cd576cec1..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-#include <vector>
-
-using namespace arm_compute;
-
-static const int32_t maxDims = 4;
-
-// Return the index for the first element along that axis. This index will be a
-// positive integer between [0, axisSize - 1] that can be used to index
-// directly into the data.
-inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices,
- std::vector<int32_t> const &strides, const TensorShape &inputShape,
- int32_t axis)
-{
- // Begin with the specified index
- int32_t start = startIndices[axis];
-
- // beginMask override
- if (beginMask & 1 << axis)
- {
- if (strides[axis] > 0)
- {
- // Forward iteration - use the first element. These values will get
- // clamped below (Note: We could have set them to 0 and axisSize-1, but
- // use lowest() and max() to maintain symmetry with StopForAxis())
- start = std::numeric_limits<int32_t>::lowest();
- }
- else
- {
- // Backward iteration - use the last element.
- start = std::numeric_limits<int32_t>::max();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (start < 0)
- {
- start += axisSize;
- }
-
- // Clamping
- start = arm_compute::utility::clamp(start, 0, axisSize - 1);
-
- return start;
-}
-
-// Return the "real" index for the end of iteration along that axis. This is an
-// "end" in the traditional C sense, in that it points to one past the last
-// element. ie. So if you were iterating through all elements of a 1D array of
-// size 4, this function would return 4 as the stop, because it is one past the
-// "real" indices of 0, 1, 2 & 3.
-inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
- std::vector<int32_t> const &strides, const TensorShape &inputShape,
- int32_t axis)
-{
- // Begin with the specified index
- int32_t stop = stopIndices[axis];
-
- // endMask override
- if (endMask & (1 << axis))
- {
- if (strides[axis] > 0)
- {
- // Forward iteration - use the last element. These values will get
- // clamped below
- stop = std::numeric_limits<int32_t>::max();
- }
- else
- {
- // Backward iteration - use the first element.
- stop = std::numeric_limits<int32_t>::lowest();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (stop < 0)
- {
- stop += axisSize;
- }
-
- // Clamping
- // Because the end index points one past the last element, we need slightly
- // different clamping ranges depending on the direction.
- if (strides[axis] > 0)
- {
- // Forward iteration
- stop = arm_compute::utility::clamp(stop, 0, axisSize);
- }
- else
- {
- // Backward iteration
- stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
- }
-
- return stop;
-}
-
-inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
-{
- int32_t offset = b * shape[2] * shape[1] * shape[0];
- offset += d * shape[1] * shape[0];
- offset += h * shape[0];
- offset += w;
- return offset;
-}
-
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
- _kernel = std::move(k);
-}
-
-void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
-{
- ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(
- input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(),
- beginMask, endMask, shrinkAxisMask));
-
- _input = input;
- _output = output;
- _beginData = beginData;
- _endData = endData;
- _stridesData = stridesData;
- _beginMask = beginMask;
- _endMask = endMask;
- _shrinkAxisMask = shrinkAxisMask;
-}
-
-void CLStridedSliceCPU::run()
-{
- run_on_cpu();
-
- arm_compute::CLScheduler::get().sync();
-}
-
-inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
-{
- if (stride > 0)
- {
- return ((stop - start - 1) / stride) + 1;
- }
- else
- {
- return ((stop - start + 1) / stride) + 1;
- }
-}
-
-template <typename T>
-inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask,
- int32_t endMask, const std::vector<int32_t> &startIndices,
- const std::vector<int32_t> &stopIndices,
- const std::vector<int32_t> &strides, T *outputData)
-{
- ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims);
- ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims);
- ARM_COMPUTE_ERROR_ON(strides.size() != maxDims);
-
- const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
- const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
- const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
- const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
- const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
- const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
- const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
- const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
-
- // The shape of outputData may collapse in one-dimension.
- // Therefore, it is necessary to create a shape that matches the result of the outputData.
- TensorShape outputShape(
- getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
- getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
- for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b;
- in_b += strides[3], b++)
- {
- for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d;
- in_d += strides[2], d++)
- {
- for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h;
- in_h += strides[1], h++)
- {
- for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w;
- in_w += strides[0], w++)
- {
- outputData[offset4D(outputShape, b, d, h, w)] =
- inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
- }
- }
- }
- }
-}
-
-void CLStridedSliceCPU::run_on_cpu()
-{
- // TODO: Support shrinkAxisMask
- cl::CommandQueue q = CLScheduler::get().queue();
-
- _input->map(q);
- _output->map(q);
- _beginData->map(q);
- _endData->map(q);
- _stridesData->map(q);
-
- TensorShape inputShape = _input->info()->tensor_shape();
- TensorShape outputShape = _output->info()->tensor_shape();
-
- std::vector<int32_t> starts;
- std::vector<int32_t> stops;
- std::vector<int32_t> strides;
-
- for (uint32_t idx = 0; idx <= _input->info()->num_dimensions() - 1; ++idx)
- {
- starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]);
- stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]);
- strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]);
- }
-
- for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++)
- {
- starts.emplace_back(0);
- stops.emplace_back(1);
- strides.emplace_back(1);
- }
-
- switch (_input->info()->data_type())
- {
- case DataType::U8:
- case DataType::QASYMM8:
- StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint8_t *>(_output->buffer()));
- break;
- case DataType::S8:
- case DataType::QS8:
- StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
- break;
- case DataType::U16:
- StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint16_t *>(_output->buffer()));
- break;
- case DataType::S16:
- case DataType::QS16:
- StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<int16_t *>(_output->buffer()));
- break;
- case DataType::F16:
- // Not sure this works.
- StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
- break;
- case DataType::U32:
- StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint32_t *>(_output->buffer()));
- break;
- case DataType::S32:
- StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<int32_t *>(_output->buffer()));
- break;
- case DataType::F32:
- StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
- break;
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
-
- _input->unmap(q);
- _output->unmap(q);
- _beginData->unmap(q);
- _endData->unmap(q);
- _stridesData->unmap(q);
-}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp
new file mode 100644
index 000000000..be7353493
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h"
+
+#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
+
+using namespace arm_compute;
+
+void CLStridedSliceEx::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceExKernel>();
+ k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
index 6426364c9..19177497c 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -15,12 +15,9 @@
* limitations under the License.
*/
#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-
-#include <vector>
-#include <algorithm>
#include "../../topk_v2.h"
diff --git a/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp
new file mode 100644
index 000000000..988e92715
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NENormalizationLayerEx::NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(),
+ _border_handler(), _input_squared()
+{
+}
+
+void NENormalizationLayerEx::configure(const ITensor *input, ITensor *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ _input_squared.allocator()->init(tensor_info);
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_squared);
+
+ // Configure kernels
+ _norm_kernel.configure(input, &_input_squared, output, norm_info);
+ _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT,
+ PixelValue(0.0f));
+
+ // Allocate the tensor once the configure methods have been called
+ _input_squared.allocator()->allocate();
+}
+
+Status NENormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ // Perform validation step
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NENormalizationLayerExKernel::validate(input, input, output, norm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(
+ input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+ return Status{};
+}
+
+void NENormalizationLayerEx::run()
+{
+ _memory_group.acquire();
+
+ NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+ _memory_group.release();
+}
diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h
index a18ff0b0d..f94effea1 100644
--- a/libs/ARMComputeEx/src/runtime/topk_v2.h
+++ b/libs/ARMComputeEx/src/runtime/topk_v2.h
@@ -15,6 +15,12 @@
* limitations under the License.
*/
+/**
+ * @file topk_v2.h
+ * @brief This file contains TopK method and TopContainer class for TopK operation
+ * @ingroup COM_AI_RUNTIME
+ */
+
#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
@@ -26,34 +32,62 @@ namespace rt
{
namespace optimized_ops
{
-// The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file.
-// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than
-// TFLite.
-//(TFLite additionaly supports kTfLiteInt64.)
-
-// The class that collects top indexes of k values. Based on template
-// tensorflow::gtl::TopN<> but, for optimization,
-// it re-uses the same container.
+/**
+ * @brief class to define TopK operation
+ * @note The following codes are implemented and modified while referring to TFLite topk_v2.cc file.
+ *       TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than
+ *       TFLite.
+ *       (TFLite additionally supports kTfLiteInt64.)
+ *
+ * The class that collects top indexes of k values. Based on template
+ * tensorflow::gtl::TopN<> but, for optimization,
+ * it re-uses the same container.
+ */
template <typename T> class TopContainer
{
public:
+ /**
+   * @brief Prevent default constructor of this class
+ */
TopContainer() = delete;
+ /**
+ * @brief Constructor with params
+   * @param [in] k The top k predictions
+   * @param [in] row_size Size of row in data
+ */
TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
{
container_.reserve(std::min(k, row_size) + 1);
}
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ */
TopContainer(const TopContainer &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+  /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ * @return Reference of TopContainer
+ */
TopContainer &operator=(const TopContainer &) = delete;
+ /**
+ * @brief Start collecting
+ * @param [in] values To set as values
+ * @return N/A
+ */
void start_collecting(const T *values)
{
values_ = values;
container_.clear();
}
+ /**
+ * @brief Push a value to be compared for topk
+ * @param [in] a A value to compare
+ * @return N/A
+ */
void push(int32 a)
{
auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
@@ -74,6 +108,10 @@ public:
}
}
+ /**
+ * @brief Get sorted result from pushed values
+ * @return Reference of vector with sorted values
+ */
const std::vector<int32> &sorted_result()
{
auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
@@ -111,6 +149,16 @@ private:
}
};
+/**
+ * @brief Operates TopK operation with params
+ * @param [in] row_size Size of row in data
+ * @param [in] num_rows The number of rows in data
+ * @param [in] data To be operated in
+ * @param [in] k The top k predictions
+ * @param [out] output_indexes Indexes of targets in the top k predictions
+ * @param [out] output_values Values of targets in the top k predictions
+ * @return N/A
+ */
template <typename T>
void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
T *output_values)
diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt
index 687159725..99d2028f4 100644
--- a/libs/CMakeLists.txt
+++ b/libs/CMakeLists.txt
@@ -1,3 +1,4 @@
-add_subdirectory(util)
-add_subdirectory(support)
-add_subdirectory(ARMComputeEx)
+# Add all subdirectories.
+# Each library in sub-directory must have it's own CMakeLists.txt
+# to build library's binaries or to support interface.
+add_subdirectories()
diff --git a/libs/cpp14/CMakeLists.txt b/libs/cpp14/CMakeLists.txt
new file mode 100644
index 000000000..bba9e132d
--- /dev/null
+++ b/libs/cpp14/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(nnfw_lib_cpp14 INTERFACE)
+target_include_directories(nnfw_lib_cpp14 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
diff --git a/libs/cpp14/include/cpp14/memory.h b/libs/cpp14/include/cpp14/memory.h
new file mode 100644
index 000000000..b3e678baa
--- /dev/null
+++ b/libs/cpp14/include/cpp14/memory.h
@@ -0,0 +1,29 @@
+/**
+ * @file memory.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains @c make_unique which is not supported by C++11
+ */
+#ifndef __NNFW_CPP14_MEMORY_H__
+#define __NNFW_CPP14_MEMORY_H__
+
+#include <memory>
+
+namespace nnfw
+{
+namespace cpp14
+{
+/**
+ * @brief Provide @c make_unique function supported from C++14
+ * @param[in] args List of arguments with which an instance of T will be constructed.
+ * @return @c std::unique_ptr of an instance of type T
+ */
+template <typename T, typename... Args> std::unique_ptr<T> make_unique(Args &&... args)
+{
+ // NOTE std::make_unique is missing in C++11 standard
+ return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+} // namespace cpp14
+} // namespace nnfw
+
+#endif // __NNFW_CPP14_MEMORY_H__
diff --git a/libs/misc/CMakeLists.txt b/libs/misc/CMakeLists.txt
new file mode 100644
index 000000000..cd01695fb
--- /dev/null
+++ b/libs/misc/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Library `nnfw_lib_misc`
+set(NNFW_UTILITY_SRCS src/environment.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/Shape.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/NonIncreasingStride.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/IndexFormatter.cpp)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/Comparator.cpp)
+
+add_library(nnfw_lib_misc STATIC ${NNFW_UTILITY_SRCS})
+target_include_directories(nnfw_lib_misc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set_target_properties(nnfw_lib_misc PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+add_executable(nnfw_tensor_index_iterator "examples/tensor_index_iterator.cpp")
+target_link_libraries(nnfw_tensor_index_iterator nnfw_lib_misc)
diff --git a/libs/util/examples/tensor_index_iterator.cpp b/libs/misc/examples/tensor_index_iterator.cpp
index 284e04aa0..8a19dac87 100644
--- a/libs/util/examples/tensor_index_iterator.cpp
+++ b/libs/misc/examples/tensor_index_iterator.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "util/tensor/IndexIterator.h"
+#include "misc/tensor/IndexIterator.h"
#include <array>
@@ -25,14 +25,14 @@
void test_iterate(void)
{
- const nnfw::util::tensor::Shape shape{3, 4, 7};
+ const nnfw::misc::tensor::Shape shape{3, 4, 7};
std::array<int, 3 * 4 * 7> array;
array.fill(0);
- using nnfw::util::tensor::iterate;
- using nnfw::util::tensor::Index;
+ using nnfw::misc::tensor::iterate;
+ using nnfw::misc::tensor::Index;
iterate(shape) << [&](const Index &index) {
assert(index.rank() == shape.rank());
@@ -57,11 +57,11 @@ int main(int argc, char **argv)
{
test_iterate();
- nnfw::util::tensor::Shape shape{3, 4, 3, 4};
+ nnfw::misc::tensor::Shape shape{3, 4, 3, 4};
std::cout << "Iterate over tensor{3, 4, 3, 4}" << std::endl;
- nnfw::util::tensor::iterate(shape) << [](const nnfw::util::tensor::Index &index) {
+ nnfw::misc::tensor::iterate(shape) << [](const nnfw::misc::tensor::Index &index) {
std::cout << "rank: " << index.rank() << std::endl;
for (size_t d = 0; d < index.rank(); ++d)
diff --git a/libs/misc/include/misc/EnvVar.h b/libs/misc/include/misc/EnvVar.h
new file mode 100644
index 000000000..47206d4c0
--- /dev/null
+++ b/libs/misc/include/misc/EnvVar.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file EnvVar.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::EnvVar class
+ */
+
+#ifndef __NNFW_MISC_ENV_VAR__
+#define __NNFW_MISC_ENV_VAR__
+
+#include <algorithm>
+#include <array>
+#include <cstdlib>
+#include <string>
+
+namespace nnfw
+{
+namespace misc
+{
+/**
+ * @brief Class to access environment variable
+ */
+class EnvVar
+{
+public:
+ /**
+ * @brief Construct a new EnvVar object
+ * @param[in] key environment variable
+ */
+ EnvVar(const std::string &key)
+ {
+ const char *value = std::getenv(key.c_str());
+ if (value == nullptr)
+ {
+ // An empty string is considered as an empty value
+ _value = "";
+ }
+ else
+ {
+ _value = value;
+ }
+ }
+
+ /**
+ * @brief Get environment variable of string type
+ * @param[in] def Default value of environment variable
+   * @return Default value passed as a parameter when there is no environment variable,
+ * otherwise the value of environment variable passed into constructor
+ */
+ std::string asString(const std::string &def) const
+ {
+ if (_value.empty())
+ return def;
+ return _value;
+ }
+
+ /**
+ * @brief Get environment variable of boolean type
+ * @param[in] def Default value of environment variable
+   * @return Default value passed as a parameter when there is no environment variable,
+ * otherwise the value of environment variable passed into constructor
+ */
+ bool asBool(bool def) const
+ {
+ if (_value.empty())
+ return def;
+ static const std::array<std::string, 5> false_list{"0", "OFF", "FALSE", "N", "NO"};
+ auto false_found = std::find(false_list.begin(), false_list.end(), _value);
+ return (false_found == false_list.end());
+ }
+
+ /**
+ * @brief Get environment variable of int type
+ * @param[in] def Default value of environment variable
+   * @return Default value passed as a parameter when there is no environment variable,
+ * otherwise the value of environment variable passed into constructor
+ */
+ int asInt(int def) const
+ {
+ if (_value.empty())
+ return def;
+ return std::stoi(_value);
+ }
+
+private:
+ std::string _value;
+};
+
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_ENV_VAR__
diff --git a/libs/misc/include/misc/benchmark.h b/libs/misc/include/misc/benchmark.h
new file mode 100644
index 000000000..fe5b97585
--- /dev/null
+++ b/libs/misc/include/misc/benchmark.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file benchmark.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::benchmark::Accumulator class
+ */
+#ifndef __NNFW_MISC_BENCHMARK_H__
+#define __NNFW_MISC_BENCHMARK_H__
+
+#include <chrono>
+
+namespace nnfw
+{
+namespace misc
+{
+// Benchmark support
+namespace benchmark
+{
+
+/**
+ * @brief Class to accumulate time during benchmark
+ */
+template <typename T> class Accumulator
+{
+public:
+ /**
+ * @brief Construct a new Accumulator object
+ * @param[in] ref Object to keep time duration
+ */
+ Accumulator(T &ref) : _ref(ref)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Return the reference of @c ref passed to constructor
+ * @return Reference of @c ref
+ */
+ T &operator()(void) { return _ref; }
+
+private:
+ T &_ref;
+};
+
+/**
+ * @brief Run passed function and returns accumulated time
+ * @tparam T Period used by @c std::chrono::duration_cast
+ * @tparam Callable Function type to benchmark
+ * @param[in] acc Accumulated time after running @c cb
+ * @param[in] cb Function to run and benchmark
+ * @return Accumulated time
+ */
+template <typename T, typename Callable>
+Accumulator<T> &operator<<(Accumulator<T> &&acc, Callable cb)
+{
+ auto begin = std::chrono::steady_clock::now();
+ cb();
+ auto end = std::chrono::steady_clock::now();
+
+ acc() += std::chrono::duration_cast<T>(end - begin);
+
+ return acc;
+}
+
+template <typename T> Accumulator<T> measure(T &out) { return Accumulator<T>(out); }
+
+} // namespace benchmark
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_BENCHMARK_H__
diff --git a/libs/misc/include/misc/environment.h b/libs/misc/include/misc/environment.h
new file mode 100644
index 000000000..8e6bd00d5
--- /dev/null
+++ b/libs/misc/include/misc/environment.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file environment.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains utility functions and classes to access environment variables
+ */
+
+#ifndef __UTIL_ENVIRONMENT_H__
+#define __UTIL_ENVIRONMENT_H__
+
+namespace nnfw
+{
+namespace misc
+{
+
+/**
+ * @brief Get the environment variable of int type
+ * @param[in] name Name of the environment variable
+ * @param[in] defaultValue Default value when the value of environment variable does not exist
+ * @return The int value of the environment variable
+ */
+int get_env_int(const char *name, int defaultValue = 0);
+
+/**
+ * @brief Get the environment variable of bool type
+ * @param[in] name Name of the environment variable
+ * @param[in] defaultValue Default value when the value of environment variable does not exist
+ * @return @c 0 if the value of the environment variable is @c "0", @c 1 otherwise
+ */
+bool get_env_bool(const char *name, bool defaultValue = false);
+}
+}
+
+#include <string>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace env
+{
+/**
+ * @brief Parent struct of @ref IntAccessor and @ref FloatAccessor
+ * @tparam T Type of the value of environment variable
+ */
+template <typename T> struct Accessor
+{
+ /**
+ * @brief Destroy the Accessor object
+ */
+ virtual ~Accessor() = default;
+ /**
+ * @brief Read the value of environment variable
+ * @param[out] out The value of environment variable
+ * @return @c true if accessing environment variable is successful,
+   *         @c false if no such environment variable exists
+ */
+ virtual bool access(T &out) const = 0;
+};
+
+/**
+ * @brief Class to read int environment variable
+ */
+class IntAccessor : public Accessor<int>
+{
+public:
+ /**
+ * @brief Construct a new IntAccessor object
+ * @param[in] tag Name of environment variable
+ */
+ IntAccessor(const std::string &tag);
+
+public:
+ /**
+ * @brief Read the value of environment variable
+ * @param[out] out The value of environment variable
+ * @return @c true if accessing environment variable is successful,
+   *         @c false if no such environment variable exists
+ */
+ bool access(int &out) const override;
+
+private:
+ std::string _tag;
+};
+
+/**
+ * @brief Class to read float environment variable
+ */
+class FloatAccessor : public Accessor<float>
+{
+public:
+ /**
+ * @brief Construct a new FloatAccessor object
+ * @param[in] tag Name of environment variable
+ */
+ FloatAccessor(const std::string &tag);
+
+public:
+ /**
+ * @brief Read the value of environment variable
+ * @param[out] out The value of environment variable
+ * @return @c true if accessing environment variable is successful,
+   *         @c false if no such environment variable exists
+ */
+ bool access(float &out) const override;
+
+private:
+ std::string _tag;
+};
+
+} // namespace env
+} // namespace misc
+} // namespace nnfw
+
+#endif // __UTIL_ENVIRONMENT_H__
diff --git a/libs/misc/include/misc/feature/Index.h b/libs/misc/include/misc/feature/Index.h
new file mode 100644
index 000000000..a361d8dd2
--- /dev/null
+++ b/libs/misc/include/misc/feature/Index.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Index.h
+ * @brief This file contains Index class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_FEATURE_INDEX_H__
+#define __NNFW_MISC_FEATURE_INDEX_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace feature
+{
+
+/**
+ * @brief Class to have the index information for calculating the offset.
+ */
+class Index
+{
+public:
+ /**
+ * @brief Construct Index object using default constructor
+ */
+ Index() = default;
+
+public:
+ /**
+ * @brief Construct Index object with three indexes of dimensions
+ * @param[in] ch The depth index
+ * @param[in] row The height index
+ * @param[in] col The width index
+ */
+ Index(int32_t ch, int32_t row, int32_t col) : _batch{1}, _ch{ch}, _row{row}, _col{col}
+ {
+ // DO NOTHING
+ }
+ /**
+ * @brief Construct Index object with four indexes of dimensions
+ * @param[in] batch The batch index
+ * @param[in] ch The depth index
+ * @param[in] row The height index
+ * @param[in] col The width index
+ */
+ Index(int32_t batch, int32_t ch, int32_t row, int32_t col)
+ : _batch{batch}, _ch{ch}, _row{row}, _col{col}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get the batch index
+ * @return The batch index
+ */
+ int32_t batch(void) const { return _batch; }
+ /**
+ * @brief Get the depth index
+ * @return The depth index
+ */
+ int32_t ch(void) const { return _ch; }
+ /**
+ * @brief Get the height index
+ * @return The height index
+ */
+ int32_t row(void) const { return _row; }
+ /**
+ * @brief Get the width index
+ * @return The width index
+ */
+ int32_t col(void) const { return _col; }
+
+public:
+ /**
+ * @brief Get the batch index as the lvalue reference
+ * @return The reference of the batch value
+ */
+ int32_t &batch(void) { return _batch; }
+ /**
+ * @brief Get the depth index as the lvalue reference
+ * @return The reference of the depth value
+ */
+ int32_t &ch(void) { return _ch; }
+ /**
+ * @brief Get the height index as the lvalue reference
+ * @return The reference of the height value
+ */
+ int32_t &row(void) { return _row; }
+ /**
+ * @brief Get the width index as the lvalue reference
+ * @return The reference of the width value
+ */
+ int32_t &col(void) { return _col; }
+
+private:
+ /**
+ * @brief The batch index
+ */
+ int32_t _batch;
+ /**
+ * @brief The depth index
+ */
+ int32_t _ch;
+ /**
+ * @brief The height index
+ */
+ int32_t _row;
+ /**
+ * @brief The width index
+ */
+ int32_t _col;
+};
+
+} // namespace feature
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_FEATURE_INDEX_H__
diff --git a/libs/misc/include/misc/feature/IndexIterator.h b/libs/misc/include/misc/feature/IndexIterator.h
new file mode 100644
index 000000000..1cf675526
--- /dev/null
+++ b/libs/misc/include/misc/feature/IndexIterator.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file IndexIterator.h
+ * @brief This file contains IndexIterator class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_FEATURE_INDEX_ITERATOR_H__
+#define __NNFW_MISC_FEATURE_INDEX_ITERATOR_H__
+
+#include "misc/feature/Shape.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace feature
+{
+
+/**
+ * @brief Class to iterate Callable with Index of feature
+ */
+class IndexIterator
+{
+public:
+ /**
+ * @brief Construct IndexIterator object with Shape of feature
+ * @param[in] shape Shape reference of feature
+ */
+ IndexIterator(const Shape &shape) : _shape{shape}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Call a function iterated
+ * @param[in] cb A callback function
+ * @return Current IndexIterator object
+ */
+ template <typename Callable> IndexIterator &iter(Callable cb)
+ {
+ for (int32_t batch = 0; batch < _shape.N; ++batch)
+ {
+ for (int32_t ch = 0; ch < _shape.C; ++ch)
+ {
+ for (int32_t row = 0; row < _shape.H; ++row)
+ {
+ for (int32_t col = 0; col < _shape.W; ++col)
+ {
+ cb(batch, ch, row, col);
+ }
+ }
+ }
+ }
+
+ return (*this);
+ }
+
+private:
+ /**
+ * @brief Shape for feature
+ */
+ const Shape _shape;
+};
+
+/**
+ * @brief Create an object of IndexIterator for feature
+ * @param[in] shape Shape reference of feature
+ * @return Created IndexIterator object
+ */
+static inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; }
+
+/**
+ * @brief Call a function iterated using IndexIterator of feature
+ * Overloaded operator<<
+ * @param[in] it An IndexIterator reference
+ * @param[in] cb A callback function
+ * @return created IndexIterator object
+ */
+template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb)
+{
+ return it.iter(cb);
+}
+
+} // namespace feature
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_FEATURE_INDEX_ITERATOR_H__
diff --git a/libs/misc/include/misc/feature/Object.h b/libs/misc/include/misc/feature/Object.h
new file mode 100644
index 000000000..7af0e28f4
--- /dev/null
+++ b/libs/misc/include/misc/feature/Object.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Object.h
+ * @brief This file contains Object class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_FEATURE_OBJECT_H__
+#define __NNFW_MISC_FEATURE_OBJECT_H__
+
+#include "misc/feature/Shape.h"
+#include "misc/feature/Index.h"
+#include "misc/feature/Reader.h"
+
+#include <vector>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace feature
+{
+
+/**
+ * @brief Class to have information of the operand for feature
+ */
+template <typename T> class Object final : public Reader<T>
+{
+public:
+ using Generator = std::function<T(const Shape &shape, const Index &index)>;
+
+public:
+ /**
+ * @brief Construct Object object with Shape of feature and set value used by Generator
+ * @param[in] shape Reference of Shape for feature
+ * @param[in] fn A function to set values of operand tensor
+ */
+ Object(const Shape &shape, const Generator &fn) : _shape{shape}
+ {
+ _value.resize(_shape.C * _shape.H * _shape.W);
+
+ for (int32_t ch = 0; ch < _shape.C; ++ch)
+ {
+ for (int32_t row = 0; row < _shape.H; ++row)
+ {
+ for (int32_t col = 0; col < _shape.W; ++col)
+ {
+ _value.at(offsetOf(ch, row, col)) = fn(_shape, Index{ch, row, col});
+ }
+ }
+ }
+ }
+
+public:
+ /**
+ * @brief Get Shape of feature as the reference
+ * @return The reference of the width value
+ */
+ const Shape &shape(void) const { return _shape; }
+
+public:
+ /**
+ * @brief Get the value used by three indexes
+ * @param[in] ch The depth index
+ * @param[in] row The height index
+ * @param[in] col The width index
+ * @return The value at the offset
+ */
+ T at(uint32_t ch, uint32_t row, uint32_t col) const override
+ {
+ return _value.at(offsetOf(ch, row, col));
+ }
+
+private:
+ /**
+ * @brief Get the offset value at three indexes
+ * @param[in] ch The depth index
+ * @param[in] row The height index
+ * @param[in] col The width index
+ * @return The offset value
+ */
+ uint32_t offsetOf(uint32_t ch, uint32_t row, uint32_t col) const
+ {
+ return ch * _shape.H * _shape.W + row * _shape.W + col;
+ }
+
+private:
+ /**
+ * @brief Shape of operand
+ */
+ Shape _shape;
+ /**
+ * @brief The tensor vector of operand
+ */
+ std::vector<T> _value;
+};
+
+} // namespace feature
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_FEATURE_OBJECT_H__
diff --git a/libs/misc/include/misc/feature/Reader.h b/libs/misc/include/misc/feature/Reader.h
new file mode 100644
index 000000000..b09209789
--- /dev/null
+++ b/libs/misc/include/misc/feature/Reader.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Reader.h
+ * @brief This file contains Reader class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_FEATURE_READER_H__
+#define __NNFW_MISC_FEATURE_READER_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace feature
+{
+
+/**
+ * @brief Class reads values of feature
+ * The interface class
+ */
+template <typename T> struct Reader
+{
+ /**
+ * @brief Destruct Reader object using default destructor
+ */
+ virtual ~Reader() = default;
+
+ /**
+ * @brief Get the value used by three indexes
+ * @param[in] ch The depth index
+ * @param[in] row The height index
+ * @param[in] col The width index
+ * @return The value at the offset
+ */
+ virtual T at(uint32_t ch, uint32_t row, uint32_t col) const = 0;
+ /**
+ * @brief Get the value used by four indexes
+ * @param[in] batch The batch index
+ * @param[in] ch The depth index
+ * @param[in] row The height index
+ * @param[in] col The width index
+ * @return The value at the offset
+ */
+ virtual T at(uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) const = 0;
+};
+
+} // namespace feature
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_FEATURE_READER_H__
diff --git a/libs/misc/include/misc/feature/Shape.h b/libs/misc/include/misc/feature/Shape.h
new file mode 100644
index 000000000..09881f58b
--- /dev/null
+++ b/libs/misc/include/misc/feature/Shape.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Shape.h
+ * @brief This file contains Shape class for feature
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_FEATURE_SHAPE_H__
+#define __NNFW_MISC_FEATURE_SHAPE_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace feature
+{
+
+/**
+ * @brief Structure to have values of dimensions for feature
+ */
+struct Shape
+{
+ int32_t N; /**< The batch value */
+ int32_t C; /**< The depth value */
+ int32_t H; /**< The height value */
+ int32_t W; /**< The width value */
+
+ /**
+ * @brief Construct Shape object using default constructor
+ */
+ Shape() = default;
+ /**
+ * @brief Construct Shape object with three values of dimensions
+ * @param[in] depth The depth value
+ * @param[in] height The height value
+ * @param[in] width The width value
+ */
+ Shape(int32_t depth, int32_t height, int32_t width) : N{1}, C{depth}, H{height}, W{width}
+ {
+ // DO NOTHING
+ }
+ /**
+ * @brief Construct Shape object with four values of dimensions
+ * @param[in] batch The batch value
+ * @param[in] depth The depth value
+ * @param[in] height The height value
+ * @param[in] width The width value
+ */
+ Shape(int32_t batch, int32_t depth, int32_t height, int32_t width)
+ : N{batch}, C{depth}, H{height}, W{width}
+ {
+ // DO NOTHING
+ }
+};
+
+} // namespace feature
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_FEATURE_SHAPE_H__
diff --git a/libs/misc/include/misc/feature/TextFormatter.h b/libs/misc/include/misc/feature/TextFormatter.h
new file mode 100644
index 000000000..e053f1c61
--- /dev/null
+++ b/libs/misc/include/misc/feature/TextFormatter.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TextFormatter.h
+ * @brief This file contains TextFormatter class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_FEATURE_TEXT_FORMATTER_H__
+#define __NNFW_MISC_FEATURE_TEXT_FORMATTER_H__
+
+#include "misc/feature/Shape.h"
+#include "misc/feature/Reader.h"
+
+#include <ostream>
+#include <iomanip>
+#include <limits>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace feature
+{
+
+/**
+ * @brief Class to print operand of feature to ostream in the given string format
+ */
+template <typename T> class TextFormatter
+{
+public:
+ /**
+ * @brief Construct TextFormatter object with an operand's information.
+ * @param[in] shape The shape of an operand
+ * @param[in] data The data of an operand
+ */
+ TextFormatter(const Shape &shape, const Reader<T> &data) : _shape(shape), _data(data)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get Shape of feature as the lvalue reference
+ * @return Shape of feature
+ */
+ const Shape &shape(void) const { return _shape; }
+ /**
+ * @brief Get Reader<T> that can read the data of an operand
+ * @return Reader<T>
+ */
+ const Reader<T> &data(void) const { return _data; }
+
+private:
+ /**
+ * @brief Shape of feature
+ */
+ const Shape &_shape;
+ /**
+ * @brief Reader<T> that can read the data of an operand
+ */
+ const Reader<T> &_data;
+};
+
+/**
+ * @brief Print operand of feature
+ * @param[in] os Standard output stream
+ * @param[in] fmt TextFormatter to print information of an operand
+ * @return Standard output stream
+ */
+template <typename T> std::ostream &operator<<(std::ostream &os, const TextFormatter<T> &fmt)
+{
+ const auto &shape = fmt.shape();
+
+ for (uint32_t ch = 0; ch < shape.C; ++ch)
+ {
+ os << " Channel " << ch << ":" << std::endl;
+ for (uint32_t row = 0; row < shape.H; ++row)
+ {
+ os << " ";
+ for (uint32_t col = 0; col < shape.W; ++col)
+ {
+ const auto value = fmt.data().at(ch, row, col);
+ os << std::right;
+ os << std::fixed;
+ os << std::setw(std::numeric_limits<T>::digits10 + 2);
+ os << std::setprecision(5);
+ os << value;
+ os << " ";
+ }
+ os << std::endl;
+ }
+ }
+
+ return os;
+}
+
+} // namespace feature
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_FEATURE_TEXT_FORMATTER_H__
diff --git a/libs/misc/include/misc/fp32.h b/libs/misc/include/misc/fp32.h
new file mode 100644
index 000000000..c310402ba
--- /dev/null
+++ b/libs/misc/include/misc/fp32.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file fp32.h
+ * @brief This file contains functions to compare float values
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_FP32_H__
+#define __NNFW_MISC_FP32_H__
+
+#include <cmath>
+#include <cfloat>
+#include <algorithm>
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace fp32
+{
+
+/**
+ * @brief Get the difference between two float values as a relative value.
+ * @param[in] lhs A float value to be compared
+ * @param[in] rhs A float value to be compared
+ * @return A relative value of difference between two float values.
+ */
+inline float relative_diff(float lhs, float rhs)
+{
+ const auto diff = std::fabs(lhs - rhs);
+ const auto base = std::max(std::fabs(lhs), std::fabs(rhs));
+
+ return diff / base;
+}
+
+/**
+ * @brief Verify that an obtained float value is equal to the expected float value
+ * by using FLT_EPSILON
+ * @param[in] expected An expected float value to be compared
+ * @param[in] obtained An obtained float value to be compared
+ * @param[in] tolerance A tolerance value
+ * @return @c true if both values are equal, otherwise @c false
+ */
+inline bool epsilon_equal(float expected, float obtained, uint32_t tolerance = 1)
+{
+ if (std::isnan(expected) && std::isnan(obtained))
+ {
+ return true;
+ }
+
+ // Let's use relative epsilon comparison
+ const auto diff = std::fabs(expected - obtained);
+ const auto max = std::max(std::fabs(expected), std::fabs(obtained));
+
+ return diff <= (max * FLT_EPSILON * tolerance);
+}
+
+/**
+ * @brief Verify that an obtained float value is equal to the expected float value
+ * by comparing absolute tolerance value
+ * @param[in] expected An expected float value to be compared
+ * @param[in] obtained An obtained float value to be compared
+ * @param[in] tolerance A tolerance value
+ * @return @c true if both values are equal, otherwise @c false
+ */
+inline bool absolute_epsilon_equal(float expected, float obtained, float tolerance = 0.001)
+{
+ if (std::isnan(expected) && std::isnan(obtained))
+ {
+ return true;
+ }
+
+ // Let's use absolute epsilon comparison
+ const auto diff = std::fabs(expected - obtained);
+
+ return diff <= tolerance;
+}
+
+} // namespace fp32
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_FP32_H__
diff --git a/libs/misc/include/misc/kernel/IndexIterator.h b/libs/misc/include/misc/kernel/IndexIterator.h
new file mode 100644
index 000000000..59e0f0095
--- /dev/null
+++ b/libs/misc/include/misc/kernel/IndexIterator.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file IndexIterator.h
+ * @brief This file contains IndexIterator class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_KERNEL_INDEX_ITERATOR_H__
+#define __NNFW_MISC_KERNEL_INDEX_ITERATOR_H__
+
+#include "misc/kernel/Shape.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace kernel
+{
+
+/**
+ * @brief Class to iterate Callable with Index of kernel
+ */
+class IndexIterator
+{
+public:
+ /**
+ * @brief Construct IndexIterator object with Shape of kernel
+ * @param[in] shape Shape reference of feature
+ */
+ IndexIterator(const Shape &shape) : _shape{shape}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Call a function iterated
+ * @param[in] cb A callback function
+ * @return Current IndexIterator object
+ */
+ template <typename Callable> IndexIterator &iter(Callable cb)
+ {
+ for (int32_t nth = 0; nth < _shape.N; ++nth)
+ {
+ for (int32_t ch = 0; ch < _shape.C; ++ch)
+ {
+ for (int32_t row = 0; row < _shape.H; ++row)
+ {
+ for (int32_t col = 0; col < _shape.W; ++col)
+ {
+ cb(nth, ch, row, col);
+ }
+ }
+ }
+ }
+
+ return (*this);
+ }
+
+private:
+ const Shape _shape; /**< Shape for kernel */
+};
+
+/**
+ * @brief Create an object of IndexIterator for kernel
+ * @param[in] shape Shape reference of kernel
+ * @return Created IndexIterator object
+ */
+inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; }
+
+/**
+ * @brief Call a function iterated using IndexIterator of kernel
+ * Overloaded operator<<
+ * @param[in] it An IndexIterator reference
+ * @param[in] cb A callback function
+ * @return Created IndexIterator object
+ */
+template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb)
+{
+ return it.iter(cb);
+}
+
+} // namespace kernel
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_KERNEL_INDEX_ITERATOR_H__
diff --git a/libs/misc/include/misc/kernel/RandomObject.h b/libs/misc/include/misc/kernel/RandomObject.h
new file mode 100644
index 000000000..4b58b0c7f
--- /dev/null
+++ b/libs/misc/include/misc/kernel/RandomObject.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file RandomObject.h
+ * @brief This file contains RandomObject class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_KERNEL_RANDOM_OBJECT_H__
+#define __NNFW_MISC_KERNEL_RANDOM_OBJECT_H__
+
+#include "misc/kernel/Shape.h"
+#include "misc/kernel/Reader.h"
+
+#include <vector>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace kernel
+{
+
+template <typename T> class RandomObject final : public Reader<T>
+{
+public:
+ RandomObject(const Shape &shape) : _shape{shape}
+ {
+ const uint32_t size = _shape.N * _shape.C * _shape.H * _shape.W;
+
+ // TODO Use random number
+ for (uint32_t off = 0; off < size; ++off)
+ {
+ _value.emplace_back(static_cast<float>(off));
+ }
+ }
+
+public:
+ const Shape &shape(void) const { return _shape; }
+
+public:
+ T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const override
+ {
+ uint32_t index = 0;
+
+ index += nth * _shape.C * _shape.H * _shape.W;
+ index += ch * _shape.H * _shape.W;
+ index += row * _shape.W;
+ index += col;
+
+ return _value.at(index);
+ }
+
+private:
+ const Shape _shape;
+ std::vector<T> _value;
+};
+
+} // namespace kernel
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_KERNEL_RANDOM_OBJECT_H__
diff --git a/libs/misc/include/misc/kernel/Reader.h b/libs/misc/include/misc/kernel/Reader.h
new file mode 100644
index 000000000..019c809ee
--- /dev/null
+++ b/libs/misc/include/misc/kernel/Reader.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Reader.h
+ * @brief This file contains Reader structure
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_KERNEL_READER_H__
+#define __NNFW_MISC_KERNEL_READER_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace kernel
+{
+
+/**
+ * @brief Structure to Reader
+ */
+template <typename T> struct Reader
+{
+ /**
+ * @brief Destroy the Reader object as default
+ */
+ virtual ~Reader() = default;
+
+ /**
+ * @brief Get the value used by four indexes
+ * @param[in] nth The kernel index
+ * @param[in] ch The channel index
+ * @param[in] row The row index
+ * @param[in] col The column index
+ * @return The value at the offset
+ */
+ virtual T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const = 0;
+};
+
+} // namespace kernel
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_KERNEL_READER_H__
diff --git a/libs/misc/include/misc/kernel/Shape.h b/libs/misc/include/misc/kernel/Shape.h
new file mode 100644
index 000000000..27d6a8bf0
--- /dev/null
+++ b/libs/misc/include/misc/kernel/Shape.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Shape.h
+ * @brief This file contains Shape structure
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_KERNEL_SHAPE_H__
+#define __NNFW_MISC_KERNEL_SHAPE_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace kernel
+{
+
+/**
+ * @brief Structure to Shape
+ */
+struct Shape
+{
+ int32_t N; /**< The kernel index */
+ int32_t C; /**< The channel index */
+ int32_t H; /**< The height index */
+ int32_t W; /**< The width index */
+
+ /**
+ * @brief Construct a new Shape object as default
+ */
+ Shape() = default;
+
+ /**
+ * @brief Construct a new Shape object with parameters
+ * @param[in] count The kernel index
+ * @param[in] depth The channel index
+ * @param[in] height The height index
+ * @param[in] width The width index
+ */
+ Shape(int32_t count, int32_t depth, int32_t height, int32_t width)
+ : N{count}, C{depth}, H{height}, W{width}
+ {
+ // DO NOTHING
+ }
+};
+
+} // namespace kernel
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_KERNEL_SHAPE_H__
diff --git a/libs/misc/include/misc/matrix/IndexIterator.h b/libs/misc/include/misc/matrix/IndexIterator.h
new file mode 100644
index 000000000..742ed3a65
--- /dev/null
+++ b/libs/misc/include/misc/matrix/IndexIterator.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file IndexIterator.h
+ * @brief This file contains IndexIterator class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_MATRIX_INDEX_ITERATOR_H__
+#define __NNFW_MISC_MATRIX_INDEX_ITERATOR_H__
+
+#include "misc/matrix/Shape.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace matrix
+{
+
+/**
+ * @brief Class to iterate Callable with Index of matrix
+ */
+class IndexIterator
+{
+public:
+ /**
+ * @brief Construct IndexIterator object with Shape of matrix
+ * @param[in] shape Shape reference of matrix
+ */
+ IndexIterator(const Shape &shape) : _shape{shape}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Call a function iterated
+ * @param[in] cb A callback function
+ * @return Current IndexIterator object
+ */
+ template <typename Callable> IndexIterator &iter(Callable cb)
+ {
+ for (uint32_t row = 0; row < _shape.H; ++row)
+ {
+ for (uint32_t col = 0; col < _shape.W; ++col)
+ {
+ cb(row, col);
+ }
+ }
+
+ return (*this);
+ }
+
+private:
+ /**
+ * @brief Shape for matrix
+ */
+ const Shape _shape;
+};
+
+/**
+ * @brief Create an object of IndexIterator for matrix
+ * @param[in] shape Shape reference of matrix
+ * @return Created IndexIterator object
+ */
+inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; }
+
+/**
+ * @brief Call a function iterated using IndexIterator of matrix
+ * Overloaded operator<<
+ * @param[in] it An IndexIterator reference
+ * @param[in] cb A callback function
+ * @return created IndexIterator object
+ */
+template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb)
+{
+ return it.iter(cb);
+}
+
+} // namespace matrix
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_MATRIX_INDEX_ITERATOR_H__
diff --git a/libs/misc/include/misc/matrix/Reader.h b/libs/misc/include/misc/matrix/Reader.h
new file mode 100644
index 000000000..ea222c9d1
--- /dev/null
+++ b/libs/misc/include/misc/matrix/Reader.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Reader.h
+ * @brief This file contains Reader class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_MATRIX_READER_H__
+#define __NNFW_MISC_MATRIX_READER_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace matrix
+{
+
+/**
+ * @brief Class reads values of matrix
+ * The interface class
+ */
+template <typename T> struct Reader
+{
+ /**
+ * @brief Destruct Reader object using default destructor
+ */
+ virtual ~Reader() = default;
+
+ /**
+ * @brief Get the value used by two indexes
+ * @param[in] row The height index
+ * @param[in] col The width index
+ * @return The value at the offset
+ */
+ virtual T at(uint32_t row, uint32_t col) const = 0;
+};
+
+} // namespace matrix
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_MATRIX_READER_H__
diff --git a/libs/misc/include/misc/matrix/Shape.h b/libs/misc/include/misc/matrix/Shape.h
new file mode 100644
index 000000000..8cbcc1e12
--- /dev/null
+++ b/libs/misc/include/misc/matrix/Shape.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Shape.h
+ * @brief This file contains Shape class for matrix
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_MATRIX_SHAPE_H__
+#define __NNFW_MISC_MATRIX_SHAPE_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace matrix
+{
+
+/**
+ * @brief Structure to have values of dimensions for matrix
+ */
+struct Shape
+{
+ int32_t H; /**< The height value */
+ int32_t W; /**< The width value */
+
+ /**
+   * @brief Construct Shape object using default constructor
+ */
+ Shape() = default;
+
+ /**
+ * @brief Construct Shape object with two values of dimensions
+ * @param[in] height The height value
+ * @param[in] width The width value
+ */
+ Shape(int32_t height, int32_t width) : H{height}, W{width}
+ {
+ // DO NOTHING
+ }
+};
+
+} // namespace matrix
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_MATRIX_SHAPE_H__
diff --git a/libs/misc/include/misc/tensor/Comparator.h b/libs/misc/include/misc/tensor/Comparator.h
new file mode 100644
index 000000000..80f53043c
--- /dev/null
+++ b/libs/misc/include/misc/tensor/Comparator.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Comparator.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::Comparator class
+ */
+
+#ifndef __NNFW_MISC_TENSOR_COMPARATOR_H__
+#define __NNFW_MISC_TENSOR_COMPARATOR_H__
+
+#include "misc/tensor/Index.h"
+#include "misc/tensor/Shape.h"
+#include "misc/tensor/Reader.h"
+#include "misc/tensor/Diff.h"
+
+#include <functional>
+
+#include <vector>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Class to compare two tensors (expected and obtained to compare)
+ */
+class Comparator
+{
+public:
+ /**
+ * @brief Construct a new @c Comparator object
+ * @param[in] fn Function that compares two float values
+ */
+ Comparator(const std::function<bool(float lhs, float rhs)> &fn) : _compare_fn{fn}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Struct to observe comparison results
+ */
+ struct Observer
+ {
+ /**
+ * @brief Get notification of comparison result at every index of two tensors
+ * @param[in] index Index of tensors compared
+ * @param[in] expected Expected value of element at @c index
+ * @param[in] obtained Obtained value of element at @c index
+ * @return N/A
+ */
+ virtual void notify(const Index &index, float expected, float obtained) = 0;
+ };
+
+public:
+ /**
+ * @brief Compare two tensors
+ * @param[in] shape Shape of two tensors
+ * @param[in] expected @c Reader<float> object that accesses expected tensor
+ * @param[in] obtained @c Reader<float> object that accesses obtained tensor
+ * @param[in] observer @c Observer notified of expected value and obtained value at every index
+ * @return @c std::vector<Diff<float>> containing information of failed comparison
+ */
+ // NOTE Observer should live longer than comparator
+ std::vector<Diff<float>> compare(const Shape &shape, const Reader<float> &expected,
+ const Reader<float> &obtained,
+ Observer *observer = nullptr) const;
+
+private:
+ std::function<bool(float lhs, float rhs)> _compare_fn;
+};
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_COMPARATOR_H__
diff --git a/libs/misc/include/misc/tensor/Diff.h b/libs/misc/include/misc/tensor/Diff.h
new file mode 100644
index 000000000..c41a97987
--- /dev/null
+++ b/libs/misc/include/misc/tensor/Diff.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Diff.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::Diff struct
+ */
+
+#ifndef __NNFW_MISC_TENSOR_DIFF_H__
+#define __NNFW_MISC_TENSOR_DIFF_H__
+
+#include "misc/tensor/Index.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Struct to have information after comparing two elements of two tensors
+ */
+template <typename T> struct Diff
+{
+ Index index; /**< Index of elements in two tensors, which turn out to be different */
+
+ T expected; /**< Expected value of element of first tensor */
+ T obtained; /**< Obtained value of element of second tensor */
+
+ /**
+ * @brief Construct a new @c Diff object
+ * @param[in] i Initial value of index
+ */
+ Diff(const Index &i) : index(i)
+ {
+ // DO NOTHING
+ }
+
+ /**
+ * @brief Construct a new @c Diff object
+ * @param[in] i Index value
+ * @param[in] e Expected value of element of first tensor
+ * @param[in] o Obtained value of element of second tensor
+ */
+ Diff(const Index &i, const T &e, const T &o) : index(i), expected{e}, obtained{o}
+ {
+ // DO NOTHING
+ }
+};
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_DIFF_H__
diff --git a/libs/misc/include/misc/tensor/Index.h b/libs/misc/include/misc/tensor/Index.h
new file mode 100644
index 000000000..a08d7099e
--- /dev/null
+++ b/libs/misc/include/misc/tensor/Index.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Index.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::Index struct
+ */
+#ifndef __NNFW_MISC_TENSOR_INDEX_H__
+#define __NNFW_MISC_TENSOR_INDEX_H__
+
+#include <cstdint>
+#include <cstddef>
+
+#include <vector>
+#include <initializer_list>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Struct to represent index of each dimension of a tensor
+ */
+struct Index
+{
+public:
+ /**
+ * @brief Construct a new @c Index object
+ * @param[in] rank Rank of a tensor
+ */
+ Index(size_t rank) { _offsets.resize(rank); }
+
+public:
+ /**
+ * @brief Construct a new @c Index object
+ * @param[in] offsets Rank of a tensor of @c std::initializer_list<int32_t> type
+ */
+ Index(std::initializer_list<int32_t> offsets) : _offsets{offsets}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get the rank
+ * @return Rank that this @c Index object can handle
+ */
+ size_t rank(void) const { return _offsets.size(); }
+
+public:
+ /**
+ * @brief Get the index n'th dimension
+ * @param[in] n Dimension
+ * @return index of n'th dimension
+ */
+ int32_t at(size_t n) const { return _offsets.at(n); }
+
+ /**
+ * @brief Get the reference of the index n'th dimension
+ * @param[in] n Dimension
+ * @return reference of index of n'th dimension
+ */
+ int32_t &at(size_t n) { return _offsets.at(n); }
+
+private:
+ std::vector<int32_t> _offsets;
+};
+
+/**
+ * @brief Copy an @c Index with reversed order
+ * @param[in] origin @c Index object to copy
+ * @return an @c Index object with reversed order
+ * @note This is used to convert NNAPI tensor index to ARM tensor index or vice versa
+ */
+inline static Index copy_reverse(const Index &origin)
+{
+  size_t rank = origin.rank();
+  Index target(rank);
+  // Use size_t to match rank's type (avoids signed/unsigned comparison and
+  // potential overflow of int for very large ranks)
+  for (size_t i = 0; i < rank; i++)
+    target.at(i) = origin.at(rank - 1 - i);
+  return target;
+}
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_INDEX_H__
diff --git a/libs/misc/include/misc/tensor/IndexEnumerator.h b/libs/misc/include/misc/tensor/IndexEnumerator.h
new file mode 100644
index 000000000..4912ea289
--- /dev/null
+++ b/libs/misc/include/misc/tensor/IndexEnumerator.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file IndexEnumerator.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::IndexEnumerator class
+ */
+
+#ifndef __NNFW_MISC_TENSOR_INDEX_ENUMERATOR_H__
+#define __NNFW_MISC_TENSOR_INDEX_ENUMERATOR_H__
+
+#include "misc/tensor/Shape.h"
+#include "misc/tensor/Index.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+/**
+ * @brief Class to enumerate index of a tensor
+ *
+ */
+class IndexEnumerator
+{
+public:
+  /**
+   * @brief Construct a new @c IndexEnumerator object
+   * @param[in] shape Shape of tensor of which index will be enumerated
+   */
+  explicit IndexEnumerator(const Shape &shape) : _shape(shape), _cursor(0), _index(shape.rank())
+  {
+    const size_t rank = _shape.rank();
+
+    for (size_t axis = 0; axis < rank; ++axis)
+    {
+      _index.at(axis) = 0;
+    }
+
+    for (_cursor = 0; _cursor < rank; ++_cursor)
+    {
+      if (_index.at(_cursor) < _shape.dim(_cursor))
+      {
+        break;
+      }
+    }
+  }
+
+public:
+  /**
+   * @brief Deleted move constructor; @c IndexEnumerator objects cannot be moved
+   */
+  IndexEnumerator(IndexEnumerator &&) = delete;
+  /**
+   * @brief Prevent copy constructor
+   */
+  IndexEnumerator(const IndexEnumerator &) = delete;
+
+public:
+  /**
+   * @brief Check if more enumeration is available
+   * @return @c true if more @c advance() is available, otherwise @c false
+   */
+  bool valid(void) const { return _cursor < _shape.rank(); }
+
+public:
+  /**
+   * @brief Get the current index to enumerate
+   * @return Current index
+   */
+  const Index &curr(void) const { return _index; }
+
+public:
+  /**
+   * @brief Advance index by +1
+   */
+  void advance(void)
+  {
+    const size_t rank = _shape.rank();
+
+    // Find axis to be updated
+    while ((_cursor < rank) && !(_index.at(_cursor) + 1 < _shape.dim(_cursor)))
+    {
+      ++_cursor;
+    }
+
+    if (_cursor == rank)
+    {
+      return;
+    }
+
+    // Update index
+    _index.at(_cursor) += 1;
+
+    for (size_t axis = 0; axis < _cursor; ++axis)
+    {
+      _index.at(axis) = 0;
+    }
+
+    // Update cursor
+    _cursor = 0;
+  }
+
+public:
+  const Shape _shape; //!< Shape to enumerate
+
+private:
+  size_t _cursor;
+  Index _index;
+};
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_INDEX_ENUMERATOR_H__
diff --git a/libs/misc/include/misc/tensor/IndexFormatter.h b/libs/misc/include/misc/tensor/IndexFormatter.h
new file mode 100644
index 000000000..7ae34eec1
--- /dev/null
+++ b/libs/misc/include/misc/tensor/IndexFormatter.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file IndexFormatter.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::IndexFormatter class
+ */
+
+#ifndef __NNFW_MISC_TENSOR_INDEX_FORMATTER_H__
+#define __NNFW_MISC_TENSOR_INDEX_FORMATTER_H__
+
+#include "misc/tensor/Index.h"
+
+#include <ostream>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Class to send @c Index object to output stream
+ */
+class IndexFormatter
+{
+public:
+ /**
+ * @brief Construct a new @c IndexFormatter object
+ * @param[in] index index to be sent to output stream
+ */
+ IndexFormatter(const nnfw::misc::tensor::Index &index) : _index(index)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get an @c Index object
+ * @return @c Index object previously passed to the constructor
+ */
+ const nnfw::misc::tensor::Index &index(void) const { return _index; }
+
+private:
+ const nnfw::misc::tensor::Index &_index;
+};
+
+/**
+ * @brief Send @c IndexFormatter object to output stream
+ * @param[in] os Output stream
+ * @param[in] fmt @c IndexFormatter object that is sent to output stream
+ * @return Output stream
+ */
+std::ostream &operator<<(std::ostream &os, const IndexFormatter &fmt);
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_INDEX_FORMATTER_H__
diff --git a/libs/misc/include/misc/tensor/IndexIterator.h b/libs/misc/include/misc/tensor/IndexIterator.h
new file mode 100644
index 000000000..f6428e19e
--- /dev/null
+++ b/libs/misc/include/misc/tensor/IndexIterator.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file IndexIterator.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::IndexIterator class and
+ * helper function and operator
+ */
+#ifndef __NNFW_MISC_TENSOR_INDEX_ITERATOR_H__
+#define __NNFW_MISC_TENSOR_INDEX_ITERATOR_H__
+
+#include "misc/tensor/Shape.h"
+#include "misc/tensor/Index.h"
+#include "misc/tensor/IndexEnumerator.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Class to iterate indexes available for given shape
+ */
+class IndexIterator
+{
+public:
+ /**
+ * @brief Construct a new @c IndexIterator object
+ * @param[in] shape Shape of tensor of which index will be iterated
+ */
+ IndexIterator(const Shape &shape) : _shape(shape)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+   * @brief Construct a new IndexIterator object by moving an existing one
+   * @param[in] other @c IndexIterator object to move from
+ */
+ IndexIterator(IndexIterator &&) = default;
+
+ /**
+ * @brief Prevent copy constructor
+ */
+ IndexIterator(const IndexIterator &) = delete;
+
+public:
+ /**
+ * @brief Iterate all available indexes and run a function for each index
+ * @param[in] fn Function that requires an index as a parameter.
+ * @return @c IndexIterator object
+ */
+ template <typename Callable> IndexIterator &iter(Callable fn)
+ {
+ for (IndexEnumerator e{_shape}; e.valid(); e.advance())
+ {
+ fn(e.curr());
+ }
+
+ return (*this);
+ }
+
+private:
+ const Shape &_shape;
+};
+
+/**
+ * @brief Get an @c IndexIterator object
+ * @param[in] shape Shape of tensor of which index will be iterated
+ * @return @c IndexIterator object
+ */
+inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; }
+
+/**
+ * @brief Iterate all indexes and apply a function
+ * @param[in] it @c IndexIterator object that is constructed with a tensor shape
+ * @param[in] cb A function that will receive a specific index.
+ * Inside the function, the index is used to manipulate tensor element.
+ * @return @c IndexIterator object
+ */
+template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb)
+{
+ return it.iter(cb);
+}
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_INDEX_ITERATOR_H__
diff --git a/libs/misc/include/misc/tensor/NonIncreasingStride.h b/libs/misc/include/misc/tensor/NonIncreasingStride.h
new file mode 100644
index 000000000..e7ad0857b
--- /dev/null
+++ b/libs/misc/include/misc/tensor/NonIncreasingStride.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NonIncreasingStride.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::NonIncreasingStride class
+ */
+#ifndef __NNFW_MISC_TENSOR_NON_INCREASING_STRIDE_H__
+#define __NNFW_MISC_TENSOR_NON_INCREASING_STRIDE_H__
+
+#include "misc/tensor/Shape.h"
+#include "misc/tensor/Index.h"
+
+#include <vector>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Class to represent strides where stride[N-1] >= stride[N] holds for all N < rank
+ */
+class NonIncreasingStride
+{
+public:
+ /**
+ * @brief Initialize the stride data using @c Shape
+ * @param[in] shape to build stride info
+ * @return N/A
+ */
+ void init(const Shape &shape)
+ {
+ _stride.resize(shape.rank());
+ _stride.at(shape.rank() - 1) = 1;
+
+ for (uint32_t axis = shape.rank() - 1; axis > 0; --axis)
+ {
+ _stride.at(axis - 1) = _stride.at(axis) * shape.dim(axis);
+ }
+ }
+
+public:
+ /**
+ * @brief Get an stride value for specific axis
+ * @param[in] axis Axis of stride
+ * @return The value of stride
+ */
+ uint32_t at(uint32_t axis) const { return _stride.at(axis); }
+
+public:
+ /**
+ * @brief Get the 1-D offset of specified index for n-D tensor
+ * @param index @c Index object
+ * @return 1-D offset of index
+ */
+ uint32_t offset(const Index &index) const;
+
+private:
+ std::vector<uint32_t> _stride;
+};
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_NON_INCREASING_STRIDE_H__
diff --git a/libs/misc/include/misc/tensor/Object.h b/libs/misc/include/misc/tensor/Object.h
new file mode 100644
index 000000000..83fbc0bd1
--- /dev/null
+++ b/libs/misc/include/misc/tensor/Object.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Object.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::Object class
+ */
+
+#ifndef __NNFW_MISC_TENSOR_OBJECT_H__
+#define __NNFW_MISC_TENSOR_OBJECT_H__
+
+#include "misc/tensor/Shape.h"
+#include "misc/tensor/Index.h"
+#include "misc/tensor/IndexIterator.h"
+#include "misc/tensor/NonIncreasingStride.h"
+#include "misc/tensor/Reader.h"
+
+#include <vector>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Class to build a tensor using specific generator
+ * @tparam T Type of tensor element
+ */
+
+template <typename T> class Object final : public Reader<T>
+{
+public:
+ /**
+ * @brief Function to generate tensor element
+ */
+ using Generator = std::function<T(const Shape &shape, const Index &index)>;
+
+public:
+ /**
+ * @brief Construct a new @c Object object
+ * @param[in] shape Tensor shape
+ * @param[in] fn Function to generate tensor elements
+ */
+ Object(const Shape &shape, const Generator &fn) : _shape{shape}
+ {
+ // Set 'stride'
+ _stride.init(shape);
+
+ // Pre-allocate buffer
+ _values.resize(_shape.dim(0) * _stride.at(0));
+
+ // Set 'value'
+ iterate(_shape) <<
+ [this, &fn](const Index &index) { _values.at(_stride.offset(index)) = fn(_shape, index); };
+ }
+
+public:
+ /**
+ * @brief Get reference of shape
+ * @return Reference of shape
+ */
+ const Shape &shape(void) const { return _shape; }
+
+public:
+ /**
+   * @brief Get an element of tensor
+ * @param[in] index Index of a tensor element
+ * @return Value of tensor element
+ */
+ T at(const Index &index) const override { return _values.at(_stride.offset(index)); }
+
+private:
+ Shape _shape;
+ NonIncreasingStride _stride;
+
+private:
+ std::vector<T> _values;
+};
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_OBJECT_H__
diff --git a/libs/misc/include/misc/tensor/Reader.h b/libs/misc/include/misc/tensor/Reader.h
new file mode 100644
index 000000000..9175a913e
--- /dev/null
+++ b/libs/misc/include/misc/tensor/Reader.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Reader.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::Reader struct
+ */
+
+#ifndef __NNFW_MISC_TENSOR_READER_H__
+#define __NNFW_MISC_TENSOR_READER_H__
+
+#include "misc/tensor/Index.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Struct to read element of tensor
+ * @tparam T Type of elements in tensor
+ */
+template <typename T> struct Reader
+{
+ /**
+ * @brief Destroy the Reader object
+ */
+ virtual ~Reader() = default;
+
+ /**
+ * @brief Get an element of tensor
+ * @param[in] index Index specifying indexes of tensor element
+   * @return The value of specified element
+ */
+ virtual T at(const Index &index) const = 0;
+};
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_READER_H__
diff --git a/libs/misc/include/misc/tensor/Shape.h b/libs/misc/include/misc/tensor/Shape.h
new file mode 100644
index 000000000..6e6c23502
--- /dev/null
+++ b/libs/misc/include/misc/tensor/Shape.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Shape.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::Shape class
+ */
+
+#ifndef __NNFW_MISC_TENSOR_SHAPE_H__
+#define __NNFW_MISC_TENSOR_SHAPE_H__
+
+#include <cstdint>
+#include <cstddef>
+#include <deque>
+#include <initializer_list>
+#include <ostream>
+#include <string>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Class to represent shape of a tensor
+ */
+class Shape
+{
+public:
+ /**
+ * @brief Construct a new Shape object
+ * @param[in] rank Rank of a tensor
+ */
+ Shape(size_t rank) { _dimensions.resize(rank); }
+
+public:
+ /**
+ * @brief Construct a new Shape object
+ * @param[in] dimensions @c initializer_list<int32_t> of dimensions of tensor
+ */
+ Shape(const std::initializer_list<int32_t> &dimensions) : _dimensions{dimensions}
+ {
+ // DO NOTHING
+ }
+
+ /**
+ * @brief Construct a new Shape object
+ * @param[in] origin @c Shape object to copy
+ */
+ Shape(const Shape &origin) = default;
+
+public:
+ /**
+ * @brief Add dimension to the beginning
+ * @param[in] d dimension to add to the beginning
+ * @return N/A
+ */
+ void prepend(int32_t d) { _dimensions.emplace_front(d); }
+
+ /**
+ * @brief Add dimension to the back
+ * @param[in] d dimension to add to the back
+ * @return N/A
+ */
+ void append(int32_t d) { _dimensions.emplace_back(d); }
+
+public:
+ /**
+ * @brief Get the rank of this shape
+ * @return rank
+ */
+ size_t rank(void) const { return _dimensions.size(); }
+
+public:
+ /**
+ * @brief Get specific dimension
+ * @param[in] n Index of dimension
+ * @return n'th dimension
+ */
+ int32_t dim(size_t n) const { return _dimensions.at(n); }
+
+ /**
+ * @brief Get the reference of specific dimension
+ * @param[in] n Index of dimension
+ * @return Reference of n'th dimension
+ */
+ int32_t &dim(size_t n) { return _dimensions.at(n); }
+
+public:
+ /**
+ * @brief Get the number of elements specified by this shape
+ * @return The number of elements
+ */
+ size_t element_nums() const
+ {
+ size_t nums = 1;
+ for (auto d : _dimensions)
+ {
+ nums *= d;
+ }
+ return nums;
+ }
+
+private:
+ std::deque<int32_t> _dimensions;
+
+public:
+ /**
+ * @brief Get a @c Shape object after parsing string
+ * @param[in] s String of dimension list. Accepted format is numbers separated by comma.
+ * @return @c Shape object
+ */
+ static Shape from(const std::string &s);
+};
+
+/**
+ * @brief Check equality of two @c Shape
+ * @param[in] Shape First shape to compare
+ * @param[in] Shape Second shape to compare
+ * @return @c true if both shapes are equal, otherwise @c false
+ */
+bool operator==(const Shape &, const Shape &);
+
+/**
+ * @brief Send @c Shape to @c std::ostream
+ * @param[in] os @c std::ostream to process this @c Shape
+ * @param[in] shape @c Shape to send to @c ostream
+ * @return Reference of @c std::ostream
+ */
+std::ostream &operator<<(std::ostream &os, const Shape &shape);
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_SHAPE_H__
diff --git a/libs/misc/include/misc/tensor/Zipper.h b/libs/misc/include/misc/tensor/Zipper.h
new file mode 100644
index 000000000..8f0ec4ab6
--- /dev/null
+++ b/libs/misc/include/misc/tensor/Zipper.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Zipper.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains nnfw::misc::tensor::Zipper class
+ */
+
+#ifndef __NNFW_MISC_TENSOR_ZIPPER_H__
+#define __NNFW_MISC_TENSOR_ZIPPER_H__
+
+#include "misc/tensor/Index.h"
+#include "misc/tensor/IndexIterator.h"
+#include "misc/tensor/Reader.h"
+
+namespace nnfw
+{
+namespace misc
+{
+namespace tensor
+{
+
+/**
+ * @brief Class to apply a function with three params: @c Index, elements of a tensor
+ * at passed index read by @c Reader objects
+ */
+template <typename T> class Zipper
+{
+public:
+ /**
+ * @brief Construct a new @c Zipper object
+ * @param[in] shape Shape of @c lhs and @c rhs
+ * @param[in] lhs @c Reader object of a tensor
+ * @param[in] rhs @c Reader object of a tensor
+ */
+ Zipper(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs)
+ : _shape{shape}, _lhs{lhs}, _rhs{rhs}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Apply @c cb to all elements of tensors. Elements of two tensors
+ * at passed @c index are read by @c lhs and @c rhs
+ * @param[in] cb Function to apply
+ * @return N/A
+ */
+ template <typename Callable> void zip(Callable cb) const
+ {
+ iterate(_shape) <<
+ [this, &cb](const Index &index) { cb(index, _lhs.at(index), _rhs.at(index)); };
+ }
+
+private:
+ const Shape &_shape;
+ const Reader<T> &_lhs;
+ const Reader<T> &_rhs;
+};
+
+/**
+ * @brief Apply @c cb by using @c lhs and @c rhs passed to the constructor of @c zipper
+ * @param[in] zipper @c Zipper object
+ * @param[in] cb Function to apply using @c zip function
+ * @return @c zipper object after applying @c cb to @c zipper
+ */
+template <typename T, typename Callable>
+const Zipper<T> &operator<<(const Zipper<T> &zipper, Callable cb)
+{
+ zipper.zip(cb);
+ return zipper;
+}
+
+/**
+ * @brief Get @c Zipper object constructed using passed params
+ * @param shape Shape of @c lhs and @c rhs
+ * @param lhs @c Reader object of a tensor
+ * @param rhs @c Reader object of a tensor
+ * @return @c Zipper object
+ */
+template <typename T> Zipper<T> zip(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs)
+{
+ return Zipper<T>{shape, lhs, rhs};
+}
+
+} // namespace tensor
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_TENSOR_ZIPPER_H__
diff --git a/libs/misc/include/misc/vector.h b/libs/misc/include/misc/vector.h
new file mode 100644
index 000000000..395b08912
--- /dev/null
+++ b/libs/misc/include/misc/vector.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file vector.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains @c == operator to check equality of elements in two vectors
+ */
+#ifndef __NNFW_MISC_VECTOR_H__
+#define __NNFW_MISC_VECTOR_H__
+
+#include <vector>
+
+/**
+ * @brief Compare elements of two vectors
+ * @tparam T Type of elements in vectors
+ * @param[in] lhs First vector to compare
+ * @param[in] rhs Second vector to compare
+ * @return @c true if all elements are equal, otherwise @c false.
+ */
+template <typename T> bool operator==(const std::vector<T> &lhs, const std::vector<T> &rhs)
+{
+ if (lhs.size() != rhs.size())
+ {
+ return false;
+ }
+
+ for (size_t ind = 0; ind < lhs.size(); ++ind)
+ {
+ if (lhs.at(ind) != rhs.at(ind))
+ {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+#endif // __NNFW_MISC_VECTOR_H__
diff --git a/libs/misc/include/misc/vector/Object.h b/libs/misc/include/misc/vector/Object.h
new file mode 100644
index 000000000..65d4bc613
--- /dev/null
+++ b/libs/misc/include/misc/vector/Object.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Object.h
+ * @brief This file contains Object class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_VECTOR_OBJECT_H__
+#define __NNFW_MISC_VECTOR_OBJECT_H__
+
+#include "misc/vector/Reader.h"
+
+#include <vector>
+#include <functional>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace vector
+{
+
+/**
+ * @brief Class to have information of the operand for vector
+ */
+template <typename T> class Object final : public Reader<T>
+{
+public:
+ using Generator = std::function<T(int32_t size, int32_t offset)>;
+
+public:
+ /**
+ * @brief Construct Object object with size of vector and set value used by Generator
+ * @param[in] size The size of vector
+ * @param[in] gen A function to set values of operand tensor
+ */
+ Object(int32_t size, const Generator &gen) : _size{size}
+ {
+ _value.resize(_size);
+
+ for (int32_t offset = 0; offset < size; ++offset)
+ {
+ _value.at(offset) = gen(size, offset);
+ }
+ }
+
+public:
+ /**
+ * @brief Get size of vector
+ * @return Size of vector
+ */
+ int32_t size(void) const { return _size; }
+
+public:
+ /**
+ * @brief Get the value used by index
+ * @param[in] nth The vector index
+ * @return The value at the offset
+ */
+ T at(uint32_t nth) const override { return _value.at(nth); }
+
+private:
+ /**
+ * @brief Size of vector
+ */
+ const int32_t _size;
+ /**
+ * @brief The tensor vector of operand
+ */
+ std::vector<T> _value;
+};
+
+} // namespace vector
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_VECTOR_OBJECT_H__
diff --git a/libs/misc/include/misc/vector/Reader.h b/libs/misc/include/misc/vector/Reader.h
new file mode 100644
index 000000000..eab4c427b
--- /dev/null
+++ b/libs/misc/include/misc/vector/Reader.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Reader.h
+ * @brief This file contains Reader class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_MISC_VECTOR_READER_H__
+#define __NNFW_MISC_VECTOR_READER_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace misc
+{
+namespace vector
+{
+
+/**
+ * @brief Class reads values of vector
+ * The interface class
+ */
+template <typename T> struct Reader
+{
+ /**
+ * @brief Destruct Reader object using default destructor
+ */
+ virtual ~Reader() = default;
+
+ /**
+ * @brief Get the value used by the index
+ * @param[in] nth The vector index
+ * @return The value at the offset
+ */
+ virtual T at(uint32_t nth) const = 0;
+};
+
+} // namespace vector
+} // namespace misc
+} // namespace nnfw
+
+#endif // __NNFW_MISC_VECTOR_READER_H__
diff --git a/libs/util/src/environment.cpp b/libs/misc/src/environment.cpp
index 4b18b409f..e39f18d62 100644
--- a/libs/util/src/environment.cpp
+++ b/libs/misc/src/environment.cpp
@@ -18,11 +18,11 @@
#include <cstdlib>
#include <string>
-#include "util/environment.h"
+#include "misc/environment.h"
namespace nnfw
{
-namespace util
+namespace misc
{
int get_env_int(const char *name, int defaultValue)
@@ -44,12 +44,12 @@ bool get_env_bool(const char *name, bool defaultValue)
return defaultValue;
}
-} // namespace util
+} // namespace misc
} // namespace nnfw
namespace nnfw
{
-namespace util
+namespace misc
{
namespace env
{
@@ -91,5 +91,5 @@ bool FloatAccessor::access(float &out) const
}
} // namespace env
-} // namespace util
+} // namespace misc
} // namespace nnfw
diff --git a/libs/util/src/tensor/Comparator.cpp b/libs/misc/src/tensor/Comparator.cpp
index 89cd687e9..013c9eed2 100644
--- a/libs/util/src/tensor/Comparator.cpp
+++ b/libs/misc/src/tensor/Comparator.cpp
@@ -1,11 +1,11 @@
-#include "util/tensor/Comparator.h"
-#include "util/tensor/Zipper.h"
+#include "misc/tensor/Comparator.h"
+#include "misc/tensor/Zipper.h"
-#include "util/fp32.h"
+#include "misc/fp32.h"
namespace nnfw
{
-namespace util
+namespace misc
{
namespace tensor
{
@@ -18,7 +18,7 @@ std::vector<Diff<float>> Comparator::compare(const Shape &shape, const Reader<fl
zip(shape, expected, obtained) <<
[&](const Index &index, float expected_value, float obtained_value) {
- const auto relative_diff = nnfw::util::fp32::relative_diff(expected_value, obtained_value);
+ const auto relative_diff = nnfw::misc::fp32::relative_diff(expected_value, obtained_value);
if (!_compare_fn(expected_value, obtained_value))
{
@@ -36,5 +36,5 @@ std::vector<Diff<float>> Comparator::compare(const Shape &shape, const Reader<fl
}
} // namespace tensor
-} // namespace util
+} // namespace misc
} // namespace nnfw
diff --git a/libs/util/src/tensor/IndexFormatter.cpp b/libs/misc/src/tensor/IndexFormatter.cpp
index 66ff80771..c949db7a8 100644
--- a/libs/util/src/tensor/IndexFormatter.cpp
+++ b/libs/misc/src/tensor/IndexFormatter.cpp
@@ -14,13 +14,13 @@
* limitations under the License.
*/
-#include "util/tensor/IndexFormatter.h"
+#include "misc/tensor/IndexFormatter.h"
#include <cassert>
namespace nnfw
{
-namespace util
+namespace misc
{
namespace tensor
{
@@ -45,5 +45,5 @@ std::ostream &operator<<(std::ostream &os, const IndexFormatter &fmt)
}
} // namespace tensor
-} // namespace util
+} // namespace misc
} // namespace nnfw
diff --git a/libs/util/src/tensor/NonIncreasingStride.cpp b/libs/misc/src/tensor/NonIncreasingStride.cpp
index 3774ded83..c51ad0324 100644
--- a/libs/util/src/tensor/NonIncreasingStride.cpp
+++ b/libs/misc/src/tensor/NonIncreasingStride.cpp
@@ -14,13 +14,13 @@
* limitations under the License.
*/
-#include "util/tensor/NonIncreasingStride.h"
+#include "misc/tensor/NonIncreasingStride.h"
#include <cassert>
namespace nnfw
{
-namespace util
+namespace misc
{
namespace tensor
{
@@ -42,5 +42,5 @@ uint32_t NonIncreasingStride::offset(const Index &index) const
}
} // namespace tensor
-} // namespace util
+} // namespace misc
} // namespace nnfw
diff --git a/libs/util/src/tensor/Shape.cpp b/libs/misc/src/tensor/Shape.cpp
index f1de26fdc..675695e8e 100644
--- a/libs/util/src/tensor/Shape.cpp
+++ b/libs/misc/src/tensor/Shape.cpp
@@ -14,13 +14,13 @@
* limitations under the License.
*/
-#include "util/tensor/Shape.h"
+#include "misc/tensor/Shape.h"
#include <cassert>
namespace nnfw
{
-namespace util
+namespace misc
{
namespace tensor
{
@@ -95,5 +95,5 @@ std::ostream &operator<<(std::ostream &os, const Shape &shape)
}
} // namespace tensor
-} // namespace util
+} // namespace misc
} // namespace nnfw
diff --git a/libs/profiling/CMakeLists.txt b/libs/profiling/CMakeLists.txt
new file mode 100644
index 000000000..7169508a1
--- /dev/null
+++ b/libs/profiling/CMakeLists.txt
@@ -0,0 +1,5 @@
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+
+add_library(nnfw_lib_profiling STATIC ${SOURCES})
+set_property(TARGET nnfw_lib_profiling PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(nnfw_lib_profiling PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
diff --git a/libs/profiling/include/profiling/profile_buffer.h b/libs/profiling/include/profiling/profile_buffer.h
new file mode 100644
index 000000000..83cd3eb2b
--- /dev/null
+++ b/libs/profiling/include/profiling/profile_buffer.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This header is derived from the following file (in TensorFlow v1.12)
+// 'externals/tensorflow/tensorflow/contrib/lite/profiling/profile_buffer.h
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "profiling/time.h"
+
+namespace tflite {
+namespace profiling {
+
+// A profiling event.
+struct ProfileEvent {
+ // Describes the type of event.
+ // The event_metadata field may contain additional data for interpreting
+ // the event.
+ enum class EventType {
+ // Default event type, the metadata field has no special significance.
+ DEFAULT = 0,
+ // The event is an operator invocation and the event_metadata field is the
+ // index of operator node.
+ OPERATOR_INVOKE_EVENT = 1
+ };
+
+ // Label of the event. This usually describes the event.
+ const char* tag;
+ // Timestamp in microseconds when the event began.
+ uint64_t begin_timestamp_us;
+ // Timestamp in microseconds when the event ended.
+ uint64_t end_timestamp_us;
+ // The field containing the type of event. This must be one of the event types
+ // in EventType.
+ EventType event_type;
+ // Extra data describing the details of the event.
+ uint32_t event_metadata;
+};
+} // namespace profiling
+} // namespace tflite
+
+#ifdef TFLITE_PROFILING_ENABLED
+
+#include <sys/time.h>
+#include <vector>
+
+namespace tflite {
+namespace profiling {
+constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1;
+
+// A ring buffer of profile events.
+// This class is not thread safe.
+class ProfileBuffer {
+ public:
+ ProfileBuffer(uint32_t max_num_entries, bool enabled)
+ : enabled_(enabled), current_index_(0), event_buffer_(max_num_entries) {}
+
+ // Adds an event to the buffer with begin timestamp set to the current
+ // timestamp. Returns a handle to event that can be used to call EndEvent. If
+ // buffer is disabled this has no affect.
+ // The tag of the event should remain valid till the buffer is valid.
+ uint32_t BeginEvent(const char* tag, ProfileEvent::EventType event_type,
+ uint32_t event_metadata) {
+ if (!enabled_) {
+ return kInvalidEventHandle;
+ }
+ uint64_t timestamp = time::NowMicros();
+ int index = current_index_ % event_buffer_.size();
+ event_buffer_[index].tag = tag;
+ event_buffer_[index].event_type = event_type;
+ event_buffer_[index].event_metadata = event_metadata;
+ event_buffer_[index].begin_timestamp_us = timestamp;
+ event_buffer_[index].end_timestamp_us = 0;
+ current_index_++;
+ return index;
+ }
+
+ // Sets the enabled state of buffer to |enabled|
+ void SetEnabled(bool enabled) { enabled_ = enabled; }
+
+ // Sets the end timestamp for event for the handle to current time.
+ // If the buffer is disabled or previous event has been overwritten this
+ // operation has not effect.
+ void EndEvent(uint32_t event_handle) {
+ if (!enabled_ || event_handle == kInvalidEventHandle ||
+ event_handle > current_index_) {
+ return;
+ }
+ const uint32_t max_size = event_buffer_.size();
+ if (current_index_ > (max_size + event_handle)) {
+ // Ignore, buffer has already overflowed.
+ return;
+ }
+
+ int event_index = event_handle % max_size;
+ event_buffer_[event_index].end_timestamp_us = time::NowMicros();
+ }
+
+ // Returns the size of the buffer.
+ size_t Size() const {
+ return (current_index_ >= event_buffer_.size()) ? event_buffer_.size()
+ : current_index_;
+ }
+
+ // Resets the buffer.
+ void Reset() {
+ enabled_ = false;
+ current_index_ = 0;
+ }
+
+ // Returns the profile event at the given index. If the index is invalid a
+ // nullptr is returned. The return event may get overwritten if more events
+ // are added to buffer.
+ const struct ProfileEvent* const At(int index) const {
+ size_t size = Size();
+ if (index >= size) {
+ return nullptr;
+ }
+ const uint32_t max_size = event_buffer_.size();
+ uint32_t start =
+ (current_index_ > max_size) ? current_index_ % max_size : max_size;
+ index = (index + start) % max_size;
+ return &event_buffer_[index];
+ }
+
+ private:
+ bool enabled_;
+ uint32_t current_index_;
+ std::vector<ProfileEvent> event_buffer_;
+};
+} // namespace profiling
+} // namespace tflite
+#endif // TFLITE_PROFILING_ENABLED
+#endif // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+
+// clang-format on
diff --git a/libs/profiling/include/profiling/profiler.h b/libs/profiling/include/profiling/profiler.h
new file mode 100644
index 000000000..953042da3
--- /dev/null
+++ b/libs/profiling/include/profiling/profiler.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This header is derived from the following file (in TensorFlow v1.12)
+// 'externals/tensorflow/tensorflow/contrib/lite/profiling/profiler.h
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+
+#include <vector>
+
+#include "profiling/profile_buffer.h"
+
+#ifdef TFLITE_PROFILING_ENABLED
+
+namespace tflite {
+namespace profiling {
+class ScopedProfile;
+class ScopedOperatorProfile;
+
+// Controls whether profiling is enabled or disabled and collects profiles.
+// TFLite is used on platforms that don't have posix threads, so the profiler is
+// kept as simple as possible. It is designed to be used only on a single
+// thread.
+//
+// Profiles are collected using Scoped*Profile objects that begin and end a
+// profile event.
+// An example usage is shown in the example below:
+//
+// Say Worker class has a DoWork method and we are interested in profiling
+// the overall execution time for DoWork and time spent in Task1 and Task2
+// functions.
+//
+// class Worker {
+// public:
+// void DoWork() {
+// ScopedProfile(&controller, "DoWork");
+// Task1();
+// Task2();
+// .....
+// }
+//
+// void Task1() {
+// ScopedProfile(&controller, "Task1");
+// ....
+// }
+//
+// void Task2() {
+// ScopedProfile(&controller, "Task2");
+// }
+//
+// Profiler profiler;
+// }
+//
+// We instrument the functions that need to be profiled.
+//
+// Profile can be collected by enable profiling and then getting profile
+// events.
+//
+// void ProfileWorker() {
+// Worker worker;
+// worker.profiler.EnableProfiling();
+// worker.DoWork();
+// worker.profiler.DisableProfiling();
+// // Profiling is complete, extract profiles.
+// auto profile_events = worker.profiler.GetProfiles();
+// }
+//
+//
+class Profiler {
+ public:
+ Profiler() : buffer_(1024, false) {}
+
+ void StartProfiling() { buffer_.SetEnabled(true); }
+ void StopProfiling() { buffer_.SetEnabled(false); }
+ void Reset() { buffer_.Reset(); }
+ std::vector<const ProfileEvent*> GetProfileEvents() {
+ std::vector<const ProfileEvent*> profile_events;
+ profile_events.reserve(buffer_.Size());
+ for (size_t i = 0; i < buffer_.Size(); i++) {
+ profile_events.push_back(buffer_.At(i));
+ }
+ return profile_events;
+ }
+
+ private:
+ friend class ScopedProfile;
+ friend class ScopedOperatorProfile;
+ ProfileBuffer* GetProfileBuffer() { return &buffer_; }
+ ProfileBuffer buffer_;
+};
+
+class ScopedProfile {
+ public:
+ // Adds a profile event to profile that begins with the construction
+ // of object and ends when the object goes out of scope.
+ // The lifetime of tag should be at least the lifetime of profiler.
+
+ ScopedProfile(Profiler* profiler, const char* tag)
+ : buffer_(nullptr), event_handle_(0) {
+ if (profiler) {
+ buffer_ = profiler->GetProfileBuffer();
+ event_handle_ =
+ buffer_->BeginEvent(tag, ProfileEvent::EventType::DEFAULT, 0);
+ }
+ }
+ ~ScopedProfile() {
+ if (buffer_) {
+ buffer_->EndEvent(event_handle_);
+ }
+ }
+
+ private:
+ ProfileBuffer* buffer_;
+ int32_t event_handle_;
+};
+
+class ScopedOperatorProfile {
+ public:
+ // Adds a profile event to profile that begins with the construction
+ // of object and ends when the object goes out of scope.
+ // The lifetime of tag should be at least the lifetime of profiler.
+ ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index)
+ : buffer_(nullptr), event_handle_(0) {
+ if (profiler) {
+ buffer_ = profiler->GetProfileBuffer();
+ event_handle_ = buffer_->BeginEvent(
+ tag, ProfileEvent::EventType::OPERATOR_INVOKE_EVENT, node_index);
+ }
+ }
+
+ ~ScopedOperatorProfile() {
+ if (buffer_) {
+ buffer_->EndEvent(event_handle_);
+ }
+ }
+
+ private:
+ ProfileBuffer* buffer_;
+ int32_t event_handle_;
+};
+
+} // namespace profiling
+} // namespace tflite
+
+#define VARNAME_UNIQ(name, ctr) name##ctr
+
+#define SCOPED_OPERATOR_PROFILE(profiler, node_index) \
+ tflite::profiling::ScopedOperatorProfile VARNAME_UNIQ( \
+ _profile_, __COUNTER__)((profiler), "OpInvoke", (node_index))
+#else
+
+namespace tflite {
+namespace profiling {
+// A noop version of profiler when profiling is disabled.
+class Profiler {
+ public:
+ Profiler() {}
+ void StartProfiling() {}
+ void StopProfiling() {}
+ void Reset() {}
+ std::vector<const ProfileEvent*> GetProfileEvents() { return {}; }
+};
+} // namespace profiling
+} // namespace tflite
+
+#define SCOPED_OPERATOR_PROFILE(profiler, node_index)
+
+#endif // TFLITE_PROFILING_ENABLED
+
+#endif // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+
+// clang-format on
diff --git a/libs/profiling/include/profiling/profiling.h b/libs/profiling/include/profiling/profiling.h
new file mode 100644
index 000000000..ee0df1338
--- /dev/null
+++ b/libs/profiling/include/profiling/profiling.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_MISC_PROFILING_H__
+#define __NNFW_MISC_PROFILING_H__
+
+#include <iostream>
+
+namespace tflite
+{
+namespace profiling
+{
+class Profiler; // forward declaration
+}
+}
+
+namespace profiling
+{
+
+class Context
+{
+public:
+ Context() : _sync(false), _profiler(nullptr) {}
+
+public:
+ const bool &sync(void) const { return _sync; }
+ tflite::profiling::Profiler *getProfiler() { return _profiler; }
+ void setProfiler(tflite::profiling::Profiler *p) { _profiler = p; }
+ void setSync(void) { _sync = true; }
+
+private:
+ bool _sync;
+ tflite::profiling::Profiler *_profiler;
+
+public:
+ static Context &get(void)
+ {
+ static Context ctx{};
+ return ctx;
+ }
+};
+
+} // namespace profiling
+#endif // __NNFW_MISC_PROFILING_H__
diff --git a/libs/profiling/include/profiling/time.h b/libs/profiling/include/profiling/time.h
new file mode 100644
index 000000000..4b194944d
--- /dev/null
+++ b/libs/profiling/include/profiling/time.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This header is derived from the following file (in TensorFlow v1.12)
+// 'externals/tensorflow/tensorflow/contrib/lite/profiling/time.h
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
+
+#include <cstdint>
+
+namespace tflite {
+namespace profiling {
+namespace time {
+uint64_t NowMicros();
+} // namespace time
+} // namespace profiling
+} // namespace tflite
+#endif // TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
+
+// clang-format on
diff --git a/libs/profiling/src/profiling/time.cpp b/libs/profiling/src/profiling/time.cpp
new file mode 100644
index 000000000..92d8595f8
--- /dev/null
+++ b/libs/profiling/src/profiling/time.cpp
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This header is derived from the following file (in TensorFlow v1.12)
+// 'externals/tensorflow/tensorflow/contrib/lite/profiling/time.cpp
+#include "profiling/time.h"
+
+#if defined(_MSC_VER)
+#include <chrono> // NOLINT(build/c++11)
+#else
+#include <sys/time.h>
+#endif
+
+namespace tflite {
+namespace profiling {
+namespace time {
+
+#if defined(_MSC_VER)
+
+uint64_t NowMicros() {
+ return std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
+#else
+
+uint64_t NowMicros() {
+ struct timeval tv;
+ gettimeofday(&tv, nullptr);
+ return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
+
+#endif // defined(_MSC_VER)
+
+} // namespace time
+} // namespace profiling
+} // namespace tflite
+
+// clang-format on
diff --git a/libs/support/CMakeLists.txt b/libs/support/CMakeLists.txt
deleted file mode 100644
index c91677266..000000000
--- a/libs/support/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_subdirectory(tflite)
-add_subdirectory(nnapi)
diff --git a/libs/support/nnapi/CMakeLists.txt b/libs/support/nnapi/CMakeLists.txt
deleted file mode 100644
index 193bcbd4e..000000000
--- a/libs/support/nnapi/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-file(GLOB_RECURSE SOURCES "src/*.cpp")
-
-add_library(nnfw_support_nnapi ${SOURCES})
-set_property(TARGET nnfw_support_nnapi PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_include_directories(nnfw_support_nnapi PUBLIC ${CMAKE_SOURCE_DIR}/include)
-target_link_libraries(nnfw_support_nnapi static_nnfw_util)
diff --git a/libs/support/nnapi/src/Utils.cpp b/libs/support/nnapi/src/Utils.cpp
deleted file mode 100644
index ae1076fd1..000000000
--- a/libs/support/nnapi/src/Utils.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "support/nnapi/Utils.h"
-
-#include <cassert>
-
-namespace nnfw
-{
-namespace support
-{
-namespace nnapi
-{
-
-const char *to_string(const PaddingCode &code)
-{
- assert((ANEURALNETWORKS_PADDING_SAME == code) || (ANEURALNETWORKS_PADDING_VALID == code));
-
- switch (code)
- {
- case ANEURALNETWORKS_PADDING_SAME:
- return "ANEURALNETWORKS_PADDING_SAME";
- case ANEURALNETWORKS_PADDING_VALID:
- return "ANEURALNETWORKS_PADDING_VALID";
- }
-
- return nullptr;
-}
-
-} // namespace nnapi
-} // namespace support
-} // namespace nnfw
diff --git a/libs/support/tflite/CMakeLists.txt b/libs/support/tflite/CMakeLists.txt
deleted file mode 100644
index 667b3bc11..000000000
--- a/libs/support/tflite/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-file(GLOB_RECURSE SOURCES "src/*.cpp")
-file(GLOB_RECURSE TESTS "src/*.test.cpp")
-list(REMOVE_ITEM SOURCES ${TESTS})
-
-add_library(nnfw_support_tflite STATIC ${SOURCES})
-set_target_properties(nnfw_support_tflite PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(nnfw_support_tflite PUBLIC ${CMAKE_SOURCE_DIR}/include)
-target_link_libraries(nnfw_support_tflite tensorflow-lite ${LIB_PTHREAD} dl)
-target_link_libraries(nnfw_support_tflite static_nnfw_util)
-
-add_executable(nnfw_support_tflite_test_TensorView src/TensorView.test.cpp)
-target_link_libraries(nnfw_support_tflite_test_TensorView nnfw_support_tflite)
diff --git a/libs/support/tflite/src/TensorView.test.cpp b/libs/support/tflite/src/TensorView.test.cpp
deleted file mode 100644
index 1d3a70500..000000000
--- a/libs/support/tflite/src/TensorView.test.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "support/tflite/TensorView.h"
-
-#include <cassert>
-
-void int_test(void)
-{
- int value[6] = {1, 2, 3, 4, 5, 6};
-
- const nnfw::util::tensor::Shape shape{2, 3};
- const nnfw::support::tflite::TensorView<int> view{shape, value};
-
- assert(view.at(nnfw::util::tensor::Index{0, 0}) == 1);
- assert(view.at(nnfw::util::tensor::Index{0, 1}) == 2);
- assert(view.at(nnfw::util::tensor::Index{0, 2}) == 3);
- assert(view.at(nnfw::util::tensor::Index{1, 0}) == 4);
- assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5);
- assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6);
-}
-
-int main(int argc, char **argv)
-{
- float value[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-
- const nnfw::util::tensor::Shape shape{2, 3};
- const nnfw::support::tflite::TensorView<float> view{shape, value};
-
- assert(view.at(nnfw::util::tensor::Index{0, 0}) == 1.0f);
- assert(view.at(nnfw::util::tensor::Index{0, 1}) == 2.0f);
- assert(view.at(nnfw::util::tensor::Index{0, 2}) == 3.0f);
- assert(view.at(nnfw::util::tensor::Index{1, 0}) == 4.0f);
- assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5.0f);
- assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6.0f);
-
- int_test();
-
- return 0;
-}
diff --git a/libs/support/tflite/src/kernels/RSQRT.cpp b/libs/support/tflite/src/kernels/RSQRT.cpp
deleted file mode 100644
index 13efe0ed9..000000000
--- a/libs/support/tflite/src/kernels/RSQRT.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "support/tflite/kernels/RSQRT.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-
-#include <cmath>
-#include <iostream>
-
-namespace tflite
-{
-namespace ops
-{
-namespace custom
-{
-namespace nnfw
-{
-namespace RSQRT
-{
-
-void *InitRSQRT(TfLiteContext *context, const char *buffer, size_t length) { return nullptr; }
-
-void FreeRSQRT(TfLiteContext *context, void *buffer) {}
-
-TfLiteStatus PrepareRSQRT(TfLiteContext *context, TfLiteNode *node)
-{
- TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
- TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
- const TfLiteTensor *input = GetInput(context, node, 0);
- TfLiteTensor *output = GetOutput(context, node, 0);
- TF_LITE_ENSURE_EQ(context, input->type, output->type);
- // Quantized float is not supported yet.
- TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
- return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims));
-}
-
-inline TfLiteStatus Eval(TfLiteContext *context, TfLiteNode *node, float float_func(float))
-{
- const TfLiteTensor *input = GetInput(context, node, 0);
- TfLiteTensor *output = GetOutput(context, node, 0);
- switch (input->type)
- {
- case kTfLiteFloat32:
- {
- size_t elements = NumElements(input);
- const float *in = input->data.f;
- const float *in_end = in + elements;
- float *out = output->data.f;
- for (; in < in_end; in++, out++)
- *out = float_func(*in);
- return kTfLiteOk;
- }
- default:
- {
- context->ReportError(context, "Input type is %d, requires float32", input->type);
- return kTfLiteError;
- }
- }
-}
-
-TfLiteStatus EvalRSQRT(TfLiteContext *context, TfLiteNode *node)
-{
- return Eval(context, node, [](float f) { return 1.f / std::sqrt(f); });
-}
-
-} // namespace RSQRT
-} // namespace nnfw
-} // namespace custom
-} // namespace ops
-} // namespace tflite
diff --git a/libs/support/tflite/src/nnapi_delegate.cpp b/libs/support/tflite/src/nnapi_delegate.cpp
deleted file mode 100644
index 1eada4bca..000000000
--- a/libs/support/tflite/src/nnapi_delegate.cpp
+++ /dev/null
@@ -1,720 +0,0 @@
-/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// NOTE To minimize diff with upstream tensorflow, disable clang-format
-// clang-format off
-
-// NOTE This code is derived from the following file (in TensorFlow)
-// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc'
-#include "support/tflite/nnapi_delegate.h"
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/error_reporter.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "NeuralNetworksShim.h"
-#include "NeuralNetworksExShim.h"
-
-#ifdef __ANDROID__
-#include <sys/system_properties.h>
-#endif
-
-namespace nnfw
-{
-
-// TODO(aselle): FATAL leaves resources hanging.
-void FATAL(const char* format, ...) {
- va_list args;
- va_start(args, format);
- vfprintf(stderr, format, args);
- va_end(args);
- fflush(stderr);
- exit(1);
-}
-
-// TODO(aselle): Change the error model to use status codes.
-#define CHECK_TFLITE_SUCCESS(x) \
- if (x != kTfLiteOk) { \
- FATAL("Aborting since tflite returned failure."); \
- }
-
-#define CHECK_NN(x) \
- if (x != ANEURALNETWORKS_NO_ERROR) { \
- FATAL("Aborting since tflite returned failure."); \
- }
-
-namespace {
-
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
- const char* sdkProp = "ro.build.version.sdk";
- char sdkVersion[PROP_VALUE_MAX];
- int length = __system_property_get(sdkProp, sdkVersion);
- if (length != 0) {
- for (int i = 0; i < length; ++i) {
- int digit = sdkVersion[i] - '0';
- if (digit < 0 || digit > 9) {
- // Non-numeric SDK version, assume it's higher then expected;
- return 0xFFFF;
- }
- }
- return atoi(sdkVersion);
- }
- FATAL("No %s prop", sdkProp);
-#endif // __ANDROID__
- return 0;
-}
-
-static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
-
-} // namespace
-
-NNAPIAllocation::NNAPIAllocation(const char* filename,
- ::tflite::ErrorReporter* error_reporter)
- : MMAPAllocation(filename, error_reporter) {
- if (mmapped_buffer_ != MAP_FAILED)
- CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
- mmap_fd_, 0, &handle_));
-}
-
-NNAPIAllocation::~NNAPIAllocation() {
- if (handle_) {
- ANeuralNetworksMemory_free(handle_);
- }
-}
-
-NNAPIDelegate::~NNAPIDelegate() {
- if (nn_compiled_model_) {
- ANeuralNetworksCompilation_free(nn_compiled_model_);
- nn_compiled_model_ = nullptr;
- }
- if (nn_model_) {
- ANeuralNetworksModel_free(nn_model_);
- nn_model_ = nullptr;
- // TODO(aselle): Is this thread-safe and callable multiple times?
- }
- // ANeuralNetworksShutdown();
-}
-
-// Adds the tensors of the interpreter to the NN API model.
-// Returns the number of operands added.
-uint32_t addTensorOperands(tflite::Interpreter* interpreter,
- ANeuralNetworksModel* nn_model,
- const std::vector<uint32_t>& skip_list) {
- uint32_t next_id = 0;
- for (size_t i = 0; i < interpreter->tensors_size(); i++) {
- // skip temporaries tensors.
- bool shouldSkip = false;
- for (auto skip_idx : skip_list) {
- if (i == skip_idx) {
- shouldSkip = true;
- break;
- }
- }
- if (shouldSkip) continue;
-
- int32_t nn_type = 0;
- // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
- float scale = 0.0f;
- int32_t zeroPoint = 0;
- TfLiteTensor* tensor = interpreter->tensor(i);
- switch (tensor->type) {
- case kTfLiteNoType:
- // Tensors added during initialization of Ops don't have a type yet and
- // should not be registered with the NNAPI.
- continue;
- case kTfLiteFloat32:
- nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
- break;
- case kTfLiteUInt8:
- nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
- scale = tensor->params.scale;
- // FIXME The next line is a workaround because currently zero scale is
- // passed down from TF
- // Lite. Note that the latest NeuralNetworks.h (see
- // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/runtime/include/NeuralNetworks.h)
- // requires scale to be greater than zero. Remove this workaround
- // when the scale
- // value is correctly passed.
- scale = (scale == 0.0f) ? 1.0f : scale;
- zeroPoint = tensor->params.zero_point;
- break;
- case kTfLiteInt32:
- nn_type = ANEURALNETWORKS_TENSOR_INT32;
- scale = tensor->params.scale;
- zeroPoint = tensor->params.zero_point;
- break;
- default:
- FATAL("Unsupported type.");
- }
- // TODO(aselle): Note, many of these are intermediate results. Do I need
- // to ever specify these sizes. I am currently below doing setValue
- // on all of them, but I shouldn't in the future.
- // Answer(jeanluc): If all the operators can set the dimension correctly,
- // you won't need to.
- ANeuralNetworksOperandType operand_type{
- nn_type, static_cast<uint32_t>(tensor->dims->size),
- reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
- CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
- // TODO(aselle): Based on Michael's suggestion, limiting this to read
- // only memory
- if (tensor->allocation_type == kTfLiteMmapRo) {
- if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
- static_cast<const ::tflite::Allocation*>(tensor->allocation))) {
- CHECK_NN(ANeuralNetworksModel_setOperandValueFromMemory(
- nn_model, next_id, alloc->memory(), alloc->offset(tensor->data.raw),
- tensor->bytes));
- } else {
- CHECK_NN(ANeuralNetworksModel_setOperandValue(
- nn_model, next_id, tensor->data.raw, tensor->bytes));
- }
- } else if (tensor->bytes == 0) {
- // These size 0 tensors are optional tensors reserved.
- CHECK_NN(
- ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
- }
-
- ++next_id;
- }
- return next_id;
-}
-
-// Adds the operations and their parameters to the NN API model.
-// 'next-id' is the operand ID of the next operand of the model.
-void AddOpsAndParams(tflite::Interpreter* interpreter,
- ANeuralNetworksModel* nn_model, uint32_t next_id,
- std::vector<int>* model_state_inputs,
- std::vector<int>* model_state_outputs) {
- for (size_t i = 0; i < interpreter->nodes_size(); i++) {
- const auto* node_and_registration = interpreter->node_and_registration(i);
- const TfLiteNode& node = node_and_registration->first;
- const TfLiteRegistration& registration = node_and_registration->second;
- tflite::BuiltinOperator builtin =
- static_cast<tflite::BuiltinOperator>(registration.builtin_code);
-
- // Add the parameters.
- std::vector<uint32_t> augmented_inputs(
- node.inputs->data, node.inputs->data + node.inputs->size);
- std::vector<uint32_t> augmented_outputs(
- node.outputs->data, node.outputs->data + node.outputs->size);
-
- auto add_scalar_int32 = [&nn_model, &augmented_inputs,
- &next_id](int value) {
- ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
- CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
- CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
- sizeof(int32_t)))
- augmented_inputs.push_back(next_id++);
- };
-
- auto add_scalar_float32 = [&nn_model, &augmented_inputs,
- &next_id](float value) {
- ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
- CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
- CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
- sizeof(float)))
- augmented_inputs.push_back(next_id++);
- };
-
- // Handle state tensors of RNN, LSTM, SVDF.
- // For each state_out tensor, a corresponding state_in operand needs to be
- // created for NNAPI.
- auto duplicate_state_tensor_float32 =
- [interpreter, &nn_model, &next_id, &augmented_inputs,
- &model_state_inputs, &model_state_outputs](int tensor_id) {
- const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
- ANeuralNetworksOperandType operand_type{
- ANEURALNETWORKS_TENSOR_FLOAT32,
- static_cast<uint32_t>(tensor->dims->size),
- reinterpret_cast<uint32_t*>(tensor->dims->data),
- tensor->params.scale, tensor->params.zero_point};
- CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
- augmented_inputs.push_back(next_id);
- model_state_inputs->push_back(next_id);
- model_state_outputs->push_back(tensor_id);
- next_id++;
- };
-
- auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); };
-
- auto add_pooling_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
- add_scalar_int32(builtin->padding);
- add_scalar_int32(builtin->stride_width);
- add_scalar_int32(builtin->stride_height);
- add_scalar_int32(builtin->filter_width);
- add_scalar_int32(builtin->filter_height);
- add_scalar_int32(builtin->activation);
- };
-
- auto add_convolution_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
- add_scalar_int32(builtin->padding);
- add_scalar_int32(builtin->stride_width);
- add_scalar_int32(builtin->stride_height);
- add_scalar_int32(builtin->activation);
- };
-
- auto add_depthwise_conv_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
- add_scalar_int32(builtin->padding);
- add_scalar_int32(builtin->stride_width);
- add_scalar_int32(builtin->stride_height);
- add_scalar_int32(builtin->depth_multiplier);
- add_scalar_int32(builtin->activation);
- };
-
- auto add_fully_connected_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
- add_scalar_int32(builtin->activation);
- };
-
- auto add_concatenation_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
- add_scalar_int32(builtin->axis);
- if (builtin->activation != kTfLiteActNone) {
- FATAL("Concatenation does not support fused activation in NNAPI");
- }
- };
-
- auto add_softmax_params = [&add_scalar_float32](void* data) {
- auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data);
- add_scalar_float32(builtin->beta);
- };
-
- auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
- add_scalar_int32(builtin->block_size);
- };
-
- auto add_lstm_params = [&add_scalar_int32,
- &add_scalar_float32](void* data) {
- auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data);
- add_scalar_int32(builtin->activation);
- add_scalar_float32(builtin->cell_clip);
- add_scalar_float32(builtin->proj_clip);
- };
-
- // LSTM in NNAPI requires scratch tensor as an output operand.
- auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model,
- &next_id, &augmented_outputs]() {
- int scratch_buffer_index = node.temporaries->data[0];
- const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index);
- ANeuralNetworksOperandType operand_type{
- ANEURALNETWORKS_TENSOR_FLOAT32,
- static_cast<uint32_t>(tensor->dims->size),
- reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
- tensor->params.zero_point};
- CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
- augmented_outputs.insert(augmented_outputs.begin(), next_id++);
- };
-
- auto add_mean_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteMeanParams*>(data);
- add_scalar_int32(builtin->keep_dims);
- };
-
- auto add_svdf_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data);
- add_scalar_int32(builtin->rank);
- add_scalar_int32(builtin->activation);
- };
-
- auto add_rnn_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteRNNParams*>(data);
- add_scalar_int32(builtin->activation);
- };
-
- // Handle optional input tensors.
- auto add_optional_tensors = [&nn_model, &augmented_inputs,
- &next_id](int nn_type) {
- for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
- if (augmented_inputs[idx] == kOptionalTensor) {
- const std::vector<uint32_t> dim = {0, 0};
- ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
- CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
- CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
- nullptr, 0))
- augmented_inputs[idx] = next_id++;
- }
- }
- };
-
- int nnapi_version = 10;
-#include "nnapi_delegate_ex_AddOpsAndParams_lambda.inc"
-
- ANeuralNetworksOperationType nn_op_type;
-
- switch (builtin) {
- case tflite::BuiltinOperator_ADD:
- nn_op_type = ANEURALNETWORKS_ADD;
- add_add_params();
- break;
- case tflite::BuiltinOperator_MUL:
- nn_op_type = ANEURALNETWORKS_MUL;
- add_add_params();
- break;
- case tflite::BuiltinOperator_AVERAGE_POOL_2D:
- add_pooling_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
- break;
- case tflite::BuiltinOperator_MAX_POOL_2D:
- add_pooling_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
- break;
- case tflite::BuiltinOperator_L2_POOL_2D:
- add_pooling_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
- break;
- case tflite::BuiltinOperator_CONV_2D:
- add_convolution_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_CONV_2D;
- break;
- case tflite::BuiltinOperator_RELU:
- nn_op_type = ANEURALNETWORKS_RELU;
- break;
- case tflite::BuiltinOperator_RELU_N1_TO_1:
- nn_op_type = ANEURALNETWORKS_RELU1;
- break;
- case tflite::BuiltinOperator_RELU6:
- nn_op_type = ANEURALNETWORKS_RELU6;
- break;
- case tflite::BuiltinOperator_TANH:
- nn_op_type = ANEURALNETWORKS_TANH;
- break;
- case tflite::BuiltinOperator_FLOOR:
- nn_op_type = ANEURALNETWORKS_FLOOR;
- break;
- case tflite::BuiltinOperator_LOGISTIC:
- nn_op_type = ANEURALNETWORKS_LOGISTIC;
- break;
- case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
- add_depthwise_conv_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
- break;
- case tflite::BuiltinOperator_CONCATENATION:
- add_concatenation_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_CONCATENATION;
- break;
- case tflite::BuiltinOperator_SOFTMAX:
- add_softmax_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_SOFTMAX;
- break;
- case tflite::BuiltinOperator_FULLY_CONNECTED:
- add_fully_connected_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
- break;
- case tflite::BuiltinOperator_RESHAPE:
- nn_op_type = ANEURALNETWORKS_RESHAPE;
- // add_reshape_params(node.builtin_data);
- break;
- case tflite::BuiltinOperator_RESIZE_BILINEAR:
- add_resize_bilinear_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_RESIZE_BILINEAR;
- break;
- case tflite::BuiltinOperator_SPACE_TO_DEPTH:
- add_space_to_depth_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
- break;
- case tflite::BuiltinOperator_LSTM: {
- duplicate_state_tensor_float32(
- node.outputs->data[/*kOutputStateTensor*/ 0]);
- duplicate_state_tensor_float32(
- node.outputs->data[/*kCellStateTensor*/ 1]);
- add_lstm_params(node.builtin_data);
- add_lstm_scratch_tensor_float32();
- add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32);
- nn_op_type = ANEURALNETWORKS_LSTM;
- break;
- }
- case tflite::BuiltinOperator_DEQUANTIZE:
- nn_op_type = ANEURALNETWORKS_DEQUANTIZE;
- break;
- case tflite::BuiltinOperator_SVDF: {
- duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]);
- add_svdf_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_SVDF;
- break;
- }
- case tflite::BuiltinOperator_RNN: {
- duplicate_state_tensor_float32(
- node.outputs->data[/*kHiddenStateTensor*/ 0]);
- add_rnn_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_RNN;
- break;
- }
- case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
- nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP;
- break;
- case tflite::BuiltinOperator_PAD:
- nnapi_version = 11; // require NNAPI 1.1
- nn_op_type = ANEURALNETWORKS_PAD;
- break;
- case tflite::BuiltinOperator_MEAN:
- nnapi_version = 11; // require NNAPI 1.1
- add_mean_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_MEAN;
- break;
- case tflite::BuiltinOperator_DIV:
- nnapi_version = 11; // require NNAPI 1.1
- nn_op_type = ANEURALNETWORKS_DIV;
- add_add_params();
- break;
- case tflite::BuiltinOperator_SUB:
- nnapi_version = 11; // require NNAPI 1.1
- nn_op_type = ANEURALNETWORKS_SUB;
- add_add_params();
- break;
- case tflite::BuiltinOperator_STRIDED_SLICE:
- add_strided_slice_params(node.builtin_data);
- nn_op_type = ANEURALNETWORKS_STRIDED_SLICE;
- break;
- case tflite::BuiltinOperator_CAST:
- CHECK_NN(ANeuralNetworksModel_addOperationEx(
- nn_model, ANEURALNETWORKS_CAST_EX,
- static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
- reinterpret_cast<uint32_t*>(node.outputs->data)));
- continue;
- case tflite::BuiltinOperator_TOPK_V2:
- CHECK_NN(ANeuralNetworksModel_addOperationEx(
- nn_model, ANEURALNETWORKS_TOPK_V2_EX,
- static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
- reinterpret_cast<uint32_t*>(node.outputs->data)));
- continue;
- case tflite::BuiltinOperator_GATHER:
- add_gather_ex_params(node.builtin_data);
- CHECK_NN(ANeuralNetworksModel_addOperationEx(
- nn_model, ANEURALNETWORKS_GATHER_EX,
- static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
- reinterpret_cast<uint32_t*>(node.outputs->data)));
- continue;
- case tflite::BuiltinOperator_SPLIT:
- CHECK_NN(ANeuralNetworksModel_addOperationEx(
- nn_model, ANEURALNETWORKS_SPLIT_EX,
- static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
- reinterpret_cast<uint32_t*>(node.outputs->data)));
- continue;
- case tflite::BuiltinOperator_TRANSPOSE:
- nn_op_type = ANEURALNETWORKS_TRANSPOSE;
- // param is almost same as reshape
- break;
- case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
- case tflite::BuiltinOperator_LSH_PROJECTION:
- case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
- case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
- case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
- case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
- case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
- case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
- case tflite::BuiltinOperator_L2_NORMALIZATION:
- case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
- case tflite::BuiltinOperator_PADV2:
- case tflite::BuiltinOperator_CALL:
- case tflite::BuiltinOperator_SKIP_GRAM:
- case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
- case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
- case tflite::BuiltinOperator_SQUEEZE:
- case tflite::BuiltinOperator_EXP:
- case tflite::BuiltinOperator_LOG_SOFTMAX:
- case tflite::BuiltinOperator_DELEGATE:
- case tflite::BuiltinOperator_PRELU:
- case tflite::BuiltinOperator_MAXIMUM:
- case tflite::BuiltinOperator_MINIMUM:
- case tflite::BuiltinOperator_ARG_MAX:
- case tflite::BuiltinOperator_GREATER:
- case tflite::BuiltinOperator_GREATER_EQUAL:
- case tflite::BuiltinOperator_LESS:
- case tflite::BuiltinOperator_LESS_EQUAL:
- case tflite::BuiltinOperator_NEG:
- case tflite::BuiltinOperator_SELECT:
- case tflite::BuiltinOperator_SLICE:
- case tflite::BuiltinOperator_SIN:
- case tflite::BuiltinOperator_TRANSPOSE_CONV:
- case tflite::BuiltinOperator_SPARSE_TO_DENSE:
- FATAL("Op code %d is currently not delegated to NNAPI", builtin);
- nn_op_type = -1; // set to invalid
- break;
- case tflite::BuiltinOperator_CUSTOM:
- std::string custom_name(registration.custom_name);
- if (custom_name.compare("TensorFlowMax") == 0) {
- CHECK_NN(ANeuralNetworksModel_addOperationEx(
- nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX,
- static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(),
- static_cast<uint32_t>(node.outputs->size),
- reinterpret_cast<uint32_t*>(node.outputs->data)));
- continue;
- }
- else if (custom_name.compare("RSQRT") == 0) {
- CHECK_NN(ANeuralNetworksModel_addOperationEx(
- nn_model, ANEURALNETWORKS_RSQRT_EX,
- static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(),
- static_cast<uint32_t>(node.outputs->size),
- reinterpret_cast<uint32_t*>(node.outputs->data)));
- continue;
- }
- else if (custom_name.compare("SquaredDifference") == 0) {
- CHECK_NN(ANeuralNetworksModel_addOperationEx(
- nn_model, ANEURALNETWORKS_SQUARED_DIFFERENCE_EX,
- static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(),
- static_cast<uint32_t>(node.outputs->size),
- reinterpret_cast<uint32_t*>(node.outputs->data)));
- continue;
- }
-
- FATAL("Custom operations are not supported when using NNAPI.");
- nn_op_type = -1; // set to invalid
- break;
- }
-
- //if (nnapi_version == 11 && kAndroidSdkVersion < 28) {
- // FATAL("Op %d needs NNAPI1.1", builtin);
- //}
-
- // Add the operation.
- CHECK_NN(ANeuralNetworksModel_addOperation(
- nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
- augmented_inputs.data(),
- static_cast<uint32_t>(augmented_outputs.size()),
- reinterpret_cast<uint32_t*>(augmented_outputs.data())));
- }
-}
-
-TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) {
- // TODO(aselle): This is not correct. need to handle resize invalidation.
- if (nn_model_ && nn_compiled_model_) return kTfLiteOk;
-
- if (!nn_model_) {
- CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
-
- // Find all the temporary tensors and put them in a skip_list.
- std::vector<uint32_t> skip_list;
- for (size_t i = 0; i < interpreter->nodes_size(); i++) {
- const auto* node_and_registration = interpreter->node_and_registration(i);
- const TfLiteNode& node = node_and_registration->first;
- if (node.temporaries != nullptr) {
- for (int j = 0; j < node.temporaries->size; j++) {
- skip_list.push_back(static_cast<uint32_t>(node.temporaries->data[j]));
- }
- }
- }
-
- uint32_t next_id = addTensorOperands(interpreter, nn_model_, skip_list);
- AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
- &model_states_outputs_);
-
- std::vector<int> augmented_inputs = interpreter->inputs();
- std::vector<int> augmented_outputs = interpreter->outputs();
-
- // All state tensors input/output need to be treated as model input/output.
- augmented_inputs.insert(augmented_inputs.end(),
- model_states_inputs_.begin(),
- model_states_inputs_.end());
- augmented_outputs.insert(augmented_outputs.end(),
- model_states_outputs_.begin(),
- model_states_outputs_.end());
-
- CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
- nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
- reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
- static_cast<uint32_t>(augmented_outputs.size()),
- reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
- CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
- }
- if (!nn_compiled_model_) {
- CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
- CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
- }
- return kTfLiteOk;
-}
-
-TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) {
- if (!nn_model_) {
- TF_LITE_ENSURE_STATUS(BuildGraph(interpreter));
- }
-
- ANeuralNetworksExecution* execution = nullptr;
- CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
-
- // Currently perform deep copy of input buffer
- for (size_t i = 0; i < interpreter->inputs().size(); i++) {
- int input = interpreter->inputs()[i];
- // TODO(aselle): Is this what we want or do we want input instead?
- // TODO(aselle): This should be called setInputValue maybe to be cons.
- TfLiteTensor* tensor = interpreter->tensor(input);
- CHECK_NN(ANeuralNetworksExecution_setInput(
- execution, i, nullptr, tensor->data.raw, tensor->bytes));
- }
-
- // Tell nn api where to place final data.
- for (size_t i = 0; i < interpreter->outputs().size(); i++) {
- int output = interpreter->outputs()[i];
- TfLiteTensor* tensor = interpreter->tensor(output);
- CHECK_NN(ANeuralNetworksExecution_setOutput(
- execution, i, nullptr, tensor->data.raw, tensor->bytes));
- }
-
- // The state_out of previous invocation need to be mapped to state_in of
- // current invocation.
- for (size_t i = 0; i < model_states_outputs_.size(); i++) {
- int state_tensor_idx = model_states_outputs_[i];
- TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx);
- // Here we are using a deep copy for state_in tensors so that we are not
- // reading and writing into the same buffer during a invocation.
- // TODO(miaowang): using double shared buffer to minimize the copies.
- CHECK_NN(ANeuralNetworksExecution_setInput(
- execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw,
- tensor->bytes));
- // Tell NNAPI where to output the state_out.
- CHECK_NN(ANeuralNetworksExecution_setOutput(
- execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw,
- tensor->bytes));
- }
-
- // Currently use blocking compute.
- ANeuralNetworksEvent* event = nullptr;
- CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
- CHECK_NN(ANeuralNetworksEvent_wait(event));
- ANeuralNetworksEvent_free(event);
- ANeuralNetworksExecution_free(execution);
-
-#if 0
- printf("From the NN API:\n");
- TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
- if (float* data =
- interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
- size_t num = tensor->bytes / sizeof(float);
- for (float* p = data; p < data + num; p++) {
- printf(" %f", *p);
- }
- printf("\n");
- }
-#endif
-
- return kTfLiteOk;
-}
-
-} // namespace nnfw
-
-// clang-format on
diff --git a/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc b/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
deleted file mode 100644
index ea485fe45..000000000
--- a/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
+++ /dev/null
@@ -1,41 +0,0 @@
-// This file is included from AddOpsAndParams defined in nnapi_delegate.cc
-// and contains lambda for extened implementation to original Tensorflow Lite.
- auto add_resize_bilinear_params = [&add_scalar_int32, &interpreter, &augmented_inputs](void* data) {
- auto builtin = reinterpret_cast<TfLiteResizeBilinearParams*>(data);
- if (builtin->align_corners) {
- FATAL("Resize bilinear does not support align corners in NNAPI");
- }
-
- TfLiteTensor* tensor = interpreter->tensor(augmented_inputs.back());
- assert(tensor->type == kTfLiteInt32);
- assert(tensor->bytes == sizeof(int)*2);
- augmented_inputs.pop_back();
-
- int height = ((int*)(tensor->data.raw))[1];
- int width = ((int*)(tensor->data.raw))[0];
- add_scalar_int32(height);
- add_scalar_int32(width);
- };
-
- auto add_strided_slice_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(data);
- add_scalar_int32(builtin->begin_mask);
- add_scalar_int32(builtin->end_mask);
- // ellipsis_mask and new_axis_mask are not supported on nn runtime
- // cf) tflite interpreter supports both operations
- if (builtin->ellipsis_mask) {
- FATAL("STRIDE_SLICE does not support ellipsis_mask in NNAPI");
- }
- if (builtin->new_axis_mask) {
- FATAL("STRIDE_SLICE does not support new_axis_mask in NNAPI");
- }
- add_scalar_int32(builtin->shrink_axis_mask);
- };
-
- auto add_gather_ex_params = [&add_scalar_int32](void* data) {
- auto builtin = reinterpret_cast<TfLiteGatherParams*>(data);
- add_scalar_int32(builtin->axis);
- if (builtin->axis != 0) {
- FATAL("GATHER does not support axis>0 in NNAPI");
- }
- };
diff --git a/libs/tflite/CMakeLists.txt b/libs/tflite/CMakeLists.txt
new file mode 100644
index 000000000..e844d1c68
--- /dev/null
+++ b/libs/tflite/CMakeLists.txt
@@ -0,0 +1,12 @@
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_library(nnfw_lib_tflite STATIC ${SOURCES})
+set_target_properties(nnfw_lib_tflite PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(nnfw_lib_tflite PUBLIC ${NNFW_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_link_libraries(nnfw_lib_tflite tensorflow-lite ${LIB_PTHREAD} dl)
+target_link_libraries(nnfw_lib_tflite nnfw_lib_misc)
+
+add_executable(nnfw_lib_tflite_test_TensorView src/TensorView.test.cpp)
+target_link_libraries(nnfw_lib_tflite_test_TensorView nnfw_lib_tflite)
diff --git a/libs/tflite/include/tflite/Assert.h b/libs/tflite/include/tflite/Assert.h
new file mode 100644
index 000000000..6d12d37f6
--- /dev/null
+++ b/libs/tflite/include/tflite/Assert.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Assert.h
+ * @brief This file contains helper function of assertion
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_ASSERT_H__
+#define __NNFW_TFLITE_ASSERT_H__
+
+#include "tensorflow/contrib/lite/context.h"
+
+#include <sstream>
+
+#define STR_DETAIL(value) #value
+#define STR(value) STR_DETAIL(value)
+
+#define TFLITE_ENSURE(exp) \
+ { \
+ const TfLiteStatus status = (exp); \
+ \
+ if (status != kTfLiteOk) \
+ { \
+ std::ostringstream ss; \
+ ss << #exp << " failed (" << __FILE__ << ":" << __LINE__ << ")"; \
+ throw std::runtime_error{ss.str()}; \
+ } \
+ }
+
+#endif // __NNFW_TFLITE_ASSERT_H__
diff --git a/libs/tflite/include/tflite/Diff.h b/libs/tflite/include/tflite/Diff.h
new file mode 100644
index 000000000..15c672831
--- /dev/null
+++ b/libs/tflite/include/tflite/Diff.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Diff.h
+ * @brief This file contains classes for testing correctness of implementation
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_DIFF_H__
+#define __NNFW_TFLITE_DIFF_H__
+
+#include "tensorflow/contrib/lite/interpreter.h"
+
+#include "misc/tensor/Index.h"
+#include "misc/tensor/Diff.h"
+#include "misc/tensor/Shape.h"
+#include "misc/tensor/Comparator.h"
+
+#include "tflite/TensorView.h"
+
+#include <functional>
+#include <vector>
+
+/**
+ * @brief Class to define TfLite interpreter match application
+ */
+class TfLiteInterpMatchApp
+{
+public:
+ /**
+ * @brief Construct a new TfLiteInterpMatchApp object with Comparator
+ * @param[in] comparator Comparator object for tensor comparison
+ */
+ TfLiteInterpMatchApp(const nnfw::misc::tensor::Comparator &comparator)
+ : _verbose{false}, _comparator(comparator)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get reference verbose for debugging information
+ * @return Reference of verbose value
+ */
+ int &verbose(void) { return _verbose; }
+
+private:
+ int _verbose;
+
+public:
+ /**
+ * @brief Run two interpreter and return the output matching
+ * @param[in] pure Interpreter object of expected(with TfLite)
+ * @param[in] nnapi Interpreter object of obtained(through NNAPI)
+ * @return @c true if two Interpreter results are same, otherwise @c false
+ */
+ bool run(::tflite::Interpreter &pure, ::tflite::Interpreter &nnapi) const;
+ /**
+ * @brief Compare two TensorView values and return the match result
+ * @param[in] expected TensorView object to read expected values
+ * @param[in] obtained TensorView object to read obtained values
+ * @param[in] id Tensor ID value used for debug message
+ * @return @c true if two TensorView values are same, otherwise @c false
+ */
+ template <typename T>
+ bool compareSingleTensorView(const nnfw::tflite::TensorView<T> &expected,
+ const nnfw::tflite::TensorView<T> &obtained, int id) const;
+
+private:
+ const nnfw::misc::tensor::Comparator &_comparator;
+};
+
+#include "tflite/interp/Builder.h"
+#include "tflite/Quantization.h"
+
+#include <random>
+
+/**
+ * @brief Class to generate random values
+ */
+class RandomGenerator
+{
+public:
+ /**
+ * @brief Construct a new RandomGenerator object
+ * @param[in] seed Random seed value
+ * @param[in] mean Mean value of normal random number generation
+ * @param[in] stddev Standard deviation of random number generation
+ * @param[in] quantization TfLiteQuantizationParams type to represent quantization value
+ * (not used yet)
+ */
+ RandomGenerator(int seed, float mean, float stddev,
+ const TfLiteQuantizationParams quantization = make_default_quantization())
+ : _rand{seed}, _dist{mean, stddev}, _quantization{quantization}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Generate random numbers for type T
+ * @param[in] s Shape value
+ * @param[in] i Index value
+ * @return Random generated value
+ * @note This is same as T generate(void) as two input parameters are not used
+ */
+ template <typename T>
+ T generate(const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)
+ {
+ return generate<T>();
+ }
+
+ /**
+ * @brief Generate random numbers for type T
+ * @return Random generated value
+ */
+ template <typename T> T generate(void) { return _dist(_rand); }
+
+private:
+ std::minstd_rand _rand;
+ std::normal_distribution<float> _dist;
+ const TfLiteQuantizationParams _quantization;
+};
+
+template <> uint8_t RandomGenerator::generate<uint8_t>(void);
+
+/**
+ * @brief Structure for NNAPI correctness test
+ */
+struct RandomTestParam
+{
+ int verbose; //!< Verbosity of debug information
+ int tolerance; //!< Tolerance of value difference
+ int tensor_logging = 0; //!< Save logging to a file if not 0
+ std::string log_path = ""; //!< Path of log file, meaningful only when tensor_logging is 1
+};
+
+/**
+ * @brief Class to define Random test runner
+ */
+class RandomTestRunner
+{
+public:
+ /**
+ * @brief Construct a new RandomTestRunner object
+ * @param[in] seed Random seed value
+ * @param[in] param RandomTestParam object for test runner
+ * @param[in] quantization TfLiteQuantizationParams type to represent quantization value
+ */
+ RandomTestRunner(int seed, const RandomTestParam &param,
+ const TfLiteQuantizationParams quantization = make_default_quantization())
+ : _randgen{seed, 0.0f, 2.0f, quantization}, _param{param}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Run the random test runner
+ * @param[in] builder Interpreter Builder used to run
+ * @return 0 if test succeeds, otherwise failure
+ */
+ int run(const nnfw::tflite::Builder &builder);
+
+public:
+ /**
+ * @brief Get RandomGenerator reference
+ * @return RandomGenerator reference
+ */
+ RandomGenerator &generator() { return _randgen; };
+
+private:
+ RandomGenerator _randgen;
+ const RandomTestParam _param;
+
+public:
+ /**
+ * @brief Create a RandomTestRunner object
+ * @param[in] seed Random seed value
+ * @return RandomTestRunner object
+ */
+ static RandomTestRunner make(int seed);
+};
+
+#endif // __NNFW_TFLITE_DIFF_H__
diff --git a/libs/tflite/include/tflite/FeatureView.h b/libs/tflite/include/tflite/FeatureView.h
new file mode 100644
index 000000000..06cbf4b14
--- /dev/null
+++ b/libs/tflite/include/tflite/FeatureView.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file FeatureView.h
+ * @brief This file contains FeatureView class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_FEATURE_VIEW_H__
+#define __NNFW_TFLITE_FEATURE_VIEW_H__
+
+#include "tensorflow/contrib/lite/interpreter.h"
+
+#include "tflite/InputIndex.h"
+#include "tflite/OutputIndex.h"
+
+#include "misc/feature/Shape.h"
+#include "misc/feature/Reader.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+template <typename T> class FeatureView;
+
+/**
+ * @brief Class to support reading element of float type feature
+ */
+template <> class FeatureView<float> : public nnfw::misc::feature::Reader<float>
+{
+public:
+ /**
+ * @brief Construct a new FeatureView object
+ * @param[in] interp Interpreter to read from
+ * @param[in] index InputIndex index of input
+ */
+ FeatureView(::tflite::Interpreter &interp, const InputIndex &index);
+ /**
+ * @brief Construct a new FeatureView object
+ * @param[in] interp Interpreter to read from
+ * @param[in] index OutputIndex index of output
+ */
+ FeatureView(::tflite::Interpreter &interp, const OutputIndex &index);
+
+public:
+ /**
+ * @brief Get value of element using channel, row and column index
+ * @param[in] ch Channel index
+ * @param[in] row Row index
+ * @param[in] col Column index
+ * @return Value of element
+ */
+ float at(uint32_t ch, uint32_t row, uint32_t col) const;
+ /**
+ * @brief Get reference of element using channel, row and column index
+ * @param[in] ch Channel index
+ * @param[in] row Row index
+ * @param[in] col Column index
+ * @return Reference of element
+ */
+ float &at(uint32_t ch, uint32_t row, uint32_t col);
+
+private:
+ /**
+ * @brief Get offset of element from channel, row and column index
+ * @param[in] ch Channel index
+ * @param[in] row Row index
+ * @param[in] col Column index
+ * @return Offset of element
+ */
+ uint32_t getElementOffset(uint32_t ch, uint32_t row, uint32_t col) const
+ {
+ uint32_t res = 0;
+
+ // TensorFlow Lite assumes NHWC ordering for tensors
+ res += row * _shape.W * _shape.C;
+ res += col * _shape.C;
+ res += ch;
+
+ return res;
+ }
+
+private:
+ nnfw::misc::feature::Shape _shape;
+ float *_base;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_FEATURE_VIEW_H__
diff --git a/libs/tflite/include/tflite/InputIndex.h b/libs/tflite/include/tflite/InputIndex.h
new file mode 100644
index 000000000..f535b2626
--- /dev/null
+++ b/libs/tflite/include/tflite/InputIndex.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file InputIndex.h
+ * @brief This file contains InputIndex class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_INPUT_INDEX_H__
+#define __NNFW_TFLITE_INPUT_INDEX_H__
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to express index of input
+ */
+class InputIndex
+{
+public:
+ /**
+ * @brief Construct a new InputIndex object with index value
+ * @param [in] index The value of index
+ */
+ InputIndex(int index) : _index(index)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get index value as int
+ * @return Index value as int
+ */
+ int asInt(void) const { return _index; }
+
+private:
+ int _index;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_INPUT_INDEX_H__
diff --git a/libs/tflite/include/tflite/InterpreterSession.h b/libs/tflite/include/tflite/InterpreterSession.h
new file mode 100644
index 000000000..deaf05a7f
--- /dev/null
+++ b/libs/tflite/include/tflite/InterpreterSession.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file InterpreterSession.h
+ * @brief This file contains InterpreterSession class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_INTERPRETER_SESSION_H__
+#define __NNFW_TFLITE_INTERPRETER_SESSION_H__
+
+#include "Session.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define TfLite interpreter session which is inherited from Session class
+ */
+class InterpreterSession final : public Session
+{
+public:
+ /**
+ * @brief Construct a InterpreterSession object with interpreter of TfLite
+ * @param[in] interp The TfLite interpreter pointer
+ */
+ InterpreterSession(::tflite::Interpreter *interp) : _interp{interp}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get TfLite interpreter pointer
+ * @return The TfLite interpreter
+ */
+ ::tflite::Interpreter *interp(void) override { return _interp; }
+
+public:
+ /**
+ * @brief Prepare the TfLite interpreter session
+ * @return @c true if tensor preparation is successful, otherwise @c false
+ */
+ bool prepare(void) override
+ {
+ _interp->UseNNAPI(false);
+
+ if (kTfLiteOk != _interp->AllocateTensors())
+ {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * @brief Run the Invoke function of TfLite interpreter
+ * @return @c true if Invoke() is successful, otherwise @c false
+ */
+ bool run(void) override
+ {
+ // Return true if Invoke returns kTfLiteOk
+ return kTfLiteOk == _interp->Invoke();
+ }
+
+ /**
+ * @brief Tear down TfLite interpreter session
+ * @return @c true always
+ */
+ bool teardown(void) override
+ {
+ // Do NOTHING currently
+ return true;
+ }
+
+private:
+ ::tflite::Interpreter *const _interp;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_INTERPRETER_SESSION_H__
diff --git a/libs/tflite/include/tflite/NNAPISession.h b/libs/tflite/include/tflite/NNAPISession.h
new file mode 100644
index 000000000..b2a999d10
--- /dev/null
+++ b/libs/tflite/include/tflite/NNAPISession.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NNAPISession.h
+ * @brief This file contains NNAPISession class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_NNAPI_SESSION_H__
+#define __NNFW_TFLITE_NNAPI_SESSION_H__
+
+#include "Session.h"
+#include "tflite/ext/nnapi_delegate.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define NNAPI interpreter session which is inherited from Session class
+ */
+class NNAPISession final : public Session
+{
+public:
+ /**
+ * @brief Construct a NNAPISession object with interpreter of TfLite
+ * @param[in] interp The TfLite interpreter pointer
+ * @note Invoke BuildGraph() of NNAPI delegate from Interpreter
+ */
+ NNAPISession(::tflite::Interpreter *interp) : _interp{interp}
+ {
+ // Construct Graph from Interpreter
+ _delegate.BuildGraph(_interp);
+ }
+
+public:
+ /**
+ * @brief Get TfLite interpreter pointer
+ * @return The TfLite interpreter
+ */
+ ::tflite::Interpreter *interp(void) override { return _interp; }
+
+public:
+ /**
+ * @brief Prepare the TfLite interpreter session
+ * @return @c true if tensor preparation is successful, otherwise @c false
+ */
+ bool prepare(void) override
+ {
+ // Explicitly turn off T/F lite internal NNAPI delegation in order to use locally defined
+ // NNAPI delegation.
+ _interp->UseNNAPI(false);
+
+ if (kTfLiteOk != _interp->AllocateTensors())
+ {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * @brief Run the Invoke function of NNAPI delegate
+ * @return @c true if Invoke() is successful, otherwise @c false
+ */
+ bool run(void) override { return kTfLiteOk == _delegate.Invoke(_interp); }
+
+ /**
+ * @brief Tear down TfLite interpreter session
+ * @return @c true always
+ */
+ bool teardown(void) override
+ {
+ // DO NOTHING
+ return true;
+ }
+
+private:
+ ::tflite::Interpreter *const _interp;
+ nnfw::tflite::NNAPIDelegate _delegate;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_NNAPI_SESSION_H__
diff --git a/libs/tflite/include/tflite/OutputIndex.h b/libs/tflite/include/tflite/OutputIndex.h
new file mode 100644
index 000000000..dd1ca8d44
--- /dev/null
+++ b/libs/tflite/include/tflite/OutputIndex.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file OutputIndex.h
+ * @brief This file contains OutputIndex class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_OUTPUT_INDEX_H__
+#define __NNFW_TFLITE_OUTPUT_INDEX_H__
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define OutputIndex
+ */
+class OutputIndex
+{
+public:
+ /**
+ * @brief Construct a OutputIndex object with index value
+ * @param[in] index The value of index
+ */
+ OutputIndex(int index) : _index(index)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Get index value as int
+ * @return Index value as int
+ */
+ int asInt(void) const { return _index; }
+
+private:
+ int _index;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_OUTPUT_INDEX_H__
diff --git a/libs/tflite/include/tflite/Quantization.h b/libs/tflite/include/tflite/Quantization.h
new file mode 100644
index 000000000..4a8a0f1ac
--- /dev/null
+++ b/libs/tflite/include/tflite/Quantization.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Quantization.h
+ * @brief This file contains BitwiseIntToFloat union and quantization related
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_QUANTIZATION_H__
+#define __NNFW_TFLITE_QUANTIZATION_H__
+
+/**
+ * @brief Union to provide bitwise conversion of integer and float
+ */
+union BitwiseIntToFloat {
+ int i;
+ float f;
+};
+
+static const float FLOAT_NEAREST_TO_1 = BitwiseIntToFloat{0x3f7fffff}.f;
+
+#include "tensorflow/contrib/lite/context.h"
+
+/**
+ * @brief Get TfLiteQuantizationParams object with default values
+ * @return TfLiteQuantizationParams object
+ */
+TfLiteQuantizationParams make_default_quantization(void);
+
+#endif // __NNFW_TFLITE_QUANTIZATION_H__
diff --git a/libs/tflite/include/tflite/Session.h b/libs/tflite/include/tflite/Session.h
new file mode 100644
index 000000000..4f2e5c54d
--- /dev/null
+++ b/libs/tflite/include/tflite/Session.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Session.h
+ * @brief This file contains Session class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_SESSION_H__
+#define __NNFW_TFLITE_SESSION_H__
+
+#include <tensorflow/contrib/lite/interpreter.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Structure to provide interface methods of interpreter session
+ */
+struct Session
+{
+ /**
+ * @brief Destruct Session object using default destructor
+ */
+ virtual ~Session() = default;
+
+ /**
+ * @brief Get the Interpreter object pointer
+ * @return The Interpreter object pointer
+ */
+ virtual ::tflite::Interpreter *interp(void) = 0;
+
+ /**
+ * @brief Prepare the session
+ * @return @c true if prepare method succeeded, otherwise @c false
+ */
+ virtual bool prepare(void) = 0;
+ /**
+ * @brief Run the session
+ * @return @c true if run method succeeded, otherwise @c false
+ */
+ virtual bool run(void) = 0;
+ /**
+ * @brief Teardown(release) the session
+ * @return @c true if teardown method succeeded, otherwise @c false
+ */
+ virtual bool teardown(void) = 0;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_SESSION_H__
diff --git a/libs/tflite/include/tflite/TensorLogger.h b/libs/tflite/include/tflite/TensorLogger.h
new file mode 100644
index 000000000..e56a76b58
--- /dev/null
+++ b/libs/tflite/include/tflite/TensorLogger.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TensorLogger.h
+ * @brief This file contains TensorLogger class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_TENSOR_LOGGER_H__
+#define __NNFW_TFLITE_TENSOR_LOGGER_H__
+
+#include "misc/tensor/IndexIterator.h"
+#include "tflite/TensorView.h"
+
+#include <tensorflow/contrib/lite/interpreter.h>
+#include <tensorflow/contrib/lite/context.h>
+#include <fstream>
+#include <iomanip>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to write input and output value / shape into a file in python form
+ * @note This is a utility to write input and output value / shape into a file in python form.\n
+ * any python app can load this value by running the python code below:\n
+ * exec(open(filename).read())\n
+ * generated python code looks like the following: \n
+ * tensor_shape_gen = []\n
+ * tensor_value_gen = []\n\n
+ * tensor_shape_gen.append("{2, 1, 2}")\n
+ * tensor_value_gen.append([1, 2, 3, 4])\n\n
+ * tensor_shape_gen.append("{2}")\n
+ * tensor_value_gen.append([1, 2])\n\n
+ * tensor_shape_gen.append("{2, 1, 2}")\n
+ * tensor_value_gen.append([1, 4, 3, 8])\n
+ */
+class TensorLogger
+{
+private:
+ std::ofstream _outfile;
+
+public:
+ /**
+ * @brief Get TensorLogger instance
+ * @return The TensorLogger instance
+ */
+ static TensorLogger &instance()
+ {
+ static TensorLogger instance;
+ return instance;
+ }
+
+ /**
+ * @brief Save the tensor details to file from interpreter
+ * @param[in] path The file path to save
+ * @param[in] interp The TfLite interpreter
+ */
+ void save(const std::string &path, ::tflite::Interpreter &interp)
+ {
+ open(path);
+
+ int log_index = 0;
+ for (const auto id : interp.inputs())
+ {
+ _outfile << "# input tensors" << std::endl;
+ printTensor(interp, id, log_index++);
+ }
+ for (const auto id : interp.outputs())
+ {
+ _outfile << "# output tensors" << std::endl;
+ printTensor(interp, id, log_index++);
+ }
+ close();
+ }
+
+private:
+ void open(const std::string &path)
+ {
+ if (!_outfile.is_open())
+ _outfile.open(path, std::ios_base::out);
+
+ _outfile << "# ------ file: " << path << " ------" << std::endl
+ << "tensor_shape_gen = []" << std::endl
+ << "tensor_value_gen = []" << std::endl
+ << std::endl;
+ }
+
+ void printTensor(::tflite::Interpreter &interp, const int id, const int log_index)
+ {
+ const TfLiteTensor *tensor = interp.tensor(id);
+
+ _outfile << "# tensor name: " << tensor->name << std::endl;
+ _outfile << "# tflite::interpreter.tensor(" << id << ") -> "
+ "tensor_value_gen["
+ << log_index << "]" << std::endl;
+
+ if (tensor->type == kTfLiteInt32)
+ {
+ printTensorShape(tensor);
+ printTensorValue<int32_t>(tensor, tensor->data.i32);
+ }
+ else if (interp.tensor(id)->type == kTfLiteUInt8)
+ {
+ printTensorShape(tensor);
+ printTensorValue<uint8_t>(tensor, tensor->data.uint8);
+ }
+ else if (tensor->type == kTfLiteFloat32)
+ {
+ printTensorShape(tensor);
+ printTensorValue<float>(tensor, tensor->data.f);
+ }
+ }
+
+ void printTensorShape(const TfLiteTensor *tensor)
+ {
+ _outfile << "tensor_shape_gen.append('{";
+
+ size_t r = 0;
+ for (; r < tensor->dims->size - 1; r++)
+ {
+ _outfile << tensor->dims->data[r] << ", ";
+ }
+ _outfile << tensor->dims->data[r];
+
+ _outfile << "}')" << std::endl;
+ }
+
+ template <typename T> void printTensorValue(const TfLiteTensor *tensor, T *tensor_data_ptr)
+ {
+ _outfile << "tensor_value_gen.append([";
+
+ _outfile << std::fixed << std::setprecision(10);
+
+ const T *end = reinterpret_cast<const T *>(tensor->data.raw_const + tensor->bytes);
+ for (T *ptr = tensor_data_ptr; ptr < end; ptr++)
+ _outfile << *ptr << ", ";
+
+ _outfile << "])" << std::endl << std::endl;
+ }
+
+ void close()
+ {
+ _outfile << "# --------- tensor shape and value defined above ---------" << std::endl;
+ _outfile.close();
+ }
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_TENSOR_LOGGER_H__
diff --git a/libs/tflite/include/tflite/TensorShapeUtils.h b/libs/tflite/include/tflite/TensorShapeUtils.h
new file mode 100644
index 000000000..ba8687413
--- /dev/null
+++ b/libs/tflite/include/tflite/TensorShapeUtils.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TensorShapeUtils.h
+ * @brief This file contains utilities function of tensor shape
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__
+#define __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__
+
+#include "misc/tensor/Shape.h"
+
+#include <vector>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Converts tensor::Shape into a vector
+ * @param[in] shape The tensor shape to be converted
+ * @return vector value of given shape object
+ */
+static inline std::vector<int32_t> as_dims(const nnfw::misc::tensor::Shape &shape)
+{
+ std::vector<int32_t> dims;
+
+ for (uint32_t axis = 0; axis < shape.rank(); ++axis)
+ {
+ dims.emplace_back(shape.dim(axis));
+ }
+
+ return dims;
+}
+
+/**
+ * @brief Broadcasts between two given shapes
+ * @param[in] lhs_shape The left hand side shape
+ * @param[in] rhs_shape The right hand side shape
+ * @return The broadcasted shape
+ */
+nnfw::misc::tensor::Shape broadcast(const nnfw::misc::tensor::Shape &lhs_shape,
+ const nnfw::misc::tensor::Shape &rhs_shape);
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__
diff --git a/libs/tflite/include/tflite/TensorUtils.h b/libs/tflite/include/tflite/TensorUtils.h
new file mode 100644
index 000000000..6266c5dff
--- /dev/null
+++ b/libs/tflite/include/tflite/TensorUtils.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TensorUtils.h
+ * @brief This file contains utilities function
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_TENSOR_UTILS_H__
+#define __NNFW_TFLITE_TENSOR_UTILS_H__
+
+#include <tensorflow/contrib/lite/context.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Get @c true if tensor type is kTfLiteFloat32, otherwise @c false
+ * @param[in] tensor The tensor object to be compared
+ * @return @c true if tensor type is kTfLiteFloat32, otherwise @c false
+ */
+inline bool isFloatTensor(const TfLiteTensor *tensor) { return tensor->type == kTfLiteFloat32; }
+
+/**
+ * @brief Get @c true if tensor is 4-D tensor and the first dimension length is 1,
+ * otherwise @c false
+ * @param[in] tensor The tensor object to be compared
+ * @return @c true if tensor is 4-D tensor and the first dimension length is 1, otherwise @c false
+ */
+inline bool isFeatureTensor(const TfLiteTensor *tensor)
+{
+ return (tensor->dims->size == 4) && (tensor->dims->data[0] == 1);
+}
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_TENSOR_UTILS_H__
diff --git a/libs/tflite/include/tflite/TensorView.h b/libs/tflite/include/tflite/TensorView.h
new file mode 100644
index 000000000..79c754c78
--- /dev/null
+++ b/libs/tflite/include/tflite/TensorView.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TensorView.h
+ * @brief This file contains TensorView class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_TENSOR_VIEW_H__
+#define __NNFW_TFLITE_TENSOR_VIEW_H__
+
+#include "tensorflow/contrib/lite/interpreter.h"
+
+#include "misc/tensor/Shape.h"
+#include "misc/tensor/Index.h"
+#include "misc/tensor/Reader.h"
+#include "misc/tensor/NonIncreasingStride.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define TensorView which is inherited from nnfw::misc::tensor::Reader<T> class
+ */
+template <typename T> class TensorView final : public nnfw::misc::tensor::Reader<T>
+{
+public:
+ /**
+   * @brief Construct a TensorView object with base and shape information
+ * @param[in] shape The shape of a tensor
+ * @param[in] base The base address of a tensor
+ */
+ TensorView(const nnfw::misc::tensor::Shape &shape, T *base) : _shape{shape}, _base{base}
+ {
+ // Set 'stride'
+ _stride.init(_shape);
+ }
+
+public:
+ /**
+ * @brief Get shape of tensor
+ * @return Reference of shape
+ */
+ const nnfw::misc::tensor::Shape &shape(void) const { return _shape; }
+
+public:
+ /**
+ * @brief Get value of tensor index
+ * @param[in] index The tensor index
+ * @return The value at the index
+ */
+ T at(const nnfw::misc::tensor::Index &index) const override
+ {
+ const auto offset = _stride.offset(index);
+ return *(_base + offset);
+ }
+
+public:
+ /**
+ * @brief Get reference value of tensor index
+ * @param[in] index The tensor index
+ * @return The reference value at the index
+ */
+ T &at(const nnfw::misc::tensor::Index &index)
+ {
+ const auto offset = _stride.offset(index);
+ return *(_base + offset);
+ }
+
+private:
+ nnfw::misc::tensor::Shape _shape; /**< The tensor shape */
+
+public:
+ T *_base; /**< The base address of tensor */
+ nnfw::misc::tensor::NonIncreasingStride _stride; /**< The NonIncreasingStride object */
+
+public:
+ // TODO Introduce Operand ID class
+ /**
+ * @brief Create TensorView object using given parameters
+ * @param[in] interp The TfLite interpreter
+ * @param[in] tensor_index The tensor index
+ * @return The new TensorView<T> object
+ */
+ static TensorView<T> make(::tflite::Interpreter &interp, int tensor_index)
+ {
+ auto tensor_ptr = interp.tensor(tensor_index);
+
+ // Set 'shape'
+ nnfw::misc::tensor::Shape shape(tensor_ptr->dims->size);
+
+ for (uint32_t axis = 0; axis < shape.rank(); ++axis)
+ {
+ shape.dim(axis) = tensor_ptr->dims->data[axis];
+ }
+
+ return TensorView<T>(shape, interp.typed_tensor<T>(tensor_index));
+ }
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_TENSOR_VIEW_H__
diff --git a/libs/support/nnapi/src/feature/Utils.cpp b/libs/tflite/include/tflite/ext/kernels/Abs.h
index 62939ff4a..74e4aa658 100644
--- a/libs/support/nnapi/src/feature/Utils.cpp
+++ b/libs/tflite/include/tflite/ext/kernels/Abs.h
@@ -14,30 +14,28 @@
* limitations under the License.
*/
-#include "support/nnapi/feature/Utils.h"
+#ifndef __NNFW_TFLITE_EXT_KERNELS_ABS_H__
+#define __NNFW_TFLITE_EXT_KERNELS_ABS_H__
+
+#include "tensorflow/contrib/lite/context.h"
namespace nnfw
{
-namespace support
-{
-namespace nnapi
+namespace tflite
{
-namespace feature
+namespace custom
{
-
-uint32_t indexOf(const nnfw::util::feature::Shape &shape, uint32_t ch, uint32_t row, uint32_t col)
+namespace Abs
{
- uint32_t res = 0;
- // NNAPI assumes that NHWC ordering for feature map
- res += row * shape.W * shape.C;
- res += col * shape.C;
- res += ch;
+void *InitAbs(TfLiteContext *context, const char *buffer, size_t length);
+void FreeAbs(TfLiteContext *context, void *buffer);
+TfLiteStatus PrepareAbs(TfLiteContext *context, TfLiteNode *node);
+TfLiteStatus EvalAbs(TfLiteContext *context, TfLiteNode *node);
- return res;
-}
-
-} // namespace feature
-} // namespace nnapi
-} // namespace support
+} // namespace Abs
+} // namespace custom
+} // namespace tflite
} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_ABS_H__
diff --git a/libs/tflite/include/tflite/ext/kernels/CustomOps.h b/libs/tflite/include/tflite/ext/kernels/CustomOps.h
new file mode 100644
index 000000000..3f9459bb2
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/kernels/CustomOps.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CustomOps.h
+ * @brief This file contains registration of custom operators
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_EXT_KERNELS_CUSTOM_OP_H__
+#define __NNFW_TFLITE_EXT_KERNELS_CUSTOM_OP_H__
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tflite/ext/kernels/TensorFlowMax.h"
+#include "tflite/ext/kernels/SquaredDifference.h"
+#include "tflite/ext/kernels/TensorFlowSum.h"
+#include "tflite/ext/kernels/Abs.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+
+#define REGISTER_FUNCTION(Name) \
+ TfLiteRegistration *Register_##Name(void) \
+ { \
+ static TfLiteRegistration r = { \
+ Name::Init##Name, Name::Free##Name, Name::Prepare##Name, Name::Eval##Name, \
+ }; \
+ r.custom_name = #Name; \
+ return &r; \
+ }
+
+REGISTER_FUNCTION(TensorFlowMax)
+REGISTER_FUNCTION(SquaredDifference)
+REGISTER_FUNCTION(TensorFlowSum)
+REGISTER_FUNCTION(Abs)
+
+#undef REGISTER_FUNCTION
+
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_CUSTOM_OP_H__
diff --git a/libs/tflite/include/tflite/ext/kernels/SquaredDifference.h b/libs/tflite/include/tflite/ext/kernels/SquaredDifference.h
new file mode 100644
index 000000000..492523c02
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/kernels/SquaredDifference.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file SquaredDifference.h
+ * @brief This file contains SquaredDifference namespace and SquaredDifference function
+ * definitions
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_EXT_KERNELS_SQUARED_DIFFERENCE_H__
+#define __NNFW_TFLITE_EXT_KERNELS_SQUARED_DIFFERENCE_H__
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+namespace SquaredDifference
+{
+
+/**
+ * @brief Initialize SquaredDifference operand using the contents of buffer
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @param[in] length The buffer length
+ * @return The void pointer for user data
+ */
+void *InitSquaredDifference(TfLiteContext *context, const char *buffer, size_t length);
+
+/**
+ * @brief Release any memory it might have allocated via 'InitSquaredDifference'
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @return N/A
+ */
+void FreeSquaredDifference(TfLiteContext *context, void *buffer);
+
+/**
+ * @brief Prepare the SquaredDifference operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node);
+
+/**
+ * @brief Evaluate the SquaredDifference operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node);
+
+} // namespace SquaredDifference
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_SQUARED_DIFFERENCE_H__
diff --git a/libs/tflite/include/tflite/ext/kernels/TensorFlowMax.h b/libs/tflite/include/tflite/ext/kernels/TensorFlowMax.h
new file mode 100644
index 000000000..d31d76483
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/kernels/TensorFlowMax.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TensorFlowMax.h
+ * @brief This file contains TensorFlowMax namespace and TensorFlowMax function definitions
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_MAX_H__
+#define __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_MAX_H__
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+namespace TensorFlowMax
+{
+
+/**
+ * @brief Initialize TensorFlowMax operand using the contents of buffer
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @param[in] length The buffer length
+ * @return The void pointer for user data
+ */
+void *InitTensorFlowMax(TfLiteContext *context, const char *buffer, size_t length);
+
+/**
+ * @brief Release any memory it might have allocated via 'InitTensorFlowMax'
+ * @param[in] context The TfLite context
+ * @param[in] buffer The buffer with contents
+ * @return N/A
+ */
+void FreeTensorFlowMax(TfLiteContext *context, void *buffer);
+
+/**
+ * @brief Prepare the TensorFlowMax operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus PrepareTensorFlowMax(TfLiteContext *context, TfLiteNode *node);
+
+/**
+ * @brief Evaluate the TensorFlowMax operand for execution
+ * @param[in] context The TfLite context
+ * @param[in] node The operand node
+ * @return The TfLite status
+ */
+TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node);
+
+} // namespace TensorFlowMax
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_MAX_H__
diff --git a/libs/tflite/include/tflite/ext/kernels/TensorFlowSum.h b/libs/tflite/include/tflite/ext/kernels/TensorFlowSum.h
new file mode 100644
index 000000000..66783cf41
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/kernels/TensorFlowSum.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_SUM_H__
+#define __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_SUM_H__
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+namespace TensorFlowSum
+{
+
+void *InitTensorFlowSum(TfLiteContext *context, const char *buffer, size_t length);
+void FreeTensorFlowSum(TfLiteContext *context, void *buffer);
+TfLiteStatus PrepareTensorFlowSum(TfLiteContext *context, TfLiteNode *node);
+TfLiteStatus EvalTensorFlowSum(TfLiteContext *context, TfLiteNode *node);
+
+} // namespace TensorFlowSum
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_TENSORFLOW_SUM_H__
diff --git a/libs/tflite/include/tflite/ext/kernels/register.h b/libs/tflite/include/tflite/ext/kernels/register.h
new file mode 100644
index 000000000..124af7abc
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/kernels/register.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This header is derived from the following file (in TensorFlow)
+// 'externals/tensorflow/tensorflow/contrib/lite/kernels/register.h'
+#ifndef __NNFW_TFLITE_EXT_KERNELS_REGISTER_H__
+#define __NNFW_TFLITE_EXT_KERNELS_REGISTER_H__
+
+#include <unordered_map>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace nnfw {
+namespace tflite {
+
+class BuiltinOpResolver : public ::tflite::MutableOpResolver {
+ public:
+ BuiltinOpResolver();
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_KERNELS_REGISTER_H__
+
+// clang-format on
diff --git a/libs/tflite/include/tflite/ext/nnapi_delegate.h b/libs/tflite/include/tflite/ext/nnapi_delegate.h
new file mode 100644
index 000000000..3aac01af7
--- /dev/null
+++ b/libs/tflite/include/tflite/ext/nnapi_delegate.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This header is derived from the following file (in TensorFlow v1.12)
+// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.h'
+#ifndef __NNFW_TFLITE_EXT_NNAPI_DELEGATE_H__
+#define __NNFW_TFLITE_EXT_NNAPI_DELEGATE_H__
+
+#include "tensorflow/contrib/lite/allocation.h"
+#ifdef OBS_BUILD
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#else
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#endif
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "NeuralNetworksShim.h"
+
+class ANeuralNetworksModel;
+class ANeuralNetworksMemory;
+class ANeuralNetworksCompilation;
+
+namespace nnfw {
+namespace tflite {
+
+class NNAPIAllocation : public ::tflite::MMAPAllocation {
+ public:
+ NNAPIAllocation(const char* filename, ::tflite::ErrorReporter* error_reporter);
+ ~NNAPIAllocation();
+
+ size_t offset(const void* ptr) const {
+ auto signed_offset = reinterpret_cast<const uint8_t*>(ptr) -
+ reinterpret_cast<const uint8_t*>(mmapped_buffer_);
+
+ return static_cast<size_t>(signed_offset);
+ }
+
+ ANeuralNetworksMemory* memory() const { return handle_; }
+ bool valid() const override { return handle_ != nullptr; }
+
+ private:
+ mutable ANeuralNetworksMemory* handle_ = nullptr;
+};
+
+class NNAPIDelegate {
+ public:
+ ~NNAPIDelegate();
+
+ // Convert a tflite graph to NNAPI
+ TfLiteStatus BuildGraph(::tflite::Interpreter* interpreter);
+
+ // Run
+ TfLiteStatus Invoke(::tflite::Interpreter* interpreter);
+
+ // Whether the current platform supports NNAPI delegation.
+ static bool IsSupported();
+
+ private:
+ // The NN API model handle
+ ANeuralNetworksModel* nn_model_ = nullptr;
+ // The NN API compilation handle
+ ANeuralNetworksCompilation* nn_compiled_model_ = nullptr;
+ // Model status
+ TfLiteStatus model_status_ = kTfLiteOk;
+
+ // List of state tensors for LSTM, RNN, SVDF.
+ // NN API does not allow ops to maintain states across multiple
+ // invocations. We need to manually create state input tensors from
+ // corresponding state output tensors of TFLite operations, and map them
+ // correctly.
+ std::vector<int> model_states_inputs_; // holds NNAPI operand ids
+ std::vector<int> model_states_outputs_; // holds TFLite tensor ids
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_EXT_NNAPI_DELEGATE_H__
+
+// clang-format on
diff --git a/libs/tflite/include/tflite/interp/Builder.h b/libs/tflite/include/tflite/interp/Builder.h
new file mode 100644
index 000000000..b4d082419
--- /dev/null
+++ b/libs/tflite/include/tflite/interp/Builder.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Builder.h
+ * @brief This file contains Builder structure
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_INTERP_BUILDER_H__
+#define __NNFW_TFLITE_INTERP_BUILDER_H__
+
+#include <tensorflow/contrib/lite/interpreter.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Structure to Builder
+ */
+struct Builder
+{
+ /**
+ * @brief Destroy the Builder object
+ */
+ virtual ~Builder() = default;
+
+ /**
+ * @brief Build a FlatBuffer model
+ * @return The TfLite interpreter object
+ */
+ virtual std::unique_ptr<::tflite::Interpreter> build(void) const = 0;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_INTERP_BUILDER_H__
diff --git a/libs/tflite/include/tflite/interp/FlatBufferBuilder.h b/libs/tflite/include/tflite/interp/FlatBufferBuilder.h
new file mode 100644
index 000000000..13470b8c5
--- /dev/null
+++ b/libs/tflite/include/tflite/interp/FlatBufferBuilder.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file FlatBufferBuilder.h
+ * @brief This file contains FlatBufferBuilder class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_INTERP_FLAT_BUFFER_BUILDER_H__
+#define __NNFW_TFLITE_INTERP_FLAT_BUFFER_BUILDER_H__
+
+#include <tensorflow/contrib/lite/model.h>
+
+#include "tflite/interp/Builder.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define FlatBufferBuilder which is inherited from Builder
+ */
+class FlatBufferBuilder final : public Builder
+{
+public:
+ /**
+ * @brief Construct a FlatBufferBuilder object with FlatBufferModel of TfLite
+ * @param[in] model The TfLite Flatbuffer model
+ */
+ FlatBufferBuilder(const ::tflite::FlatBufferModel &model) : _model{model}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Build a FlatBuffer model
+ * @return The TfLite interpreter pointer address
+ */
+ std::unique_ptr<::tflite::Interpreter> build(void) const override;
+
+private:
+ const ::tflite::FlatBufferModel &_model;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_INTERP_FLAT_BUFFER_BUILDER_H__
diff --git a/libs/tflite/include/tflite/interp/FunctionBuilder.h b/libs/tflite/include/tflite/interp/FunctionBuilder.h
new file mode 100644
index 000000000..064375939
--- /dev/null
+++ b/libs/tflite/include/tflite/interp/FunctionBuilder.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file FunctionBuilder.h
+ * @brief This file contains FunctionBuilder class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_TFLITE_INTERP_FUNCTION_BUILDER_H__
+#define __NNFW_TFLITE_INTERP_FUNCTION_BUILDER_H__
+
+#include <tensorflow/contrib/lite/model.h>
+
+#include "tflite/interp/Builder.h"
+
+namespace nnfw
+{
+namespace tflite
+{
+
+/**
+ * @brief Class to define FunctionBuilder which is inherited from Builder
+ */
+class FunctionBuilder final : public Builder
+{
+public:
+ using SetupFunc = std::function<void(::tflite::Interpreter &)>;
+
+public:
+ /**
+ * @brief Construct a FunctionBuilder object with SetupFunction
+ * @param[in] fn The SetupFunc object
+ */
+ FunctionBuilder(const SetupFunc &fn) : _fn{fn}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Build a SetupFunc
+ * @return The TfLite interpreter pointer address
+ */
+ std::unique_ptr<::tflite::Interpreter> build(void) const override;
+
+private:
+ SetupFunc _fn;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_INTERP_FUNCTION_BUILDER_H__
diff --git a/libs/support/tflite/src/Diff.cpp b/libs/tflite/src/Diff.cpp
index e875571cb..45ef06110 100644
--- a/libs/support/tflite/src/Diff.cpp
+++ b/libs/tflite/src/Diff.cpp
@@ -14,22 +14,22 @@
* limitations under the License.
*/
-#include "support/tflite/Diff.h"
-#include "support/tflite/nnapi_delegate.h"
+#include "tflite/Diff.h"
+#include "tflite/ext/nnapi_delegate.h"
-#include "util/fp32.h"
+#include "misc/fp32.h"
-#include "util/tensor/IndexIterator.h"
-#include "util/tensor/IndexFormatter.h"
-#include "util/tensor/Zipper.h"
-#include "util/tensor/Comparator.h"
+#include "misc/tensor/IndexIterator.h"
+#include "misc/tensor/IndexFormatter.h"
+#include "misc/tensor/Zipper.h"
+#include "misc/tensor/Comparator.h"
-#include "util/environment.h"
+#include "misc/environment.h"
#include <iostream>
#include <cassert>
-class DiffSummary : public nnfw::util::tensor::Comparator::Observer
+class DiffSummary : public nnfw::misc::tensor::Comparator::Observer
{
public:
DiffSummary()
@@ -41,21 +41,21 @@ public:
}
public:
- void notify(const nnfw::util::tensor::Index &index, float expected, float obtained) override;
+ void notify(const nnfw::misc::tensor::Index &index, float expected, float obtained) override;
public:
- nnfw::util::tensor::Index max_abs_diff_index;
+ nnfw::misc::tensor::Index max_abs_diff_index;
float max_abs_diff_expected;
float max_abs_diff_obtained;
float max_abs_diff_value;
- nnfw::util::tensor::Index max_rel_diff_index;
+ nnfw::misc::tensor::Index max_rel_diff_index;
float max_rel_diff_expected;
float max_rel_diff_obtained;
float max_rel_diff_value;
};
-void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected, float obtained)
+void DiffSummary::notify(const nnfw::misc::tensor::Index &index, float expected, float obtained)
{
const auto abs_diff_value = std::fabs(expected - obtained);
@@ -67,7 +67,7 @@ void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected,
max_abs_diff_obtained = obtained;
}
- const auto rel_diff_value = nnfw::util::fp32::relative_diff(expected, obtained);
+ const auto rel_diff_value = nnfw::misc::fp32::relative_diff(expected, obtained);
if (max_rel_diff_value < rel_diff_value)
{
@@ -79,15 +79,15 @@ void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected,
}
template <typename T>
-bool TfLiteInterpMatchApp::compareSingleTensorView(
- const nnfw::support::tflite::TensorView<T> &expected,
- const nnfw::support::tflite::TensorView<T> &obtained, int id) const
+bool TfLiteInterpMatchApp::compareSingleTensorView(const nnfw::tflite::TensorView<T> &expected,
+ const nnfw::tflite::TensorView<T> &obtained,
+ int id) const
{
- std::vector<nnfw::util::tensor::Diff<T>> diffs;
+ std::vector<nnfw::misc::tensor::Diff<T>> diffs;
assert(expected.shape() == obtained.shape());
- using nnfw::util::tensor::zip;
- using nnfw::util::tensor::Index;
+ using nnfw::misc::tensor::zip;
+ using nnfw::misc::tensor::Index;
zip(expected.shape(), expected, obtained)
<< [&](const Index &index, T expected_value, T obtained_value) {
@@ -113,7 +113,7 @@ bool TfLiteInterpMatchApp::compareSingleTensorView(
std::cout << " ---- Details ---" << std::endl;
for (const auto &diff : diffs)
{
- std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]"
+ std::cout << " Diff at [" << nnfw::misc::tensor::IndexFormatter(diff.index) << "]"
<< std::endl;
std::cout << " expected: " << diff.expected << std::endl;
std::cout << " obtained: " << diff.obtained << std::endl;
@@ -125,8 +125,8 @@ bool TfLiteInterpMatchApp::compareSingleTensorView(
template <>
bool TfLiteInterpMatchApp::compareSingleTensorView<float>(
- const nnfw::support::tflite::TensorView<float> &expected,
- const nnfw::support::tflite::TensorView<float> &obtained, int id) const
+ const nnfw::tflite::TensorView<float> &expected,
+ const nnfw::tflite::TensorView<float> &obtained, int id) const
{
DiffSummary summary;
@@ -148,7 +148,7 @@ bool TfLiteInterpMatchApp::compareSingleTensorView<float>(
if (summary.max_abs_diff_value > 0)
{
std::cout << " Max absolute diff at ["
- << nnfw::util::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl;
+ << nnfw::misc::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl;
std::cout << " expected: " << summary.max_abs_diff_expected << std::endl;
std::cout << " obtained: " << summary.max_abs_diff_obtained << std::endl;
std::cout << " absolute diff: " << summary.max_abs_diff_value << std::endl;
@@ -159,7 +159,7 @@ bool TfLiteInterpMatchApp::compareSingleTensorView<float>(
const auto tolerance_level = summary.max_rel_diff_value / FLT_EPSILON;
std::cout << " Max relative diff at ["
- << nnfw::util::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl;
+ << nnfw::misc::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl;
std::cout << " expected: " << summary.max_rel_diff_expected << std::endl;
std::cout << " obtained: " << summary.max_rel_diff_obtained << std::endl;
std::cout << " relative diff: " << summary.max_rel_diff_value << std::endl;
@@ -174,10 +174,10 @@ bool TfLiteInterpMatchApp::compareSingleTensorView<float>(
for (const auto &diff : diffs)
{
const auto absolute_diff = std::fabs(diff.expected - diff.obtained);
- const auto relative_diff = nnfw::util::fp32::relative_diff(diff.expected, diff.obtained);
+ const auto relative_diff = nnfw::misc::fp32::relative_diff(diff.expected, diff.obtained);
const auto tolerance_level = relative_diff / FLT_EPSILON;
- std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]"
+ std::cout << " Diff at [" << nnfw::misc::tensor::IndexFormatter(diff.index) << "]"
<< std::endl;
std::cout << " expected: " << diff.expected << std::endl;
std::cout << " obtained: " << diff.obtained << std::endl;
@@ -206,24 +206,32 @@ bool TfLiteInterpMatchApp::run(::tflite::Interpreter &interp, ::tflite::Interpre
comparators[kTfLiteUInt8] = [this](int id, ::tflite::Interpreter &interp,
::tflite::Interpreter &nnapi) {
- const auto expected = nnfw::support::tflite::TensorView<uint8_t>::make(interp, id);
- const auto obtained = nnfw::support::tflite::TensorView<uint8_t>::make(nnapi, id);
+ const auto expected = nnfw::tflite::TensorView<uint8_t>::make(interp, id);
+ const auto obtained = nnfw::tflite::TensorView<uint8_t>::make(nnapi, id);
return compareSingleTensorView(expected, obtained, id);
};
comparators[kTfLiteInt32] = [this](int id, ::tflite::Interpreter &interp,
::tflite::Interpreter &nnapi) {
- const auto expected = nnfw::support::tflite::TensorView<int32_t>::make(interp, id);
- const auto obtained = nnfw::support::tflite::TensorView<int32_t>::make(nnapi, id);
+ const auto expected = nnfw::tflite::TensorView<int32_t>::make(interp, id);
+ const auto obtained = nnfw::tflite::TensorView<int32_t>::make(nnapi, id);
return compareSingleTensorView(expected, obtained, id);
};
comparators[kTfLiteFloat32] = [this](int id, ::tflite::Interpreter &interp,
::tflite::Interpreter &nnapi) {
- const auto expected = nnfw::support::tflite::TensorView<float>::make(interp, id);
- const auto obtained = nnfw::support::tflite::TensorView<float>::make(nnapi, id);
+ const auto expected = nnfw::tflite::TensorView<float>::make(interp, id);
+ const auto obtained = nnfw::tflite::TensorView<float>::make(nnapi, id);
+
+ return compareSingleTensorView(expected, obtained, id);
+ };
+
+ comparators[kTfLiteBool] = [this](int id, ::tflite::Interpreter &interp,
+ ::tflite::Interpreter &nnapi) {
+ const auto expected = nnfw::tflite::TensorView<bool>::make(interp, id);
+ const auto obtained = nnfw::tflite::TensorView<bool>::make(nnapi, id);
return compareSingleTensorView(expected, obtained, id);
};
@@ -250,7 +258,7 @@ bool TfLiteInterpMatchApp::run(::tflite::Interpreter &interp, ::tflite::Interpre
return all_matched;
}
-#include "util/tensor/Object.h"
+#include "misc/tensor/Object.h"
using namespace std::placeholders;
@@ -265,11 +273,11 @@ template <> uint8_t RandomGenerator::generate<uint8_t>(void)
return static_cast<uint8_t>((_dist(_rand) - min_range) * type_range / (max_range - min_range));
}
-#include "support/tflite/TensorLogger.h"
+#include "tflite/TensorLogger.h"
//
// Random Test Runner
//
-int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
+int RandomTestRunner::run(const nnfw::tflite::Builder &builder)
{
auto tfl_interp = builder.build();
auto nnapi = builder.build();
@@ -293,15 +301,15 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
assert(tfl_interp->tensor(id)->type == kTfLiteInt32);
assert(nnapi->tensor(id)->type == kTfLiteInt32);
- auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id);
+ auto tfl_interp_view = nnfw::tflite::TensorView<int32_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<int32_t>::make(*nnapi, id);
assert(tfl_interp_view.shape() == nnapi_view.shape());
int32_t value = 0;
- nnfw::util::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::util::tensor::Index &ind) {
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
// TODO Generate random values
tfl_interp_view.at(ind) = value;
nnapi_view.at(ind) = value;
@@ -314,15 +322,15 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
assert(tfl_interp->tensor(id)->type == kTfLiteInt32);
assert(nnapi->tensor(id)->type == kTfLiteInt32);
- auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id);
+ auto tfl_interp_view = nnfw::tflite::TensorView<int32_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<int32_t>::make(*nnapi, id);
assert(tfl_interp_view.shape() == nnapi_view.shape());
int32_t value = 0;
- nnfw::util::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::util::tensor::Index &ind) {
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
// TODO Generate random values
tfl_interp_view.at(ind) = value;
nnapi_view.at(ind) = value;
@@ -333,20 +341,20 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
assert(tfl_interp->tensor(id)->type == kTfLiteUInt8);
assert(nnapi->tensor(id)->type == kTfLiteUInt8);
- auto tfl_interp_view = nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id);
+ auto tfl_interp_view = nnfw::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<uint8_t>::make(*nnapi, id);
assert(tfl_interp_view.shape() == nnapi_view.shape());
- auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
- const ::nnfw::util::tensor::Index &)>(
+ auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &,
+ const ::nnfw::misc::tensor::Index &)>(
&RandomGenerator::generate<uint8_t>);
- const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
+ const nnfw::misc::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
std::bind(fp, _randgen, _1, _2));
assert(tfl_interp_view.shape() == data.shape());
- nnfw::util::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::util::tensor::Index &ind) {
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
const auto value = data.at(ind);
tfl_interp_view.at(ind) = value;
@@ -358,22 +366,22 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
assert(tfl_interp->tensor(id)->type == kTfLiteUInt8);
assert(nnapi->tensor(id)->type == kTfLiteUInt8);
- auto tfl_interp_view = nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id);
+ auto tfl_interp_view = nnfw::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<uint8_t>::make(*nnapi, id);
assert(tfl_interp_view.shape() == nnapi_view.shape());
- auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
- const ::nnfw::util::tensor::Index &)>(
+ auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &,
+ const ::nnfw::misc::tensor::Index &)>(
&RandomGenerator::generate<uint8_t>);
- const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
+ const nnfw::misc::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
std::bind(fp, _randgen, _1, _2));
assert(tfl_interp_view.shape() == data.shape());
uint8_t value = 0;
- nnfw::util::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::util::tensor::Index &ind) {
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
tfl_interp_view.at(ind) = value;
nnapi_view.at(ind) = value;
};
@@ -383,21 +391,21 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
assert(tfl_interp->tensor(id)->type == kTfLiteFloat32);
assert(nnapi->tensor(id)->type == kTfLiteFloat32);
- auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id);
+ auto tfl_interp_view = nnfw::tflite::TensorView<float>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<float>::make(*nnapi, id);
assert(tfl_interp_view.shape() == nnapi_view.shape());
- auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
- const ::nnfw::util::tensor::Index &)>(
+ auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &,
+ const ::nnfw::misc::tensor::Index &)>(
&RandomGenerator::generate<float>);
- const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(),
+ const nnfw::misc::tensor::Object<float> data(tfl_interp_view.shape(),
std::bind(fp, _randgen, _1, _2));
assert(tfl_interp_view.shape() == data.shape());
- nnfw::util::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::util::tensor::Index &ind) {
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
const auto value = data.at(ind);
tfl_interp_view.at(ind) = value;
@@ -409,23 +417,75 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
assert(tfl_interp->tensor(id)->type == kTfLiteFloat32);
assert(nnapi->tensor(id)->type == kTfLiteFloat32);
- auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id);
+ auto tfl_interp_view = nnfw::tflite::TensorView<float>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<float>::make(*nnapi, id);
assert(tfl_interp_view.shape() == nnapi_view.shape());
- auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
- const ::nnfw::util::tensor::Index &)>(
+ auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &,
+ const ::nnfw::misc::tensor::Index &)>(
&RandomGenerator::generate<float>);
- const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(),
+ const nnfw::misc::tensor::Object<float> data(tfl_interp_view.shape(),
std::bind(fp, _randgen, _1, _2));
assert(tfl_interp_view.shape() == data.shape());
float value = 0;
- nnfw::util::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::util::tensor::Index &ind) {
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ };
+ };
+
+ initializers[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteBool);
+ assert(nnapi->tensor(id)->type == kTfLiteBool);
+
+ auto tfl_interp_view = nnfw::tflite::TensorView<bool>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<bool>::make(*nnapi, id);
+
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
+
+ auto fp = static_cast<bool (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &,
+ const ::nnfw::misc::tensor::Index &)>(
+ &RandomGenerator::generate<bool>);
+ const nnfw::misc::tensor::Object<bool> data(tfl_interp_view.shape(),
+ std::bind(fp, _randgen, _1, _2));
+
+ assert(tfl_interp_view.shape() == data.shape());
+
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
+ const auto value = data.at(ind);
+
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ };
+ };
+
+ reseters[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteBool);
+ assert(nnapi->tensor(id)->type == kTfLiteBool);
+
+ auto tfl_interp_view = nnfw::tflite::TensorView<bool>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::tflite::TensorView<bool>::make(*nnapi, id);
+
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
+
+ auto fp = static_cast<bool (RandomGenerator::*)(const ::nnfw::misc::tensor::Shape &,
+ const ::nnfw::misc::tensor::Index &)>(
+ &RandomGenerator::generate<bool>);
+ const nnfw::misc::tensor::Object<bool> data(tfl_interp_view.shape(),
+ std::bind(fp, _randgen, _1, _2));
+
+ assert(tfl_interp_view.shape() == data.shape());
+
+ bool value = false;
+
+ nnfw::misc::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
tfl_interp_view.at(ind) = value;
nnapi_view.at(ind) = value;
};
@@ -475,7 +535,7 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
}
else
{
- nnfw::NNAPIDelegate d;
+ nnfw::tflite::NNAPIDelegate d;
if (d.BuildGraph(nnapi.get()))
{
@@ -496,15 +556,15 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
auto equals = [tolerance](float lhs, float rhs) {
// NOTE Hybrid approach
// TODO Allow users to set tolerance for absolute_epsilon_equal
- if (nnfw::util::fp32::absolute_epsilon_equal(lhs, rhs))
+ if (nnfw::misc::fp32::absolute_epsilon_equal(lhs, rhs))
{
return true;
}
- return nnfw::util::fp32::epsilon_equal(lhs, rhs, tolerance);
+ return nnfw::misc::fp32::epsilon_equal(lhs, rhs, tolerance);
};
- nnfw::util::tensor::Comparator comparator(equals);
+ nnfw::misc::tensor::Comparator comparator(equals);
TfLiteInterpMatchApp app(comparator);
app.verbose() = _param.verbose;
@@ -519,7 +579,7 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
std::cout << "[NNAPI TEST] PASSED" << std::endl;
if (_param.tensor_logging)
- nnfw::support::tflite::TensorLogger::instance().save(_param.log_path, *tfl_interp);
+ nnfw::tflite::TensorLogger::instance().save(_param.log_path, *tfl_interp);
return 0;
}
@@ -531,8 +591,8 @@ RandomTestRunner RandomTestRunner::make(int seed)
param.verbose = 0;
param.tolerance = 1;
- nnfw::util::env::IntAccessor("VERBOSE").access(param.verbose);
- nnfw::util::env::IntAccessor("TOLERANCE").access(param.tolerance);
+ nnfw::misc::env::IntAccessor("VERBOSE").access(param.verbose);
+ nnfw::misc::env::IntAccessor("TOLERANCE").access(param.tolerance);
return RandomTestRunner{seed, param};
}
diff --git a/libs/support/tflite/src/FeatureView.cpp b/libs/tflite/src/FeatureView.cpp
index 4c7636780..fdf5a4b00 100644
--- a/libs/support/tflite/src/FeatureView.cpp
+++ b/libs/tflite/src/FeatureView.cpp
@@ -14,21 +14,19 @@
* limitations under the License.
*/
-#include "support/tflite/FeatureView.h"
-#include "support/tflite/TensorUtils.h"
+#include "tflite/FeatureView.h"
+#include "tflite/TensorUtils.h"
#include <cassert>
namespace nnfw
{
-namespace support
-{
namespace tflite
{
-nnfw::util::feature::Shape getFeatureShape(const TfLiteTensor *tensor)
+nnfw::misc::feature::Shape getFeatureShape(const TfLiteTensor *tensor)
{
- nnfw::util::feature::Shape shape{tensor->dims->data[3], tensor->dims->data[1],
+ nnfw::misc::feature::Shape shape{tensor->dims->data[3], tensor->dims->data[1],
tensor->dims->data[2]};
return shape;
@@ -69,5 +67,4 @@ float &FeatureView<float>::at(uint32_t ch, uint32_t row, uint32_t col)
}
} // namespace tflite
-} // namespace support
} // namespace nnfw
diff --git a/libs/support/tflite/src/Quantization.cpp b/libs/tflite/src/Quantization.cpp
index b23204d41..9c162c342 100644
--- a/libs/support/tflite/src/Quantization.cpp
+++ b/libs/tflite/src/Quantization.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "support/tflite/Quantization.h"
+#include "tflite/Quantization.h"
TfLiteQuantizationParams make_default_quantization(void)
{
diff --git a/libs/support/tflite/src/TensorShapeUtils.cpp b/libs/tflite/src/TensorShapeUtils.cpp
index 611ba920e..b5d906719 100644
--- a/libs/support/tflite/src/TensorShapeUtils.cpp
+++ b/libs/tflite/src/TensorShapeUtils.cpp
@@ -1,14 +1,12 @@
-#include "support/tflite/TensorShapeUtils.h"
+#include "tflite/TensorShapeUtils.h"
namespace nnfw
{
-namespace support
-{
namespace tflite
{
-nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape,
- const nnfw::util::tensor::Shape &rhs_shape)
+nnfw::misc::tensor::Shape broadcast(const nnfw::misc::tensor::Shape &lhs_shape,
+ const nnfw::misc::tensor::Shape &rhs_shape)
{
const uint32_t lhs_rank = lhs_shape.rank();
const uint32_t rhs_rank = rhs_shape.rank();
@@ -36,7 +34,7 @@ nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape,
rhs_normalized_dims.emplace_back(rhs_shape.dim(axis));
}
- nnfw::util::tensor::Shape out_shape(out_rank);
+ nnfw::misc::tensor::Shape out_shape(out_rank);
for (uint32_t axis = 0; axis < out_rank; ++axis)
{
@@ -47,5 +45,4 @@ nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape,
}
} // namespace tflite
-} // namespace support
} // namespace nnfw
diff --git a/libs/tflite/src/TensorView.test.cpp b/libs/tflite/src/TensorView.test.cpp
new file mode 100644
index 000000000..c710b3c33
--- /dev/null
+++ b/libs/tflite/src/TensorView.test.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tflite/TensorView.h"
+
+#include <cassert>
+
+void int_test(void)
+{
+ int value[6] = {1, 2, 3, 4, 5, 6};
+
+ const nnfw::misc::tensor::Shape shape{2, 3};
+ const nnfw::tflite::TensorView<int> view{shape, value};
+
+ assert(view.at(nnfw::misc::tensor::Index{0, 0}) == 1);
+ assert(view.at(nnfw::misc::tensor::Index{0, 1}) == 2);
+ assert(view.at(nnfw::misc::tensor::Index{0, 2}) == 3);
+ assert(view.at(nnfw::misc::tensor::Index{1, 0}) == 4);
+ assert(view.at(nnfw::misc::tensor::Index{1, 1}) == 5);
+ assert(view.at(nnfw::misc::tensor::Index{1, 2}) == 6);
+}
+
+int main(int argc, char **argv)
+{
+ float value[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+
+ const nnfw::misc::tensor::Shape shape{2, 3};
+ const nnfw::tflite::TensorView<float> view{shape, value};
+
+ assert(view.at(nnfw::misc::tensor::Index{0, 0}) == 1.0f);
+ assert(view.at(nnfw::misc::tensor::Index{0, 1}) == 2.0f);
+ assert(view.at(nnfw::misc::tensor::Index{0, 2}) == 3.0f);
+ assert(view.at(nnfw::misc::tensor::Index{1, 0}) == 4.0f);
+ assert(view.at(nnfw::misc::tensor::Index{1, 1}) == 5.0f);
+ assert(view.at(nnfw::misc::tensor::Index{1, 2}) == 6.0f);
+
+ int_test();
+
+ return 0;
+}
diff --git a/libs/tflite/src/ext/kernels/Abs.cpp b/libs/tflite/src/ext/kernels/Abs.cpp
new file mode 100644
index 000000000..7e9c2338d
--- /dev/null
+++ b/libs/tflite/src/ext/kernels/Abs.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tflite/ext/kernels/Abs.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <iostream>
+#include <cmath>
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+namespace Abs
+{
+
+void *InitAbs(TfLiteContext *context, const char *buffer, size_t length) { return nullptr; }
+
+void FreeAbs(TfLiteContext *context, void *buffer) {}
+
+TfLiteStatus PrepareAbs(TfLiteContext *context, TfLiteNode *node)
+{
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1);
+
+ const TfLiteTensor *input = ::tflite::GetInput(context, node, 0);
+ TfLiteTensor *output = ::tflite::GetOutput(context, node, 0);
+
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus EvalAbs(TfLiteContext *context, TfLiteNode *node)
+{
+ const TfLiteTensor *input = ::tflite::GetInput(context, node, 0);
+ TfLiteTensor *output = ::tflite::GetOutput(context, node, 0);
+ size_t elements = ::tflite::NumElements(input);
+ switch (input->type)
+ {
+ case kTfLiteFloat32:
+ {
+ auto *in = input->data.f;
+ auto *in_end = in + elements;
+ auto *out = output->data.f;
+ for (; in < in_end; in++, out++)
+ *out = std::abs(*in);
+ return kTfLiteOk;
+ }
+ case kTfLiteInt32:
+ {
+ auto *in = input->data.i32;
+ auto *in_end = in + elements;
+ auto *out = output->data.i32;
+ for (; in < in_end; in++, out++)
+ *out = std::abs(*in);
+ return kTfLiteOk;
+ }
+ case kTfLiteInt64:
+ {
+ auto *in = input->data.i64;
+ auto *in_end = in + elements;
+ auto *out = output->data.i64;
+ for (; in < in_end; in++, out++)
+ *out = std::abs(*in);
+ return kTfLiteOk;
+ }
+ case kTfLiteUInt8:
+ {
+ auto *in = input->data.uint8;
+ auto *in_end = in + elements;
+ auto *out = output->data.uint8;
+ for (; in < in_end; in++, out++)
+ *out = std::abs(*in);
+ return kTfLiteOk;
+ }
+ default:
+ {
+ context->ReportError(context, "Input type %d is not supported", input->type);
+ return kTfLiteError;
+ }
+ }
+}
+
+} // namespace Abs
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
diff --git a/libs/support/tflite/src/kernels/SquaredDifference.cpp b/libs/tflite/src/ext/kernels/SquaredDifference.cpp
index 25e10a8ed..8ac2b1de0 100644
--- a/libs/support/tflite/src/kernels/SquaredDifference.cpp
+++ b/libs/tflite/src/ext/kernels/SquaredDifference.cpp
@@ -14,19 +14,17 @@
* limitations under the License.
*/
-#include "support/tflite/kernels/SquaredDifference.h"
+#include "tflite/ext/kernels/SquaredDifference.h"
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <iostream>
-namespace tflite
+namespace nnfw
{
-namespace ops
+namespace tflite
{
namespace custom
{
-namespace nnfw
-{
namespace SquaredDifference
{
@@ -39,12 +37,12 @@ void FreeSquaredDifference(TfLiteContext *context, void *buffer) {}
TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node)
{
- TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
- TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1);
- const TfLiteTensor *input1 = GetInput(context, node, 0);
- const TfLiteTensor *input2 = GetInput(context, node, 1);
- TfLiteTensor *output = GetOutput(context, node, 0);
+ const TfLiteTensor *input1 = ::tflite::GetInput(context, node, 0);
+ const TfLiteTensor *input2 = ::tflite::GetInput(context, node, 1);
+ TfLiteTensor *output = ::tflite::GetOutput(context, node, 0);
TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
TF_LITE_ENSURE_EQ(context, input1->type, output->type);
@@ -55,12 +53,12 @@ TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node)
TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node)
{
- const TfLiteTensor *input1 = GetInput(context, node, 0);
- const TfLiteTensor *input2 = GetInput(context, node, 1);
+ const TfLiteTensor *input1 = ::tflite::GetInput(context, node, 0);
+ const TfLiteTensor *input2 = ::tflite::GetInput(context, node, 1);
- TfLiteTensor *output = GetOutput(context, node, 0);
+ TfLiteTensor *output = ::tflite::GetOutput(context, node, 0);
- size_t elements = NumElements(input1);
+ size_t elements = ::tflite::NumElements(input1);
switch (input1->type)
{
@@ -109,7 +107,6 @@ TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node)
}
} // namespace SquaredDifference
-} // nnfw
} // namespace custom
-} // namespace ops
} // namespace tflite
+} // namespace nnfw
diff --git a/libs/support/tflite/src/kernels/TensorFlowMax.cpp b/libs/tflite/src/ext/kernels/TensorFlowMax.cpp
index abc6fda4e..d72ad242c 100644
--- a/libs/support/tflite/src/kernels/TensorFlowMax.cpp
+++ b/libs/tflite/src/ext/kernels/TensorFlowMax.cpp
@@ -14,19 +14,17 @@
* limitations under the License.
*/
-#include "support/tflite/kernels/TensorFlowMax.h"
+#include "tflite/ext/kernels/TensorFlowMax.h"
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <iostream>
-namespace tflite
+namespace nnfw
{
-namespace ops
+namespace tflite
{
namespace custom
{
-namespace nnfw
-{
namespace TensorFlowMax
{
@@ -34,9 +32,9 @@ struct TensorFlowMaxOp
{
TensorFlowMaxOp(TfLiteContext *context, TfLiteNode *node)
{
- input = tflite::GetInput(context, node, 0);
- axis = tflite::GetInput(context, node, 1);
- output = tflite::GetOutput(context, node, 0);
+ input = ::tflite::GetInput(context, node, 0);
+ axis = ::tflite::GetInput(context, node, 1);
+ output = ::tflite::GetOutput(context, node, 0);
}
const TfLiteTensor *input;
const TfLiteTensor *axis;
@@ -62,16 +60,16 @@ TfLiteStatus ResizeTempAxis(TfLiteContext *context, TensorFlowMaxOp *op_context,
TfLiteTensor *resolved_axis)
{
TfLiteIntArray *axis_size = TfLiteIntArrayCreate(1);
- axis_size->data[0] = static_cast<int>(tflite::NumElements(op_context->axis));
+ axis_size->data[0] = static_cast<int>(::tflite::NumElements(op_context->axis));
return context->ResizeTensor(context, resolved_axis, axis_size);
}
// Resizes output array based on the input size and resolved axis.
TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowMaxOp *op_context)
{
- size_t num_axis = tflite::NumElements(op_context->axis);
- const TfLiteIntArray *input_dims = op_context->input->dims;
- int input_num_dims = tflite::NumDimensions(op_context->input);
+ size_t num_axis = ::tflite::NumElements(op_context->axis);
+ TfLiteIntArray *input_dims = op_context->input->dims;
+ int input_num_dims = ::tflite::NumDimensions(op_context->input);
const int *axis = op_context->axis->data.i32;
{
@@ -100,26 +98,43 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowMaxOp *op_cont
}
}
// Determines output dimensions.
- TfLiteIntArray *output_dims = TfLiteIntArrayCreate(input_num_dims - num_reduce_axis);
- int num_skip_axis = 0;
- for (int idx = 0; idx < input_num_dims; ++idx)
+ int output_num_dims = ::tflite::NumDimensions(op_context->output);
+ TF_LITE_ENSURE(context, (input_num_dims == output_num_dims) ||
+ (input_num_dims - num_reduce_axis == output_num_dims));
+
+ if (input_num_dims == output_num_dims)
{
- bool is_axis = false;
+ TfLiteIntArray *output_dims = TfLiteIntArrayCopy(input_dims);
for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
{
- if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx)
- {
- ++num_skip_axis;
- is_axis = true;
- break;
- }
+ int current = axis[axis_idx];
+ output_dims->data[current] = 1;
}
- if (!is_axis)
+ return context->ResizeTensor(context, op_context->output, output_dims);
+ }
+ else
+ {
+ TfLiteIntArray *output_dims = TfLiteIntArrayCreate(output_num_dims);
+ int num_skip_axis = 0;
+ for (int idx = 0; idx < input_num_dims; ++idx)
{
- output_dims->data[idx - num_skip_axis] = input_dims->data[idx];
+ bool is_axis = false;
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx)
+ {
+ ++num_skip_axis;
+ is_axis = true;
+ break;
+ }
+ }
+ if (!is_axis)
+ {
+ output_dims->data[idx - num_skip_axis] = input_dims->data[idx];
+ }
}
+ return context->ResizeTensor(context, op_context->output, output_dims);
}
- return context->ResizeTensor(context, op_context->output, output_dims);
}
}
@@ -136,7 +151,7 @@ TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node,
scratch_tensor->type = kTfLiteInt32;
scratch_tensor->allocation_type = kTfLiteArenaRw;
TfLiteIntArray *index_size = TfLiteIntArrayCreate(1);
- index_size->data[0] = tflite::NumDimensions(op_context->input);
+ index_size->data[0] = ::tflite::NumDimensions(op_context->input);
TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor, index_size));
// Creates a temp tensor to store resolved axis given input data.
@@ -148,18 +163,18 @@ TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node,
TfLiteStatus PrepareTensorFlowMax(TfLiteContext *context, TfLiteNode *node)
{
- TF_LITE_ENSURE_EQ(context, tflite::NumInputs(node), 2);
- TF_LITE_ENSURE_EQ(context, tflite::NumOutputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1);
TensorFlowMaxOp op_context(context, node);
TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
// Leaves work to Eval if axis is not constant; else resizes output.
- if (!tflite::IsConstantTensor(op_context.axis))
+ if (!::tflite::IsConstantTensor(op_context.axis))
{
- tflite::SetTensorToDynamic(op_context.output);
- tflite::SetTensorToDynamic(resolved_axis);
+ ::tflite::SetTensorToDynamic(op_context.output);
+ ::tflite::SetTensorToDynamic(resolved_axis);
return kTfLiteOk;
}
resolved_axis->allocation_type = kTfLiteArenaRw;
@@ -336,11 +351,11 @@ TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node)
{
TensorFlowMaxOp op_context(context, node);
- int num_axis = static_cast<int>(tflite::NumElements(op_context.axis));
+ int num_axis = static_cast<int>(::tflite::NumElements(op_context.axis));
TfLiteTensor *temp_index = &context->tensors[node->temporaries->data[0]];
TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
// Resize the output tensor if the output tensor is dynamic.
- if (tflite::IsDynamicTensor(op_context.output))
+ if (::tflite::IsDynamicTensor(op_context.output))
{
TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis));
TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
@@ -383,8 +398,8 @@ TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node)
return returnStatus;
}
+
} // namespace TensorFlowMax
-} // namespace nnfw
} // namespace custom
-} // namespace ops
} // namespace tflite
+} // namespace nnfw
diff --git a/libs/tflite/src/ext/kernels/TensorFlowSum.cpp b/libs/tflite/src/ext/kernels/TensorFlowSum.cpp
new file mode 100644
index 000000000..cbf97970c
--- /dev/null
+++ b/libs/tflite/src/ext/kernels/TensorFlowSum.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tflite/ext/kernels/TensorFlowSum.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <iostream>
+
+namespace nnfw
+{
+namespace tflite
+{
+namespace custom
+{
+namespace TensorFlowSum
+{
+
+struct TensorFlowSumOp
+{
+ TensorFlowSumOp(TfLiteContext *context, TfLiteNode *node)
+ {
+ input = ::tflite::GetInput(context, node, 0);
+ axis = ::tflite::GetInput(context, node, 1);
+ output = ::tflite::GetOutput(context, node, 0);
+ }
+ const TfLiteTensor *input;
+ const TfLiteTensor *axis;
+ TfLiteTensor *output;
+};
+
+void *InitTensorFlowSum(TfLiteContext *context, const char *buffer, size_t length)
+{
+ // Creates two temp tensors to store index and axis for internal
+ // implementation only.
+ auto *scratch_tensor_index = new int;
+ context->AddTensors(context, 2, scratch_tensor_index);
+ return scratch_tensor_index;
+}
+
+void FreeTensorFlowSum(TfLiteContext *context, void *buffer)
+{
+ delete static_cast<TensorFlowSumOp *>(buffer);
+}
+
+// Resizes the temp tensor that stores resolved axis.
+TfLiteStatus ResizeTempAxis(TfLiteContext *context, TensorFlowSumOp *op_context,
+ TfLiteTensor *resolved_axis)
+{
+ TfLiteIntArray *axis_size = TfLiteIntArrayCreate(1);
+ axis_size->data[0] = static_cast<int>(::tflite::NumElements(op_context->axis));
+ return context->ResizeTensor(context, resolved_axis, axis_size);
+}
+
+// Resizes output array based on the input size and resolved axis.
+TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowSumOp *op_context)
+{
+ size_t num_axis = ::tflite::NumElements(op_context->axis);
+ TfLiteIntArray *input_dims = op_context->input->dims;
+ int input_num_dims = ::tflite::NumDimensions(op_context->input);
+ const int *axis = op_context->axis->data.i32;
+
+ {
+ // Calculates size of reducing axis.
+ int num_reduce_axis = num_axis;
+ for (int i = 0; i < num_axis; ++i)
+ {
+ int current = axis[i];
+ if (current < 0)
+ {
+ current += input_num_dims;
+ }
+ TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims);
+ for (int j = 0; j < i; ++j)
+ {
+ int previous = axis[j];
+ if (previous < 0)
+ {
+ previous += input_num_dims;
+ }
+ if (current == previous)
+ {
+ --num_reduce_axis;
+ break;
+ }
+ }
+ }
+ // Determines output dimensions.
+ int output_num_dims = ::tflite::NumDimensions(op_context->output);
+ TF_LITE_ENSURE(context, (input_num_dims == output_num_dims) ||
+ (input_num_dims - num_reduce_axis == output_num_dims));
+
+ if (input_num_dims == output_num_dims)
+ {
+ TfLiteIntArray *output_dims = TfLiteIntArrayCopy(input_dims);
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ int current = axis[axis_idx];
+ output_dims->data[current] = 1;
+ }
+ return context->ResizeTensor(context, op_context->output, output_dims);
+ }
+ else
+ {
+ TfLiteIntArray *output_dims = TfLiteIntArrayCreate(output_num_dims);
+ int num_skip_axis = 0;
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ bool is_axis = false;
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx)
+ {
+ ++num_skip_axis;
+ is_axis = true;
+ break;
+ }
+ }
+ if (!is_axis)
+ {
+ output_dims->data[idx - num_skip_axis] = input_dims->data[idx];
+ }
+ }
+ return context->ResizeTensor(context, op_context->output, output_dims);
+ }
+ }
+}
+
+// Initializes temp tensors to store index and resolved axis.
+TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node,
+ TensorFlowSumOp *op_context)
+{
+ // Creates a temp index to iterate through input data.
+ int *scratch_tensor_index = reinterpret_cast<int *>(node->user_data);
+ TfLiteIntArrayFree(node->temporaries);
+ node->temporaries = TfLiteIntArrayCreate(2);
+ node->temporaries->data[0] = *scratch_tensor_index;
+ TfLiteTensor *scratch_tensor = &context->tensors[node->temporaries->data[0]];
+ scratch_tensor->type = kTfLiteInt32;
+ scratch_tensor->allocation_type = kTfLiteArenaRw;
+ TfLiteIntArray *index_size = TfLiteIntArrayCreate(1);
+ index_size->data[0] = ::tflite::NumDimensions(op_context->input);
+ TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor, index_size));
+
+ // Creates a temp tensor to store resolved axis given input data.
+ node->temporaries->data[1] = *scratch_tensor_index + 1;
+ TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
+ resolved_axis->type = kTfLiteInt32;
+ return kTfLiteOk;
+}
+
+TfLiteStatus PrepareTensorFlowSum(TfLiteContext *context, TfLiteNode *node)
+{
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, ::tflite::NumOutputs(node), 1);
+
+ TensorFlowSumOp op_context(context, node);
+ TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
+
+ TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
+ // Leaves work to Eval if axis is not constant; else resizes output.
+ if (!::tflite::IsConstantTensor(op_context.axis))
+ {
+ ::tflite::SetTensorToDynamic(op_context.output);
+ ::tflite::SetTensorToDynamic(resolved_axis);
+ return kTfLiteOk;
+ }
+ resolved_axis->allocation_type = kTfLiteArenaRw;
+ TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis));
+ return ResizeOutputTensor(context, &op_context);
+}
+
+// Gets offset of index if expanded on axis. When expanded, the flattened offset
+// will not change, if the output index changes on the given axis. For example,
+// if you have a 2D tensor and you are expanding to 3D on axis 0,
+// then index (0, 1, 2) and index (1, 1, 2) will map from the same flattened
+// offset.
+inline size_t ExpandedInputOffset(const int num_dims, const int *dims, const int *index,
+ const int num_axis, const int *axis)
+{
+ size_t offset = 0;
+ int out_idx = 0;
+ for (int in_idx = 0; in_idx < num_dims; ++in_idx)
+ {
+ // if we need to expand this axis
+ bool is_axis = false;
+ if (axis != nullptr)
+ {
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (in_idx == axis[axis_idx])
+ {
+ is_axis = true;
+ break;
+ }
+ }
+ }
+ if (!is_axis)
+ {
+ offset = offset * static_cast<size_t>(dims[in_idx]) + static_cast<size_t>(index[out_idx]);
+ out_idx++;
+ }
+ else
+ {
+ offset = offset * static_cast<size_t>(dims[in_idx]);
+ }
+ }
+ return offset;
+}
+
+// Gets offset of index if reducing on axis. When reducing, the flattened offset
+// will not change, if the input index changes on the given axis. For example,
+// if you have a 3D tensor and you are reducing to 2D by eliminating axis 0,
+// then index (0, 1, 2) and index (1, 1, 2) will map to the same flattened
+// offset.
+// TODO(kanlig): use Dims to represent dimensions.
+inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
+ const int num_axis, const int *axis)
+{
+ size_t offset = 0;
+ for (int idx = 0; idx < num_dims; ++idx)
+ {
+ // if we need to skip this axis
+ bool is_axis = false;
+ if (axis != nullptr)
+ {
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (idx == axis[axis_idx])
+ {
+ is_axis = true;
+ break;
+ }
+ }
+ }
+ if (!is_axis)
+ {
+ offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
+ }
+ }
+ return offset;
+}
+
+// Gets next index to iterate through a multidimensional array.
+inline bool NextIndex(TfLiteContext *context, const int num_dims, const int *dims, int *current)
+{
+ int carry = 1;
+ for (int idx = num_dims - 1; idx >= 0; --idx)
+ {
+ int current_val = current[idx] + carry;
+ TF_LITE_ENSURE(context, (dims[idx] >= current_val));
+ if (dims[idx] == current_val)
+ {
+ current[idx] = 0;
+ }
+ else
+ {
+ current[idx] = current_val;
+ carry = 0;
+ break;
+ }
+ }
+ return (carry == 0);
+}
+
+template <typename T>
+inline TfLiteStatus
+CustomSum(TfLiteContext *context, T *input_data, const int *input_dims, const int input_num_dims,
+ T *output_data, const int *output_dims, const int output_num_dims, const int *axis,
+ const int num_axis_dimensions, bool keep_dims, int *temp_index, int *resolved_axis)
+{
+ // resolves axis.
+ int num_resolved_axis = 0;
+ for (int idx = 0; idx < num_axis_dimensions; ++idx)
+ {
+ int current = axis[idx];
+ TF_LITE_ENSURE(context, (current < input_num_dims && current + input_num_dims >= 0));
+ if (current < 0)
+ {
+ current += input_num_dims;
+ }
+ bool is_dup = false;
+ for (int j = 0; j < num_resolved_axis; ++j)
+ {
+ if (resolved_axis[j] == current)
+ {
+ is_dup = true;
+ break;
+ }
+ }
+ if (!is_dup)
+ {
+ resolved_axis[num_resolved_axis++] = current;
+ }
+ }
+
+ TF_LITE_ENSURE(context, (input_num_dims > 0));
+ TF_LITE_ENSURE(context, (input_dims != nullptr));
+ TF_LITE_ENSURE(context, (temp_index != nullptr));
+
+ // resets output data.
+ for (int idx = 0; idx < output_num_dims; ++idx)
+ {
+ temp_index[idx] = 0;
+ }
+ for (bool has_next = true; has_next;
+ has_next = NextIndex(context, output_num_dims, output_dims, temp_index))
+ {
+ size_t output_offset =
+ ReducedOutputOffset(output_num_dims, output_dims, temp_index, 0, nullptr);
+ output_data[output_offset] = 0;
+ }
+
+ // resets temp index.
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ temp_index[idx] = 0;
+ }
+
+ // iterates through input_data.
+ for (bool has_next = true; has_next;
+ has_next = NextIndex(context, input_num_dims, input_dims, temp_index))
+ {
+ size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
+ size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index,
+ num_resolved_axis, resolved_axis);
+ output_data[output_offset] += input_data[input_offset];
+ }
+
+ return kTfLiteOk;
+}
+
+TfLiteStatus EvalTensorFlowSum(TfLiteContext *context, TfLiteNode *node)
+{
+
+ TensorFlowSumOp op_context(context, node);
+ int num_axis = static_cast<int>(::tflite::NumElements(op_context.axis));
+ TfLiteTensor *temp_index = &context->tensors[node->temporaries->data[0]];
+ TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
+ // Resize the output tensor if the output tensor is dynamic.
+ if (::tflite::IsDynamicTensor(op_context.output))
+ {
+ TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis));
+ TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+ }
+
+ TfLiteStatus returnStatus = kTfLiteOk;
+ switch (op_context.input->type)
+ {
+ case kTfLiteFloat32:
+ returnStatus = CustomSum<float>(
+ context, op_context.input->data.f, op_context.input->dims->data,
+ op_context.input->dims->size, op_context.output->data.f, op_context.output->dims->data,
+ op_context.output->dims->size, op_context.axis->data.i32, num_axis, false,
+ temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ case kTfLiteInt32:
+ returnStatus = CustomSum<int>(context, op_context.input->data.i32,
+ op_context.input->dims->data, op_context.input->dims->size,
+ op_context.output->data.i32, op_context.output->dims->data,
+ op_context.output->dims->size, op_context.axis->data.i32,
+ num_axis, false, temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ case kTfLiteUInt8:
+ returnStatus = CustomSum<uint8_t>(
+ context, op_context.input->data.uint8, op_context.input->dims->data,
+ op_context.input->dims->size, op_context.output->data.uint8,
+ op_context.output->dims->data, op_context.output->dims->size, op_context.axis->data.i32,
+ num_axis, false, temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ case kTfLiteInt64:
+ returnStatus = CustomSum<int64_t>(
+ context, op_context.input->data.i64, op_context.input->dims->data,
+ op_context.input->dims->size, op_context.output->data.i64, op_context.output->dims->data,
+ op_context.output->dims->size, op_context.axis->data.i32, num_axis, false,
+ temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ default:
+ returnStatus = kTfLiteError;
+ }
+
+ return returnStatus;
+}
+
+} // namespace TensorFlowSum
+} // namespace custom
+} // namespace tflite
+} // namespace nnfw
diff --git a/libs/support/tflite/src/kernels/register.cpp b/libs/tflite/src/ext/kernels/register.cpp
index 6700b4de4..b822bd616 100644
--- a/libs/support/tflite/src/kernels/register.cpp
+++ b/libs/tflite/src/ext/kernels/register.cpp
@@ -14,18 +14,17 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
// NOTE This code is derived from the following file (in TensorFlow)
// 'externals/tensorflow/tensorflow/contrib/lite/kernels/register.cc'
-#include "support/tflite/kernels/register.h"
-#include "support/tflite/kernels/CustomOps.h"
+#include "tflite/ext/kernels/register.h"
+#include "tflite/ext/kernels/CustomOps.h"
-// TODO Use namespace nnfw
-namespace tflite
-{
-namespace ops
-{
-namespace builtin
-{
+namespace tflite {
+namespace ops {
+namespace builtin {
TfLiteRegistration *Register_RELU();
TfLiteRegistration *Register_RELU_N1_TO_1();
@@ -91,9 +90,41 @@ TfLiteRegistration *Register_SLICE();
TfLiteRegistration *Register_SIN();
TfLiteRegistration *Register_TRANSPOSE_CONV();
TfLiteRegistration *Register_SPARSE_TO_DENSE();
+#ifndef OBS_BUILD
+TfLiteRegistration *Register_SUM();
+TfLiteRegistration *Register_REDUCE_MAX();
+TfLiteRegistration *Register_REDUCE_MIN();
+TfLiteRegistration *Register_EQUAL();
+TfLiteRegistration *Register_NOT_EQUAL();
+TfLiteRegistration *Register_SQRT();
+TfLiteRegistration *Register_RSQRT();
+TfLiteRegistration *Register_SHAPE();
+TfLiteRegistration *Register_POW();
+TfLiteRegistration *Register_FAKE_QUANT();
+TfLiteRegistration *Register_PACK();
+TfLiteRegistration *Register_ONE_HOT();
+TfLiteRegistration *Register_LOGICAL_OR();
+TfLiteRegistration *Register_LOGICAL_AND();
+TfLiteRegistration *Register_LOGICAL_NOT();
+TfLiteRegistration *Register_UNPACK();
+TfLiteRegistration *Register_FLOOR_DIV();
+TfLiteRegistration *Register_SQUARE();
+TfLiteRegistration *Register_ZEROS_LIKE();
+#endif // OBS_BUILD
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
+
+namespace nnfw {
+namespace tflite {
BuiltinOpResolver::BuiltinOpResolver()
{
+ // Using namespace directive to minimize diff with upstream tensorflow
+ using namespace ::tflite::ops::builtin;
+ using namespace ::tflite;
+
AddBuiltin(BuiltinOperator_RELU, Register_RELU());
AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
@@ -156,14 +187,35 @@ BuiltinOpResolver::BuiltinOpResolver()
AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+#ifndef OBS_BUILD
+ AddBuiltin(BuiltinOperator_SUM, Register_SUM());
+ AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX());
+ AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN());
AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
+ AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
+ AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL());
+ AddBuiltin(BuiltinOperator_SQRT, Register_SQRT());
+ AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT());
+ AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
+ AddBuiltin(BuiltinOperator_POW, Register_POW());
+ AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
+ AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+ AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
+ AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
+ AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
+ AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT());
+ AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK());
+ AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV());
+ AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE());
+ AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE());
+#endif // OBS_BUILD
- AddCustom("TensorFlowMax", tflite::ops::custom::nnfw::Register_TensorFlowMax());
- AddCustom("RSQRT", tflite::ops::custom::nnfw::Register_RSQRT());
- AddCustom("SquaredDifference", tflite::ops::custom::nnfw::Register_SquaredDifference());
+ AddCustom("TensorFlowMax", nnfw::tflite::custom::Register_TensorFlowMax());
+ AddCustom("SquaredDifference", nnfw::tflite::custom::Register_SquaredDifference());
+ AddCustom("TensorFlowSum", nnfw::tflite::custom::Register_TensorFlowSum());
+ AddCustom("Abs", nnfw::tflite::custom::Register_Abs());
}
-} // namespace builtin
-} // namespace ops
-} // namespace tflite
+} // namespace tflite
+} // namespace nnfw
diff --git a/libs/tflite/src/ext/nnapi_delegate.cpp b/libs/tflite/src/ext/nnapi_delegate.cpp
new file mode 100644
index 000000000..25858a7b4
--- /dev/null
+++ b/libs/tflite/src/ext/nnapi_delegate.cpp
@@ -0,0 +1,1209 @@
+/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This code is derived from the following file (in TensorFlow v1.12)
+// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc'
+#include "tflite/ext/nnapi_delegate.h"
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef OBS_BUILD
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#else
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#endif
+#include "tensorflow/contrib/lite/model.h"
+#include "NeuralNetworksShim.h"
+#include "NeuralNetworksExShim.h"
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#include <sys/system_properties.h>
+#endif
+
+namespace nnfw {
+namespace tflite {
+
+void logError(const char* format, ...) {
+ // stderr is convenient for native tests, but is not captured for apps
+ va_list args_for_stderr;
+ va_start(args_for_stderr, format);
+ vfprintf(stderr, format, args_for_stderr);
+ va_end(args_for_stderr);
+ fprintf(stderr, "\n");
+ fflush(stderr);
+#ifdef __ANDROID__
+ // produce logcat output for general consumption
+ va_list args_for_log;
+ va_start(args_for_log, format);
+ __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
+ va_end(args_for_log);
+#endif
+}
+
+#define FATAL(...) \
+ logError(__VA_ARGS__); \
+ exit(1);
+
+// TODO(aselle): Change the error model to use status codes.
+#define CHECK_TFLITE_SUCCESS(x) \
+ if (x != kTfLiteOk) { \
+ FATAL("Aborting since tflite returned failure nnapi_delegate.cc:%d.", \
+ __LINE__); \
+ }
+
+#define CHECK_NN(x) \
+ if (x != ANEURALNETWORKS_NO_ERROR) { \
+ FATAL("Aborting since NNAPI returned failure nnapi_delegate.cc:%d", \
+ __LINE__); \
+ }
+
+#define RETURN_ERROR_IF_TFLITE_FAILED(x) \
+ if (x != kTfLiteOk) { \
+ logError( \
+ "Returning error since TFLite returned failure nnapi_delegate.cc:%d.", \
+ __LINE__); \
+ return kTfLiteError; \
+ }
+
+#define RETURN_ERROR_IF_NN_FAILED(x) \
+ if (x != ANEURALNETWORKS_NO_ERROR) { \
+ logError( \
+ "Returning error since NNAPI returned failure nnapi_delegate.cc:%d.", \
+ __LINE__); \
+ return kTfLiteError; \
+ }
+
+// Tracking of NNAPI operand ids
+static const int64_t kOperandIdNotSet = -1;
+static const int64_t kOperandNotNeeded = -2;
+
+namespace {
+
+int32_t GetAndroidSdkVersion() {
+#ifdef __ANDROID__
+ const char* sdkProp = "ro.build.version.sdk";
+ char sdkVersion[PROP_VALUE_MAX];
+ int length = __system_property_get(sdkProp, sdkVersion);
+ if (length != 0) {
+ for (int i = 0; i < length; ++i) {
+ int digit = sdkVersion[i] - '0';
+ if (digit < 0 || digit > 9) {
+        // Non-numeric SDK version, assume it's higher than expected;
+ return 0xFFFF;
+ }
+ }
+ return atoi(sdkVersion);
+ }
+ FATAL("No %s prop", sdkProp);
+#endif // __ANDROID__
+ return 0;
+}
+
+int32_t GetAndroidSdkVersionCached() {
+ static int32_t androidSdkVersion = GetAndroidSdkVersion();
+ return androidSdkVersion;
+}
+
+static const uint32_t dimension_for_scalar[1] = {1};
+
+} // namespace
+
+NNAPIAllocation::NNAPIAllocation(const char* filename,
+ ::tflite::ErrorReporter* error_reporter)
+ : MMAPAllocation(filename, error_reporter) {
+ if (mmapped_buffer_ != MAP_FAILED)
+ CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
+ mmap_fd_, 0, &handle_));
+}
+
+NNAPIAllocation::~NNAPIAllocation() {
+ if (handle_) {
+ ANeuralNetworksMemory_free(handle_);
+ }
+}
+
+NNAPIDelegate::~NNAPIDelegate() {
+ if (nn_compiled_model_) {
+ ANeuralNetworksCompilation_free(nn_compiled_model_);
+ nn_compiled_model_ = nullptr;
+ }
+ if (nn_model_) {
+ ANeuralNetworksModel_free(nn_model_);
+ nn_model_ = nullptr;
+ // TODO(aselle): Is this thread-safe and callable multiple times?
+ }
+ // ANeuralNetworksShutdown();
+}
+
+// Adds the tensors of the interpreter to the NN API model.
+TfLiteStatus addTensorOperands(::tflite::Interpreter* interpreter,
+ ANeuralNetworksModel* nn_model,
+ uint32_t* no_of_operands_added,
+ std::vector<int64_t>* nnapi_ids) {
+ uint32_t next_id = 0;
+ for (size_t i = 0; i < interpreter->tensors_size(); i++) {
+ // Skip temporaries and RNN back-edges.
+ if ((*nnapi_ids)[i] == kOperandNotNeeded) continue;
+
+ (*nnapi_ids)[i] = int64_t(next_id);
+
+ int32_t nn_type = 0;
+ // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
+ float scale = 0.0f;
+ int32_t zeroPoint = 0;
+ TfLiteTensor* tensor = interpreter->tensor(i);
+ switch (tensor->type) {
+ case kTfLiteNoType:
+ // Tensors added during initialization of Ops don't have a type yet and
+ // should not be registered with the NNAPI.
+ continue;
+ case kTfLiteFloat32:
+ nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+ break;
+ case kTfLiteUInt8:
+ nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+ scale = tensor->params.scale;
+ // FIXME The next line is a workaround because currently zero scale is
+ // passed down from TF
+ // Lite. Note that the latest NeuralNetworks.h (see
+ // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/runtime/include/NeuralNetworks.h)
+ // requires scale to be greater than zero. Remove this workaround
+ // when the scale
+ // value is correctly passed.
+ scale = (scale == 0.0f) ? 1.0f : scale;
+ zeroPoint = tensor->params.zero_point;
+ break;
+ case kTfLiteInt32:
+ nn_type = ANEURALNETWORKS_TENSOR_INT32;
+ scale = tensor->params.scale;
+ zeroPoint = tensor->params.zero_point;
+ break;
+ case kTfLiteBool:
+ // Workaround to pass bool type under NNAPI
+ // Use bool type using ANEURALNETWORKS_TENSOR_QUANT8_ASYMM with scale = 1.0f and zero_point = 0
+ nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+ scale = 1.0f;
+ zeroPoint = 0;
+ break;
+ default:
+ logError("Unsupported tensor type %d", tensor->type);
+ return kTfLiteError;
+ }
+ if (tensor->dims->size == 0) {
+ // WORKAROUND Some model have dimension zero
+ switch (tensor->type) {
+ case kTfLiteFloat32:
+ nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+ break;
+ case kTfLiteInt32:
+ nn_type = ANEURALNETWORKS_TENSOR_INT32;
+ break;
+ default:
+ logError("NNAPI doesn't support tensors with rank 0 (index %d name %s)",
+ i, tensor->name);
+ return kTfLiteError;
+ }
+ }
+ if (tensor->dims->size > 4) {
+ logError("NNAPI doesn't support tensors with rank > 4 (index %d name %s)",
+ i, tensor->name);
+ return kTfLiteError;
+ }
+ // TODO(aselle): Note, many of these are intermediate results. Do I need
+ // to ever specify these sizes. I am currently below doing setValue
+ // on all of them, but I shouldn't in the future.
+ // Answer(jeanluc): If all the operators can set the dimension correctly,
+ // you won't need to.
+ ANeuralNetworksOperandType operand_type{
+ nn_type, static_cast<uint32_t>(tensor->dims->size),
+ reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+ if (tensor->dims->size == 0) {
+ // WORKAROUND Some model have dimension zero
+ // Consider scalar as vector size 1
+ operand_type.dimensions = dimension_for_scalar;
+ operand_type.dimensionCount = 1;
+ }
+ RETURN_ERROR_IF_NN_FAILED(
+ ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+ // TODO(aselle): Based on Michael's suggestion, limiting this to read
+ // only memory
+ if (tensor->allocation_type == kTfLiteMmapRo) {
+ if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
+ static_cast<const ::tflite::Allocation*>(tensor->allocation))) {
+ RETURN_ERROR_IF_NN_FAILED(
+ ANeuralNetworksModel_setOperandValueFromMemory(
+ nn_model, next_id, alloc->memory(),
+ alloc->offset(tensor->data.raw), tensor->bytes));
+ } else {
+ RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue(
+ nn_model, next_id, tensor->data.raw, tensor->bytes));
+ }
+ } else if (tensor->bytes == 0) {
+ // These size 0 tensors are optional tensors reserved.
+ RETURN_ERROR_IF_NN_FAILED(
+ ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
+ }
+
+ ++next_id;
+ }
+ *no_of_operands_added = next_id;
+ return kTfLiteOk;
+}
+
+void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count,
+ std::vector<uint32_t>* into,
+ const std::vector<int64_t>& map) {
+ for (size_t i = 0; i < from_ids_count; i++) {
+ int from_id = from_ids_buf[i];
+ if (from_id == kOptionalTensor) {
+ into->push_back(from_id);
+ } else {
+ into->push_back(map[from_id]);
+ }
+ }
+}
+
+// Adds the operations and their parameters to the NN API model.
+// 'next-id' is the operand ID of the next operand of the model.
+TfLiteStatus AddOpsAndParams(
+ ::tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model,
+ uint32_t next_id, std::vector<int>* model_state_inputs,
+ std::vector<int>* model_state_outputs,
+ const std::vector<int64_t>& tensor_id_to_nnapi_id) {
+ for (size_t i = 0; i < interpreter->nodes_size(); i++) {
+ const auto* node_and_registration = interpreter->node_and_registration(i);
+ const TfLiteNode& node = node_and_registration->first;
+ const TfLiteRegistration& registration = node_and_registration->second;
+ ::tflite::BuiltinOperator builtin =
+ static_cast<::tflite::BuiltinOperator>(registration.builtin_code);
+
+ // Add the parameters.
+ std::vector<uint32_t> augmented_inputs, augmented_outputs;
+ MapAndAddTensorIds(node.inputs->data, node.inputs->size, &augmented_inputs,
+ tensor_id_to_nnapi_id);
+ MapAndAddTensorIds(node.outputs->data, node.outputs->size,
+ &augmented_outputs, tensor_id_to_nnapi_id);
+
+ auto add_scalar_int32 = [&nn_model, &augmented_inputs,
+ &next_id](int value) {
+ ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+ sizeof(int32_t)))
+ augmented_inputs.push_back(next_id++);
+ };
+
+ auto add_scalar_float32 = [&nn_model, &augmented_inputs,
+ &next_id](float value) {
+ ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+ sizeof(float)))
+ augmented_inputs.push_back(next_id++);
+ };
+
+ auto add_vector_int32 = [&](const int* values, uint32_t num_values) {
+ ANeuralNetworksOperandType operand_type{
+ .type = ANEURALNETWORKS_TENSOR_INT32,
+ .dimensionCount = 1,
+ .dimensions = &num_values};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(
+ nn_model, next_id, values, sizeof(int32_t) * num_values));
+ augmented_inputs.push_back(next_id++);
+ };
+
+ // Handle state tensors of RNN, LSTM, SVDF.
+ // For each state_out tensor, a corresponding state_in operand needs to be
+ // created for NNAPI.
+ auto duplicate_state_tensor_float32 =
+ [interpreter, &nn_model, &next_id, &augmented_inputs,
+ &model_state_inputs, &model_state_outputs](int tensor_id) {
+ const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
+ ANeuralNetworksOperandType operand_type{
+ ANEURALNETWORKS_TENSOR_FLOAT32,
+ static_cast<uint32_t>(tensor->dims->size),
+ reinterpret_cast<uint32_t*>(tensor->dims->data),
+ tensor->params.scale, tensor->params.zero_point};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+ augmented_inputs.push_back(next_id);
+ model_state_inputs->push_back(next_id);
+ model_state_outputs->push_back(tensor_id);
+ next_id++;
+ };
+ auto check_and_add_activation = [&add_scalar_int32](int activation) {
+ if (activation > kTfLiteActRelu6) {
+ logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
+ return kTfLiteError;
+ }
+ add_scalar_int32(activation);
+ return kTfLiteOk;
+ };
+
+ auto add_add_params = [&add_scalar_int32](void* data) {
+ auto* builtin = reinterpret_cast<TfLiteAddParams*>(data);
+ if (builtin->activation > kTfLiteActRelu6) {
+ logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
+ return kTfLiteError;
+ }
+ add_scalar_int32(builtin->activation);
+ return kTfLiteOk;
+ };
+
+ auto add_pooling_params = [&add_scalar_int32,
+ &check_and_add_activation](void* data) {
+ auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
+ add_scalar_int32(builtin->padding);
+ add_scalar_int32(builtin->stride_width);
+ add_scalar_int32(builtin->stride_height);
+ add_scalar_int32(builtin->filter_width);
+ add_scalar_int32(builtin->filter_height);
+ return check_and_add_activation(builtin->activation);
+ };
+
+ auto add_convolution_params = [&add_scalar_int32,
+ &check_and_add_activation](void* data) {
+ auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
+ add_scalar_int32(builtin->padding);
+ add_scalar_int32(builtin->stride_width);
+ add_scalar_int32(builtin->stride_height);
+ return check_and_add_activation(builtin->activation);
+ };
+
+ auto add_depthwise_conv_params = [&add_scalar_int32,
+ &check_and_add_activation](void* data) {
+ auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
+ add_scalar_int32(builtin->padding);
+ add_scalar_int32(builtin->stride_width);
+ add_scalar_int32(builtin->stride_height);
+ add_scalar_int32(builtin->depth_multiplier);
+ return check_and_add_activation(builtin->activation);
+ };
+
+ // FULLY_CONNECTED only carries a fused activation as a parameter.
+ auto add_fully_connected_params = [&check_and_add_activation](void* data) {
+ auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
+ return check_and_add_activation(builtin->activation);
+ };
+
+ // Appends the concatenation axis; NNAPI has no fused activation on
+ // CONCATENATION, so any requested activation is an error.
+ auto add_concatenation_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
+ add_scalar_int32(builtin->axis);
+ if (builtin->activation != kTfLiteActNone) {
+ logError("Concatenation does not support fused activation in NNAPI");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+ };
+
+ // SOFTMAX takes only the beta scaling factor.
+ auto add_softmax_params = [&add_scalar_float32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data);
+ add_scalar_float32(builtin->beta);
+ };
+
+ // SPACE_TO_DEPTH takes only the block size.
+ auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
+ add_scalar_int32(builtin->block_size);
+ };
+
+ // Appends the LSTM activation plus the cell and projection clip values.
+ auto add_lstm_params = [&add_scalar_int32,
+ &add_scalar_float32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data);
+ add_scalar_int32(builtin->activation);
+ add_scalar_float32(builtin->cell_clip);
+ add_scalar_float32(builtin->proj_clip);
+ };
+
+ // LSTM in NNAPI requires scratch tensor as an output operand.
+ // Creates a float32 operand shaped like the node's first temporary and
+ // inserts its id at the FRONT of augmented_outputs (NNAPI expects the
+ // scratch buffer as the first LSTM output). No-op when the node has no
+ // temporaries.
+ auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model,
+ &next_id, &augmented_outputs]() {
+ if (node.temporaries->size == 0) return;
+ int scratch_buffer_index = node.temporaries->data[0];
+ const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index);
+ ANeuralNetworksOperandType operand_type{
+ ANEURALNETWORKS_TENSOR_FLOAT32,
+ static_cast<uint32_t>(tensor->dims->size),
+ reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
+ tensor->params.zero_point};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+ augmented_outputs.insert(augmented_outputs.begin(), next_id++);
+ };
+
+ // MEAN takes keep_dims; the params struct was renamed upstream
+ // (TfLiteMeanParams -> TfLiteReducerParams), hence the OBS_BUILD switch.
+ auto add_mean_params = [&add_scalar_int32](void* data) {
+#ifdef OBS_BUILD
+ auto builtin = reinterpret_cast<TfLiteMeanParams*>(data);
+#else
+ auto builtin = reinterpret_cast<TfLiteReducerParams*>(data);
+#endif
+ add_scalar_int32(builtin->keep_dims);
+ };
+
+ // SVDF takes the rank and fused activation.
+ auto add_svdf_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data);
+ add_scalar_int32(builtin->rank);
+ add_scalar_int32(builtin->activation);
+ };
+
+ // RNN takes only the fused activation.
+ auto add_rnn_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteRNNParams*>(data);
+ add_scalar_int32(builtin->activation);
+ };
+
+ // SQUEEZE: appends the squeeze dimensions as an int32 vector operand.
+ auto add_squeeze_params = [&](void* data) {
+ const auto* builtin = reinterpret_cast<TfLiteSqueezeParams*>(data);
+ // Note that we add the squeeze dimensions even if the dimensions were
+ // unspecified (empty), as NNAPI requires the operand.
+ add_vector_int32(builtin->squeeze_dims,
+ static_cast<uint32_t>(builtin->num_squeeze_dims));
+ };
+
+ // Handle optional input tensors.
+ // Replaces every kOptionalTensor placeholder in augmented_inputs with a
+ // fresh operand of the given NN type whose value is set to null/size 0,
+ // which is how NNAPI represents an omitted optional input.
+ auto add_optional_tensors = [&nn_model, &augmented_inputs,
+ &next_id](int nn_type) {
+ for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
+ if (augmented_inputs[idx] == kOptionalTensor) {
+ // Dimensions are irrelevant for an absent operand; {0, 0} suffices.
+ const std::vector<uint32_t> dim = {0, 0};
+ ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
+ nullptr, 0))
+ augmented_inputs[idx] = next_id++;
+ }
+ }
+ };
+
+ int nnapi_version = 10;
+#include "nnapi_delegate_ex_AddOpsAndParams_lambda.inc"
+
+ ANeuralNetworksOperationType nn_op_type;
+
+ // Using namespace directive to minimize diff with upstream tensorflow
+ namespace tflite = ::tflite;
+
+ switch (builtin) {
+ case tflite::BuiltinOperator_ADD:
+ nn_op_type = ANEURALNETWORKS_ADD;
+ RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
+ break;
+ case tflite::BuiltinOperator_MUL:
+ nn_op_type = ANEURALNETWORKS_MUL;
+ RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
+ break;
+ case tflite::BuiltinOperator_AVERAGE_POOL_2D:
+ RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
+ nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
+ break;
+ case tflite::BuiltinOperator_MAX_POOL_2D:
+ RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
+ nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
+ break;
+ case tflite::BuiltinOperator_L2_POOL_2D:
+ RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
+ nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
+ break;
+ case tflite::BuiltinOperator_CONV_2D: {
+ auto builtin = reinterpret_cast<TfLiteConvParams*>(node.builtin_data);
+ if (builtin->dilation_width_factor != 1 ||
+ builtin->dilation_height_factor != 1 || node.inputs->size != 3) {
+ logError("NNAPI does not support dilated Conv2D.");
+ return kTfLiteError;
+ }
+ }
+ RETURN_ERROR_IF_TFLITE_FAILED(
+ add_convolution_params(node.builtin_data));
+ nn_op_type = ANEURALNETWORKS_CONV_2D;
+ break;
+ case tflite::BuiltinOperator_RELU:
+ nn_op_type = ANEURALNETWORKS_RELU;
+ break;
+ case tflite::BuiltinOperator_RELU_N1_TO_1:
+ nn_op_type = ANEURALNETWORKS_RELU1;
+ break;
+ case tflite::BuiltinOperator_RELU6:
+ nn_op_type = ANEURALNETWORKS_RELU6;
+ break;
+ case tflite::BuiltinOperator_TANH:
+ nn_op_type = ANEURALNETWORKS_TANH;
+ break;
+ case tflite::BuiltinOperator_FLOOR:
+ nn_op_type = ANEURALNETWORKS_FLOOR;
+ break;
+ case tflite::BuiltinOperator_LOGISTIC:
+ nn_op_type = ANEURALNETWORKS_LOGISTIC;
+ break;
+ case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
+ RETURN_ERROR_IF_TFLITE_FAILED(
+ add_depthwise_conv_params(node.builtin_data));
+ nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
+ break;
+ case tflite::BuiltinOperator_CONCATENATION:
+ RETURN_ERROR_IF_TFLITE_FAILED(
+ add_concatenation_params(node.builtin_data));
+ nn_op_type = ANEURALNETWORKS_CONCATENATION;
+ break;
+ case tflite::BuiltinOperator_SOFTMAX:
+ add_softmax_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_SOFTMAX;
+ break;
+ case tflite::BuiltinOperator_FULLY_CONNECTED:
+ RETURN_ERROR_IF_TFLITE_FAILED(
+ add_fully_connected_params(node.builtin_data));
+ nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
+ break;
+ case tflite::BuiltinOperator_RESHAPE:
+ if (node.inputs->size != 2) {
+ logError("NNAPI only supports 2-input RESHAPE");
+ return kTfLiteError;
+ }
+ nn_op_type = ANEURALNETWORKS_RESHAPE;
+ // add_reshape_params(node.builtin_data);
+ break;
+ case tflite::BuiltinOperator_RESIZE_BILINEAR:
+ add_resize_bilinear_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_RESIZE_BILINEAR;
+ break;
+ case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+ add_space_to_depth_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
+ break;
+ case tflite::BuiltinOperator_LSTM: {
+ if (node.inputs->size + /* no of params */ 3 != 21) {
+ logError("NNAPI only supports 21-input LSTMs");
+ return kTfLiteError;
+ }
+ duplicate_state_tensor_float32(
+ node.outputs->data[/*kOutputStateTensor*/ 0]);
+ duplicate_state_tensor_float32(
+ node.outputs->data[/*kCellStateTensor*/ 1]);
+ add_lstm_params(node.builtin_data);
+ add_lstm_scratch_tensor_float32();
+ add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32);
+ nn_op_type = ANEURALNETWORKS_LSTM;
+ break;
+ }
+ case tflite::BuiltinOperator_DEQUANTIZE:
+ nn_op_type = ANEURALNETWORKS_DEQUANTIZE;
+ break;
+ case tflite::BuiltinOperator_SVDF: {
+ duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]);
+ add_svdf_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_SVDF;
+ break;
+ }
+ case tflite::BuiltinOperator_RNN: {
+ duplicate_state_tensor_float32(
+ node.outputs->data[/*kHiddenStateTensor*/ 0]);
+ add_rnn_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_RNN;
+ break;
+ }
+ case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
+ nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP;
+ break;
+ case tflite::BuiltinOperator_PAD:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_PAD;
+ break;
+ case tflite::BuiltinOperator_MEAN:
+ nnapi_version = 11; // require NNAPI 1.1
+ add_mean_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_MEAN;
+ break;
+ case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
+ nn_op_type = ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
+ add_lrn_params(node.builtin_data);
+ break;
+ case tflite::BuiltinOperator_DIV:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_DIV;
+ RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
+ reinterpret_cast<TfLiteDivParams*>(node.builtin_data)->activation));
+ break;
+ case tflite::BuiltinOperator_SUB:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_SUB;
+ RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
+ reinterpret_cast<TfLiteSubParams*>(node.builtin_data)->activation));
+ break;
+ case tflite::BuiltinOperator_SQUEEZE:
+ nnapi_version = 11; // requires NNAPI 1.1
+ add_squeeze_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_SQUEEZE;
+ break;
+ case tflite::BuiltinOperator_TRANSPOSE:
+ // The permutation input tensor value dictates the output dimensions.
+ // TODO(b/110888333): Support dynamically-sized tensors in delegates.
+ if ((node.inputs->size > 1) &&
+ (interpreter->tensor(node.inputs->data[1])->allocation_type !=
+ kTfLiteMmapRo)) {
+ logError("NNAPI does not yet support dynamic tensors.");
+ return kTfLiteError;
+ }
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_TRANSPOSE;
+ break;
+ case tflite::BuiltinOperator_L2_NORMALIZATION:
+ nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
+ if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
+ ->activation != kTfLiteActNone) {
+ logError(
+ "NNAPI does not support L2Normalization with fused activations");
+ return kTfLiteError;
+ }
+ if ((node.inputs->size > 0) &&
+ (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
+ logError("NNAPI only supports input rank 4 for L2Normalization");
+ return kTfLiteError;
+ }
+ break;
+ case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
+ if (interpreter->tensor(node.outputs->data[0])->type !=
+ kTfLiteFloat32) {
+ logError("NNAPI only support HASHTABLE_LOOKUP with float32 output",
+ builtin);
+ return kTfLiteError;
+ }
+ nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP;
+ break;
+ case tflite::BuiltinOperator_STRIDED_SLICE:
+ add_strided_slice_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_STRIDED_SLICE;
+ break;
+ case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_SPACE_TO_BATCH_ND;
+ break;
+ case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_BATCH_TO_SPACE_ND;
+ check_batch_to_space_params();
+ break;
+ case tflite::BuiltinOperator_CAST:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_CAST_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_TOPK_V2:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_TOPK_V2_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_GATHER:
+ add_gather_ex_params(node.builtin_data);
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_GATHER_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_SPLIT:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_SPLIT_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_NEG:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_NEG_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_EXP:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_EXP_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_TRANSPOSE_CONV:
+ add_transpose_conv_params(node.builtin_data);
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_TRANSPOSE_CONV_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_PRELU:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_PRELU_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_ARG_MAX:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_ARGMAX_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+#ifndef OBS_BUILD
+ case tflite::BuiltinOperator_PACK:
+ add_pack_ex_params(node.builtin_data);
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_PACK_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_UNPACK:
+ add_unpack_ex_params(node.builtin_data);
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_UNPACK_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_SQRT:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_SQRT_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_RSQRT:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_RSQRT_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_EQUAL:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_EQUAL_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_NOT_EQUAL:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_NOT_EQUAL_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_SUM:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_REDUCE_SUM_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_REDUCE_MAX:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_REDUCE_MIN:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_REDUCE_MIN_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_LOGICAL_AND:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_LOGICAL_AND_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_LOGICAL_OR:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_LOGICAL_OR_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+#endif
+ case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
+ case tflite::BuiltinOperator_LSH_PROJECTION:
+ case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
+ case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
+ case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
+ case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
+ case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
+ //case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
+ case tflite::BuiltinOperator_PADV2:
+ //case tflite::BuiltinOperator_RESIZE_BILINEAR:
+ case tflite::BuiltinOperator_CALL:
+ case tflite::BuiltinOperator_SKIP_GRAM:
+ //case tflite::BuiltinOperator_RELU_N1_TO_1:
+ //case tflite::BuiltinOperator_GATHER:
+ //case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
+ //case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
+ //case tflite::BuiltinOperator_TOPK_V2:
+ //case tflite::BuiltinOperator_SPLIT:
+ //case tflite::BuiltinOperator_STRIDED_SLICE:
+ //case tflite::BuiltinOperator_EXP:
+ case tflite::BuiltinOperator_LOG_SOFTMAX:
+ //case tflite::BuiltinOperator_DEQUANTIZE:
+ case tflite::BuiltinOperator_DELEGATE:
+ //case tflite::BuiltinOperator_CAST:
+ //case tflite::BuiltinOperator_PRELU:
+ case tflite::BuiltinOperator_MAXIMUM:
+ case tflite::BuiltinOperator_MINIMUM:
+#ifndef OBS_BUILD
+ case tflite::BuiltinOperator_ARG_MIN:
+#endif
+ case tflite::BuiltinOperator_GREATER:
+ case tflite::BuiltinOperator_GREATER_EQUAL:
+ case tflite::BuiltinOperator_LESS:
+ case tflite::BuiltinOperator_LESS_EQUAL:
+ //case tflite::BuiltinOperator_NEG:
+ case tflite::BuiltinOperator_SELECT:
+ case tflite::BuiltinOperator_SLICE:
+ case tflite::BuiltinOperator_SIN:
+ //case tflite::BuiltinOperator_LOG:
+ //case tflite::BuiltinOperator_TRANSPOSE_CONV:
+#ifndef OBS_BUILD
+ case tflite::BuiltinOperator_TILE:
+ case tflite::BuiltinOperator_EXPAND_DIMS:
+ case tflite::BuiltinOperator_SPARSE_TO_DENSE:
+ //case tflite::BuiltinOperator_EQUAL:
+ //case tflite::BuiltinOperator_NOT_EQUAL:
+ //case tflite::BuiltinOperator_SUM:
+ //case tflite::BuiltinOperator_REDUCE_MAX:
+ //case tflite::BuiltinOperator_REDUCE_MIN:
+ case tflite::BuiltinOperator_REDUCE_PROD:
+ //case tflite::BuiltinOperator_SQRT:
+ //case tflite::BuiltinOperator_RSQRT:
+ case tflite::BuiltinOperator_SHAPE:
+ case tflite::BuiltinOperator_POW:
+ case tflite::BuiltinOperator_FAKE_QUANT:
+ //case tflite::BuiltinOperator_PACK:
+ //case tflite::BuiltinOperator_LOGICAL_OR:
+ case tflite::BuiltinOperator_ONE_HOT:
+ //case tflite::BuiltinOperator_LOGICAL_AND:
+ case tflite::BuiltinOperator_LOGICAL_NOT:
+ //case tflite::BuiltinOperator_UNPACK:
+ case tflite::BuiltinOperator_FLOOR_DIV:
+ case tflite::BuiltinOperator_REDUCE_ANY:
+ case tflite::BuiltinOperator_SQUARE:
+ case tflite::BuiltinOperator_ZEROS_LIKE:
+ case tflite::BuiltinOperator_FILL:
+#endif
+ logError("Op code %d is currently not delegated to NNAPI", builtin);
+ return kTfLiteError;
+ break;
+ case tflite::BuiltinOperator_CUSTOM: {
+ std::string custom_name(registration.custom_name);
+ if (custom_name.compare("TensorFlowMax") == 0) {
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ }
+ else if (custom_name.compare("SquaredDifference") == 0) {
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_SQUARED_DIFFERENCE_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ }
+ else if (custom_name.compare("TensorFlowSum") == 0) {
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_REDUCE_SUM_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ }
+ logError("Custom operations are not supported when using NNAPI.");
+ return kTfLiteError;
+ break;
+ }
+#ifdef OBS_BUILD
+ default:
+ logError("Op code %d is currently not delegated to NNAPI", builtin);
+ return kTfLiteError;
+ break;
+#endif
+ }
+
+ //if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) {
+ // FATAL("Op %d needs NNAPI1.1", builtin);
+ //}
+
+ // Add the operation.
+ RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation(
+ nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(augmented_outputs.size()),
+ reinterpret_cast<uint32_t*>(augmented_outputs.data())));
+ }
+ return kTfLiteOk;
+}
+
+// Builds and finalizes the cached NNAPI model and compilation for the given
+// interpreter. Idempotent: returns the cached status when both the model and
+// the compiled model already exist. Returns kTfLiteOk on success; CHECK_NN /
+// RETURN_ERROR_IF_TFLITE_FAILED propagate failures.
+TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) {
+ if (nn_model_ && nn_compiled_model_) return model_status_;
+
+ // TODO(aselle): This is not correct. need to handle resize invalidation.
+ if (!nn_model_) {
+ CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
+
+ // Find which tensors should be added to NNAPI. TFLite has temporaries
+ // and RNN back-edges which are not valid for NNAPI. We look through all
+ // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with
+ // kOperandIdNotSet. addTensorOperands will replace those with the
+ // corresponding NNAPI operand ids and skip kOperandNotNeeded entries.
+ std::vector<int64_t> tensor_id_to_nnapi_id(interpreter->tensors_size(),
+ kOperandNotNeeded);
+ // Marks each referenced (non-optional) tensor id as needing an operand.
+ // NOTE(review): 'j' is int while 'count' is size_t; fine for realistic
+ // tensor counts but triggers a signed/unsigned comparison warning.
+ auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf,
+ size_t count) {
+ for (int j = 0; j < count; j++) {
+ auto tensor_id = buf[j];
+ if (tensor_id != kOptionalTensor) {
+ tensor_id_to_nnapi_id[tensor_id] = kOperandIdNotSet;
+ }
+ }
+ };
+ for (size_t i = 0; i < interpreter->nodes_size(); i++) {
+ const auto* node_and_registration = interpreter->node_and_registration(i);
+ const TfLiteNode& node = node_and_registration->first;
+ set_ids_to_not_set(node.inputs->data, node.inputs->size);
+ set_ids_to_not_set(node.outputs->data, node.outputs->size);
+ }
+ set_ids_to_not_set(interpreter->inputs().data(),
+ interpreter->inputs().size());
+ set_ids_to_not_set(interpreter->outputs().data(),
+ interpreter->outputs().size());
+
+ uint32_t next_id = 0;
+ RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands(
+ interpreter, nn_model_, &next_id, &tensor_id_to_nnapi_id));
+ RETURN_ERROR_IF_TFLITE_FAILED(
+ AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
+ &model_states_outputs_, tensor_id_to_nnapi_id));
+
+ // Model inputs are the interpreter's inputs followed by the RNN/LSTM
+ // state-in operands; outputs are the interpreter's outputs followed by
+ // the corresponding state-out operands.
+ std::vector<uint32_t> augmented_inputs;
+ MapAndAddTensorIds(interpreter->inputs().data(),
+ interpreter->inputs().size(), &augmented_inputs,
+ tensor_id_to_nnapi_id);
+ augmented_inputs.insert(augmented_inputs.end(),
+ model_states_inputs_.begin(),
+ model_states_inputs_.end());
+ std::vector<uint32_t> augmented_outputs;
+ MapAndAddTensorIds(interpreter->outputs().data(),
+ interpreter->outputs().size(), &augmented_outputs,
+ tensor_id_to_nnapi_id);
+ MapAndAddTensorIds(model_states_outputs_.data(),
+ model_states_outputs_.size(), &augmented_outputs,
+ tensor_id_to_nnapi_id);
+
+ CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
+ nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
+ reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
+ static_cast<uint32_t>(augmented_outputs.size()),
+ reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
+
+ // TODO Support ANeuralNetworksModel_relaxComputationFloat32toFloat16
+ //if (GetAndroidSdkVersionCached() >= 28) {
+ // CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+ // nn_model_, interpreter->GetAllowFp16PrecisionForFp32()));
+ //}
+ CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
+ }
+ if (!nn_compiled_model_) {
+ CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
+ CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
+ }
+ return kTfLiteOk;
+}
+
+#include <unordered_map>
+
+// Runs one inference through NNAPI: lazily builds the model, binds input and
+// output buffers (converting bool tensors to byte buffers), maps state-out to
+// state-in for recurrent ops, and blocks until the computation completes.
+// Returns kTfLiteOk on success; CHECK_NN propagates NNAPI failures.
+TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) {
+ // Lazily build (and cache) the NNAPI model/compilation on first use.
+ if (!nn_model_) {
+ model_status_ = BuildGraph(interpreter);
+ if (model_status_ != kTfLiteOk) {
+ logError("Failed to build graph for NNAPI");
+ }
+ }
+ if (model_status_ != kTfLiteOk) {
+ return model_status_;
+ }
+
+ ANeuralNetworksExecution* execution = nullptr;
+ CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
+
+ // Allocate temporary buffers holding boolean tensors re-encoded as bytes
+ // for the NN backend.
+ // NOTE(review): the input conversion maps true -> 0x00 / false -> 0xff,
+ // while the output conversion below maps 0x00 -> false; NNAPI's
+ // TENSOR_BOOL8 convention is 0 = false / non-zero = true, so the input
+ // mapping looks inverted -- confirm against the backend's encoding.
+ // NOTE(review): these buffers leak if a CHECK_NN below returns early.
+ std::unordered_map<size_t, uint8_t*> input_boolean_tensors;
+ std::unordered_map<size_t, uint8_t*> output_boolean_tensors;
+ for (size_t i = 0; i < interpreter->inputs().size(); i++)
+ {
+ int input = interpreter->inputs()[i];
+ TfLiteTensor* tensor = interpreter->tensor(input);
+ if (tensor->type == kTfLiteBool)
+ {
+ size_t elements = tensor->bytes / sizeof(bool);
+ uint8_t* temp_tensor = new uint8_t[tensor->bytes / sizeof(bool)];
+ input_boolean_tensors[i] = temp_tensor;
+ for (size_t idx = 0; idx < elements; idx++)
+ {
+ temp_tensor[idx] = (tensor->data.b[idx] ? 0x00 : 0xff);
+ }
+ }
+ }
+ for (size_t i = 0; i < interpreter->outputs().size(); i++)
+ {
+ int output = interpreter->outputs()[i];
+ TfLiteTensor* tensor = interpreter->tensor(output);
+ if (tensor->type == kTfLiteBool)
+ {
+ uint8_t* temp_tensor = new uint8_t[tensor->bytes / sizeof(bool)];
+ output_boolean_tensors[i] = temp_tensor;
+ }
+ }
+
+ // Currently perform deep copy of input buffer
+ for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+ int input = interpreter->inputs()[i];
+ // TODO(aselle): Is this what we want or do we want input instead?
+ // TODO(aselle): This should be called setInputValue maybe to be cons.
+ TfLiteTensor* tensor = interpreter->tensor(input);
+ if (tensor->type == kTfLiteBool)
+ {
+ CHECK_NN(ANeuralNetworksExecution_setInput(
+ execution, i, nullptr, input_boolean_tensors[i], tensor->bytes * sizeof(uint8_t) / sizeof(bool)));
+ }
+ else
+ {
+ CHECK_NN(ANeuralNetworksExecution_setInput(
+ execution, i, nullptr, tensor->data.raw, tensor->bytes));
+ }
+ }
+
+ // Tell nn api where to place final data.
+ for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+ int output = interpreter->outputs()[i];
+ TfLiteTensor* tensor = interpreter->tensor(output);
+
+ if (tensor->type == kTfLiteBool)
+ {
+ CHECK_NN(ANeuralNetworksExecution_setOutput(
+ execution, i, nullptr, output_boolean_tensors[i], tensor->bytes * sizeof(uint8_t) / sizeof(bool)));
+ }
+ else
+ {
+ CHECK_NN(ANeuralNetworksExecution_setOutput(
+ execution, i, nullptr, tensor->data.raw, tensor->bytes));
+ }
+ }
+
+ // The state_out of previous invocation need to be mapped to state_in of
+ // current invocation.
+ for (size_t i = 0; i < model_states_outputs_.size(); i++) {
+ int state_tensor_idx = model_states_outputs_[i];
+ TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx);
+ // Here we are using a deep copy for state_in tensors so that we are not
+ // reading and writing into the same buffer during a invocation.
+ // TODO(miaowang): using double shared buffer to minimize the copies.
+ CHECK_NN(ANeuralNetworksExecution_setInput(
+ execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw,
+ tensor->bytes));
+ // Tell NNAPI where to output the state_out.
+ CHECK_NN(ANeuralNetworksExecution_setOutput(
+ execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw,
+ tensor->bytes));
+ }
+
+ // Currently use blocking compute.
+ ANeuralNetworksEvent* event = nullptr;
+ CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
+ CHECK_NN(ANeuralNetworksEvent_wait(event));
+ ANeuralNetworksEvent_free(event);
+ ANeuralNetworksExecution_free(execution);
+
+ // Release the temporary input buffers now that the computation is done.
+ for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+ int input = interpreter->inputs()[i];
+ TfLiteTensor* tensor = interpreter->tensor(input);
+
+ if (tensor->type == kTfLiteBool)
+ {
+ uint8_t* temp_tensor = input_boolean_tensors[i];
+ input_boolean_tensors[i] = nullptr;
+ // Fixed: buffer was allocated with new[]; scalar delete was UB.
+ delete[] temp_tensor;
+ }
+ }
+ // Copy the byte-encoded boolean outputs back into the TFLite tensors and
+ // release the temporary output buffers.
+ for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+ int output = interpreter->outputs()[i];
+ TfLiteTensor* tensor = interpreter->tensor(output);
+
+ if (tensor->type == kTfLiteBool)
+ {
+ uint8_t* temp_tensor = output_boolean_tensors[i];
+ size_t elements = tensor->bytes / sizeof(bool);
+ for (size_t idx = 0; idx < elements; idx++)
+ {
+ tensor->data.b[idx] = ((temp_tensor[idx] == 0x00) ? false : true);
+ }
+ output_boolean_tensors[i] = nullptr;
+ // Fixed: buffer was allocated with new[]; scalar delete was UB.
+ delete[] temp_tensor;
+ }
+ }
+
+#if 0
+ printf("From the NN API:\n");
+ TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+ if (float* data =
+ interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+ size_t num = tensor->bytes / sizeof(float);
+ for (float* p = data; p < data + num; p++) {
+ printf(" %f", *p);
+ }
+ printf("\n");
+ }
+#endif
+
+ return kTfLiteOk;
+}
+
+// Returns true when the NNAPI runtime library can be resolved on this device.
+bool NNAPIDelegate::IsSupported() { return nnfw::NNAPIExists(); }
+
+} // namespace tflite
+} // namespace nnfw
+
+// clang-format on
diff --git a/libs/tflite/src/ext/nnapi_delegate_ex_AddOpsAndParams_lambda.inc b/libs/tflite/src/ext/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
new file mode 100644
index 000000000..a91e4de60
--- /dev/null
+++ b/libs/tflite/src/ext/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
@@ -0,0 +1,106 @@
+// This file is included from AddOpsAndParams defined in nnapi_delegate.cc
+// and contains lambdas for the extended implementation relative to the original Tensorflow Lite.
+ auto add_resize_bilinear_params = [&add_scalar_int32, &interpreter, &augmented_inputs](void* data) {
+ auto builtin = reinterpret_cast<TfLiteResizeBilinearParams*>(data);
+ if (builtin->align_corners) {
+ FATAL("Resize bilinear does not support align corners in NNAPI");
+ }
+
+ TfLiteTensor* tensor = interpreter->tensor(augmented_inputs.back());
+ assert(tensor->type == kTfLiteInt32);
+ assert(tensor->bytes == sizeof(int)*2);
+ augmented_inputs.pop_back();
+
+ int height = ((int*)(tensor->data.raw))[1];
+ int width = ((int*)(tensor->data.raw))[0];
+ add_scalar_int32(height);
+ add_scalar_int32(width);
+ };
+
+ auto check_l2normalization_params = [interpreter, &node](void* data) {
+ auto builtin = reinterpret_cast<TfLiteL2NormParams*>(data);
+ if (builtin->activation != kTfLiteActNone) {
+ FATAL("NNAPI does not support L2Normalization with fused activations");
+ }
+ if ((node.inputs->size > 0) &&
+ (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
+ FATAL("NNAPI only supports input rank 4 for L2Normalization");
+ }
+ };
+
+ auto add_transpose_conv_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteTransposeConvParams*>(data);
+ add_scalar_int32(builtin->padding);
+ add_scalar_int32(builtin->stride_width);
+ add_scalar_int32(builtin->stride_height);
+ };
+
+ auto add_lrn_params = [&add_scalar_int32,
+ &add_scalar_float32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteLocalResponseNormParams*>(data);
+ add_scalar_int32(builtin->radius);
+ add_scalar_float32(builtin->bias);
+ add_scalar_float32(builtin->alpha);
+ add_scalar_float32(builtin->beta);
+ };
+
+ auto add_strided_slice_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(data);
+ add_scalar_int32(builtin->begin_mask);
+ add_scalar_int32(builtin->end_mask);
+ // ellipsis_mask and new_axis_mask are not supported on nn runtime
+ // cf) tflite interpreter supports both operations
+ if (builtin->ellipsis_mask) {
+ FATAL("STRIDE_SLICE does not support ellipsis_mask in NNAPI");
+ }
+ if (builtin->new_axis_mask) {
+ FATAL("STRIDE_SLICE does not support new_axis_mask in NNAPI");
+ }
+ add_scalar_int32(builtin->shrink_axis_mask);
+ };
+
+ auto add_gather_ex_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteGatherParams*>(data);
+ add_scalar_int32(builtin->axis);
+ if (builtin->axis != 0) {
+ FATAL("GATHER does not support axis>0 in NNAPI");
+ }
+ };
+
+#ifndef OBS_BUILD
+ auto add_pack_ex_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLitePackParams*>(data);
+ add_scalar_int32(builtin->values_count);
+ add_scalar_int32(builtin->axis);
+ };
+
+ auto add_unpack_ex_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteUnpackParams*>(data);
+ add_scalar_int32(builtin->num);
+ add_scalar_int32(builtin->axis);
+ };
+#endif
+
+ auto check_batch_to_space_params = [interpreter, &node, &augmented_inputs]() {
+
+ //If there are 3 inputs, check if crops is having default values {0, 0, 0, 0}
+ //Else unsupported by NNAPI
+
+ if(augmented_inputs.size() == 3)
+ {
+ const uint32_t crops_buffer_index = node.inputs->data[2];
+ const TfLiteTensor* crops = interpreter->tensor(crops_buffer_index);
+ const int *crops_value = crops->data.i32;
+
+ //Check if crops is having default values {0, 0, 0, 0}
+ if(crops_value[0] != 0 || crops_value[1] != 0 || crops_value[2] != 0 || crops_value[3] != 0)
+ {
+ FATAL("BATCH_TO_SPACE_ND does not support Explicit crops in NNAPI");
+ }
+ else
+ {
+ //Restrict crops input and pass only other two inputs
+ augmented_inputs.pop_back();
+ }
+ }
+ };
diff --git a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp b/libs/tflite/src/interp/FlatBufferBuilder.cpp
index 67df13f34..4b9cde719 100644
--- a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp
+++ b/libs/tflite/src/interp/FlatBufferBuilder.cpp
@@ -14,24 +14,20 @@
* limitations under the License.
*/
-#include "support/tflite/interp/FlatBufferBuilder.h"
+#include "tflite/interp/FlatBufferBuilder.h"
-#include "support/tflite/kernels/register.h"
+#include "tflite/ext/kernels/register.h"
namespace nnfw
{
-namespace support
-{
namespace tflite
{
-namespace interp
-{
std::unique_ptr<::tflite::Interpreter> FlatBufferBuilder::build(void) const
{
std::unique_ptr<::tflite::Interpreter> interpreter;
- ::tflite::ops::builtin::BuiltinOpResolver resolver;
+ nnfw::tflite::BuiltinOpResolver resolver;
::tflite::InterpreterBuilder builder(_model, resolver);
@@ -40,7 +36,5 @@ std::unique_ptr<::tflite::Interpreter> FlatBufferBuilder::build(void) const
return std::move(interpreter);
}
-} // namespace interp
} // namespace tflite
-} // namespace support
} // namespace nnfw
diff --git a/libs/support/tflite/src/interp/FunctionBuilder.cpp b/libs/tflite/src/interp/FunctionBuilder.cpp
index 65783bd37..eab940c18 100644
--- a/libs/support/tflite/src/interp/FunctionBuilder.cpp
+++ b/libs/tflite/src/interp/FunctionBuilder.cpp
@@ -14,16 +14,12 @@
* limitations under the License.
*/
-#include "support/tflite/interp/FunctionBuilder.h"
+#include "tflite/interp/FunctionBuilder.h"
namespace nnfw
{
-namespace support
-{
namespace tflite
{
-namespace interp
-{
std::unique_ptr<::tflite::Interpreter> FunctionBuilder::build(void) const
{
@@ -34,7 +30,5 @@ std::unique_ptr<::tflite::Interpreter> FunctionBuilder::build(void) const
return std::move(res);
}
-} // namespace interp
} // namespace tflite
-} // namespace support
} // namespace nnfw
diff --git a/libs/util/CMakeLists.txt b/libs/util/CMakeLists.txt
deleted file mode 100644
index eaa7ae8cf..000000000
--- a/libs/util/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# Library `nnfw_util`
-set(NNFW_UTILITY_SRCS src/environment.cpp)
-list(APPEND NNFW_UTILITY_SRCS src/tensor/Shape.cpp)
-list(APPEND NNFW_UTILITY_SRCS src/tensor/NonIncreasingStride.cpp)
-list(APPEND NNFW_UTILITY_SRCS src/tensor/IndexFormatter.cpp)
-list(APPEND NNFW_UTILITY_SRCS src/tensor/Comparator.cpp)
-if(BUILD_TFLITE_BENCHMARK_MODEL)
- list(APPEND NNFW_UTILITY_SRCS src/profiling/time.cc)
-endif()
-
-add_library(nnfw_util SHARED ${NNFW_UTILITY_SRCS})
-target_include_directories(nnfw_util PUBLIC ${NNFW_INCLUDE_DIR})
-
-add_library(static_nnfw_util STATIC ${NNFW_UTILITY_SRCS})
-target_include_directories(static_nnfw_util PUBLIC ${NNFW_INCLUDE_DIR})
-set_target_properties(static_nnfw_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-install(TARGETS nnfw_util
- RUNTIME DESTINATION bin COMPONENT libraries
- LIBRARY DESTINATION lib COMPONENT libraries)
-
-add_executable(nnfw_util_tensor_index_iterator "examples/tensor_index_iterator.cpp")
-target_link_libraries(nnfw_util_tensor_index_iterator nnfw_util)
diff --git a/libs/util/src/profiling/time.cc b/libs/util/src/profiling/time.cc
deleted file mode 100644
index 6fe1b54dc..000000000
--- a/libs/util/src/profiling/time.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "util/profiling/time.h"
-
-#include <sys/time.h>
-
-namespace tflite
-{
-namespace profiling
-{
-namespace time
-{
-uint64_t NowMicros()
-{
- struct timeval tv;
- gettimeofday(&tv, nullptr);
- return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-}
-} // namespace time
-} // namespace profiling
-} // namespace tflite