Diffstat (limited to 'compute')
-rw-r--r--  compute/ARMComputeEx/CMakeLists.txt  32
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h  245
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h  101
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h  62
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h  98
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h  58
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h  113
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h  109
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h  129
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h  100
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h  55
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h  59
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h  104
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h  69
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h  58
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h  657
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h  85
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h  72
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h  69
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h  70
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h  80
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h  80
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h  102
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h  79
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h  118
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h  96
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h  115
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h  83
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h  84
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h  82
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h  92
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h  81
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/TypesEx.h  64
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/UtilsEx.h  47
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h  222
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h  42
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h  106
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h  45
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h  41
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h  52
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h  44
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h  54
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h  89
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h  62
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h  59
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h  80
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h  38
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h  39
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h  40
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h  80
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h  104
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h  96
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h  56
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h  44
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h  58
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h  141
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h  157
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h  79
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h  49
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h  38
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h  81
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h  98
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h  63
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h  62
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h  54
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h  65
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h  164
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h  148
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h  98
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h  155
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h  69
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h  77
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h  100
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h  47
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h  114
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h  83
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h  85
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h  82
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h  83
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h  120
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h  63
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h  162
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h  84
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h  87
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h  112
-rw-r--r--  compute/ARMComputeEx/resolve_includes.py  102
-rw-r--r--  compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp  359
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl  113
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl  167
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl  106
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl  209
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl  161
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl  113
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl  139
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl  117
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h  352
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h  406
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl  251
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl  55
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl  135
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl  96
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl  114
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl  188
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl  250
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl  161
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl  98
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl  129
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl  269
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp  157
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp  172
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp  105
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp  116
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp  114
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp  137
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp  178
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp  177
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp  88
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp  186
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp  179
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp  241
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp  124
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp  473
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp  164
-rw-r--r--  compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp  102
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp  346
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp  237
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp  653
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp  165
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp  205
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp  118
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp  252
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp  181
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp  280
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp  213
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp  274
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp  224
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp  677
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp  165
-rw-r--r--  compute/ARMComputeEx/src/core/UtilsEx.cpp  45
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp  20
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp  120
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp  39
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp  28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp  28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp  29
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp  60
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp  36
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp  29
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp  48
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp  28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp  39
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp  147
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp  151
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp  29
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp  28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp  311
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp  238
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp  67
-rw-r--r--  compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp  37
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp  20
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp  109
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp  70
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp  45
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp  48
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp  44
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp  29
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp  282
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp  477
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp  91
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp  503
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp  47
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp  37
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp  97
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp  39
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp  146
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp  164
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp  164
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp  165
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp  157
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp  99
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp  49
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp  307
-rw-r--r--  compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp  92
-rw-r--r--  compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp  128
-rw-r--r--  compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp  38
-rw-r--r--  compute/ARMComputeEx/src/runtime/topk_v2.h  191
-rw-r--r--  compute/CMakeLists.txt  1
-rw-r--r--  compute/cker/CMakeLists.txt  11
-rw-r--r--  compute/cker/README.md  7
-rw-r--r--  compute/cker/include/cker/Shape.h  286
-rw-r--r--  compute/cker/include/cker/Types.h  82
-rw-r--r--  compute/cker/include/cker/Utils.h  159
-rw-r--r--  compute/cker/include/cker/eigen/Utils.h  56
-rw-r--r--  compute/cker/include/cker/gemmlowp/FixedPoint.h  289
-rw-r--r--  compute/cker/include/cker/operation/AveragePool.h  101
-rw-r--r--  compute/cker/include/cker/operation/BinaryArithmeticOps.h  172
-rw-r--r--  compute/cker/include/cker/operation/Concatenation.h  93
-rw-r--r--  compute/cker/include/cker/operation/Conv.h  217
-rw-r--r--  compute/cker/include/cker/operation/DepthwiseConv.h  217
-rw-r--r--  compute/cker/include/cker/operation/FullyConnected.h  144
-rw-r--r--  compute/cker/include/cker/operation/Gather.h  78
-rw-r--r--  compute/cker/include/cker/operation/InstanceNorm.h  99
-rw-r--r--  compute/cker/include/cker/operation/Logistic.h  44
-rw-r--r--  compute/cker/include/cker/operation/MaxPool.h  98
-rw-r--r--  compute/cker/include/cker/operation/Pad.h  224
-rw-r--r--  compute/cker/include/cker/operation/SoftMax.h  130
-rw-r--r--  compute/cker/include/cker/operation/TransposeConv.h  135
-rw-r--r--  compute/cker/include/cker/operation/optimized/AveragePool.h  105
-rw-r--r--  compute/cker/include/cker/operation/optimized/MaxPool.h  97
-rw-r--r--  compute/cker/include/cker/operation/optimized/SoftMax.h  59
-rw-r--r--  compute/cker/include/cker/operation/reference/AveragePool.h  90
-rw-r--r--  compute/cker/include/cker/operation/reference/MaxPool.h  84
-rw-r--r--  compute/cker/include/cker/operation/reference/SoftMax.h  70
-rw-r--r--  compute/ncnn/CMakeLists.txt  34
-rw-r--r--  compute/ncnn/README.md  9
-rw-r--r--  compute/ncnn/include/ncnn/layer/binaryop.h  69
-rw-r--r--  compute/ncnn/include/ncnn/layer/instance_norm.h  59
-rw-r--r--  compute/ncnn/include/ncnn/mat.h  738
-rw-r--r--  compute/ncnn/include/ncnn/srcn/conv_type.h  74
-rw-r--r--  compute/ncnn/include/ncnn/srcn/srcn_conv.h  65
-rw-r--r--  compute/ncnn/src/layer/arm/neon_mathfun.h  315
-rw-r--r--  compute/ncnn/src/layer/binaryop.cc  1640
-rw-r--r--  compute/ncnn/src/layer/instance_norm.cc  371
-rw-r--r--  compute/ncnn/src/mat.cc  940
-rw-r--r--  compute/ncnn/src/srcn/common.h  162
-rw-r--r--  compute/ncnn/src/srcn/conv_sgemm_multithreads.cc  483
-rw-r--r--  compute/ncnn/src/srcn/conv_sgemm_multithreads.h  86
-rw-r--r--  compute/ncnn/src/srcn/conv_sgemm_singlethread.cc  366
-rw-r--r--  compute/ncnn/src/srcn/conv_sgemm_singlethread.h  73
-rw-r--r--  compute/ncnn/src/srcn/conv_sparse.cc  271
-rw-r--r--  compute/ncnn/src/srcn/conv_sparse.h  79
-rw-r--r--  compute/ncnn/src/srcn/conv_winograd.cc  341
-rw-r--r--  compute/ncnn/src/srcn/conv_winograd.h  72
-rw-r--r--  compute/ncnn/src/srcn/conv_winograd_batch.cc  304
-rw-r--r--  compute/ncnn/src/srcn/conv_winograd_batch.h  67
-rw-r--r--  compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc  387
-rw-r--r--  compute/ncnn/src/srcn/deconv_sgemm_multithreads.h  85
-rw-r--r--  compute/ncnn/src/srcn/depthwise_conv.cc  2684
-rw-r--r--  compute/ncnn/src/srcn/direct_conv_colmajor.cc  5872
-rw-r--r--  compute/ncnn/src/srcn/direct_conv_colmajor.h  33
-rw-r--r--  compute/ncnn/src/srcn/sgemm_kernel.cc  2508
-rw-r--r--  compute/ncnn/src/srcn/sgemm_kernel.h  52
-rw-r--r--  compute/ncnn/src/srcn/sgemm_pack.cc  2316
-rw-r--r--  compute/ncnn/src/srcn/sgemm_pack.h  73
-rw-r--r--  compute/ncnn/src/srcn/sgemm_singlethread.cc  689
-rw-r--r--  compute/ncnn/src/srcn/sgemm_singlethread.h  88
-rw-r--r--  compute/ncnn/src/srcn/sgemm_test.cc  1883
-rw-r--r--  compute/ncnn/src/srcn/srcn_conv.cc  614
-rw-r--r--  compute/ncnn/src/srcn/winograd.h  148
249 files changed, 51005 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt
new file mode 100644
index 000000000..aaebff758
--- /dev/null
+++ b/compute/ARMComputeEx/CMakeLists.txt
@@ -0,0 +1,32 @@
+nnas_find_package(ARMCompute QUIET)
+
+if(NOT ARMCompute_FOUND)
+ message(STATUS "Check ARM Compute library extension build: need ARM Compute library")
+ return()
+else(NOT ARMCompute_FOUND)
+ message(STATUS "Check ARM Compute library extension build: OK")
+endif(NOT ARMCompute_FOUND)
+
+set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR})
+
+file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp")
+
+# Generate embedded CL kernels
+execute_process (
+ WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+ COMMAND bash -c "python resolve_includes.py"
+)
+
+add_library(arm_compute_ex SHARED ${ACL_EX_SRCS})
+target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE})
+target_link_libraries(arm_compute_ex PRIVATE arm_compute)
+target_link_libraries(arm_compute_ex PRIVATE nnfw_common)
+target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage)
+# Defines to enable validate checks in debug builds
+target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS
+ $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED
+ ARM_COMPUTE_LOGGING_ENABLED>)
+# Validate check functions are not used in release builds
+# Some parameters are only used in validate check calls, so they may be unused in release builds
+target_compile_options(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:-Wno-unused-parameter -Wno-unused-function>)
+install(TARGETS arm_compute_ex DESTINATION lib)
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
new file mode 100644
index 000000000..e4e752ef9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLKernelLibraryEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines
+ * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL.
+ */
+
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+
+/**
+ * @brief Class to build OpenCL kernels added by nnfw
+ */
+class CLKernelLibraryEx
+{
+ using StringSet = std::set<std::string>;
+
+private:
+ /**
+ * @brief Construct a new CLKernelLibraryEx object
+ */
+ CLKernelLibraryEx();
+
+public:
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
+ CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
+ const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
+
+ /**
+ * @brief Get the KernelLibrary singleton.
+ * @return The KernelLibrary instance
+ */
+ static CLKernelLibraryEx &get();
+
+ /**
+ * @brief Initialise the kernel library.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @param[in] context CL context used to create programs.
+ * @param[in] device CL device for which the programs are created.
+ * @return N/A
+ */
+ void init(std::string kernel_path, cl::Context context, cl::Device device)
+ {
+ _kernel_path = std::move(kernel_path);
+ _context = std::move(context);
+ _device = std::move(device);
+ }
+
+ /**
+ * @brief Set the path that the kernels reside in.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @return N/A
+ */
+ void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; };
+
+ /**
+ * @brief Get the path that the kernels reside in.
+ * @return the path of kernel files
+ */
+ std::string get_kernel_path() { return _kernel_path; };
+
+ /**
+ * @brief Get the source of the selected program.
+ * @param[in] program_name Program name.
+ * @return Source of the selected program.
+ */
+ std::string get_program_source(const std::string &program_name);
+
+ /**
+ * @brief Set the CL context used to create programs.
+ * @note Setting the context also resets the device to the
+ * first one available in the new context.
+ * @param[in] context A CL context.
+ * @return N/A
+ */
+ void set_context(cl::Context context)
+ {
+ _context = std::move(context);
+ if (_context.get() == nullptr)
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+ if (cl_devices.empty())
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ _device = cl_devices[0];
+ }
+ }
+ }
+
+ /**
+ * @brief Return associated CL context.
+ * @return A CL context.
+ */
+ cl::Context &context() { return _context; }
+
+ /**
+ * @brief Set the CL device for which the programs are created.
+ * @param[in] device A CL device.
+ * @return N/A
+ */
+ void set_device(cl::Device device) { _device = std::move(device); }
+
+ /**
+ * @brief Gets the CL device for which the programs are created.
+ * @return A CL device.
+ */
+ cl::Device &get_device() { return _device; }
+
+ /**
+ * @brief Return the device version
+ * @return The content of CL_DEVICE_VERSION
+ */
+ std::string get_device_version();
+
+ /**
+ * @brief Create a kernel from the kernel library.
+ * @param[in] kernel_name Kernel name.
+ * @param[in] build_options_set Kernel build options as a set.
+ * @return The created kernel.
+ */
+ Kernel create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set = {}) const;
+
+ /**
+   * @brief Find the maximum number of local work items in a workgroup that can be supported
+   *        for the kernel.
+   * @param[in] kernel Kernel object
+   * @return Maximum number of local work items supported for the kernel
+   */
+
+ size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
+ /**
+ * @brief Return the default NDRange for the device.
+   * @return Default NDRange of the device
+ */
+ cl::NDRange default_ndrange() const;
+
+ /**
+ * @brief Clear the library's cache of binary programs
+ * @return N/A
+ */
+ void clear_programs_cache()
+ {
+ _programs_map.clear();
+ _built_programs_map.clear();
+ }
+
+ /**
+ * @brief Access the cache of built OpenCL programs
+   * @return Map of built programs, whose key is the program name and whose value is the
+   *         corresponding built cl::Program object
+ */
+ const std::map<std::string, cl::Program> &get_built_programs() const
+ {
+ return _built_programs_map;
+ }
+
+ /**
+ * @brief Add a new built program to the cache
+ * @param[in] built_program_name Name of the program
+ * @param[in] program Built program to add to the cache
+ * @return N/A
+ */
+ void add_built_program(const std::string &built_program_name, cl::Program program);
+
+ /**
+ * @brief Returns true if FP16 is supported by the CL device
+ * @return true if the CL device supports FP16
+ */
+ bool fp16_supported() const;
+
+ /**
+ * @brief Returns true if int64_base_atomics extension is supported by the CL device
+ * @return true if the CL device supports int64_base_atomics extension
+ */
+ bool int64_base_atomics_supported() const;
+
+private:
+ /**
+ * @brief Load program and its dependencies.
+ * @param[in] program_name Name of the program to load.
+ */
+ const Program &load_program(const std::string &program_name) const;
+ /**
+ * @brief Concatenates contents of a set into a single string.
+ * @param[in] s Input set to concatenate.
+ * @return Concatenated string.
+ */
+ std::string stringify_set(const StringSet &s) const;
+
+ cl::Context _context; /**< Underlying CL context. */
+ cl::Device _device; /**< Underlying CL device. */
+ std::string _kernel_path; /**< Path to the kernels folder. */
+ mutable std::map<std::string, const Program>
+ _programs_map; /**< Map with all already loaded program data. */
+ mutable std::map<std::string, cl::Program>
+ _built_programs_map; /**< Map with all already built program data. */
+ static const std::map<std::string, std::string>
+ _kernel_program_map; /**< Map that associates kernel names with programs. */
+ static const std::map<std::string, std::string>
+ _program_source_map; /**< Contains sources for all programs.
+                                Used for compile-time kernel inclusion. */
+};
+}
+#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
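A minimal usage sketch of the kernel library declared above. It assumes the default OpenCL context/device from the Khronos C++ bindings, and the kernel name and build option are illustrative only; none of this is prescribed by the patch itself.

#include "arm_compute/core/CL/CLKernelLibraryEx.h"

#include <set>
#include <string>

void build_embedded_kernel_example()
{
  auto &lib = arm_compute::CLKernelLibraryEx::get();

  // Bind the library to the default OpenCL context and device from the Khronos C++ bindings.
  // With EMBEDDED_KERNELS defined (see the CMakeLists.txt above), the kernel path acts mostly
  // as a fallback, since program sources are compiled into _program_source_map.
  lib.init("./cl_kernels/", cl::Context::getDefault(), cl::Device::getDefault());

  // Build one embedded kernel; the kernel name and the define are illustrative only.
  std::set<std::string> build_opts{"-DDATA_TYPE_IN=uchar"};
  auto kernel = lib.create_kernel("cast", build_opts);
  (void)kernel;

  if (lib.fp16_supported())
  {
    // An FP16 variant of the kernel could be requested here instead.
  }
}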
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
new file mode 100644
index 000000000..b98b174f7
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgOperationKernel.h
+ * @brief This file defines CLArgOperationKernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the argop kernel.
+ */
+class CLArgOperationKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Default constructor.
+ */
+ CLArgOperationKernel();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
+ */
+ CLArgOperationKernel(const CLArgOperationKernel &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
+ * @return Reference of this instance
+ */
+ CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
+ */
+ CLArgOperationKernel(CLArgOperationKernel &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
+ * @return Reference of this instance
+ */
+ CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default;
+ /**
+ * @brief Initialise the kernel's input, output and border mode.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[out] output The output tensor. Data types supported: S32.
+   * @param[in]  axis   Axis along which to reduce. It must be sorted and contain no duplicates.
+   * @param[in]  op     Arg operation to perform.
+   * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op);
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLArgOperationKernel
+ * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[in] output The output tensor info. Data types supported: S32.
+   * @param[in] axis   Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] op Arg operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ArgOperation op);
+
+  /**
+ * @brief Run CLArgOperationKernel op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ uint32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */
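The validate/configure/enqueue sketch below shows how a kernel like the one above is typically driven; with the obvious signature changes, the same pattern applies to the other ICLKernel subclasses added in this patch. Tensor shapes are illustrative, and the ArgOperation::MAX enumerator is assumed from TypesEx.h, which is not part of this excerpt.

#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void run_arg_operation_example()
{
  using namespace arm_compute;

  CLScheduler::get().default_init();

  // Reduce a 4-D tensor along axis 3; shapes and data types are illustrative.
  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(10U, 8U, 4U, 2U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(10U, 8U, 4U), 1, DataType::S32));

  // ArgOperation::MAX is assumed from TypesEx.h (not shown in this patch excerpt).
  ARM_COMPUTE_ERROR_THROW_ON(
      CLArgOperationKernel::validate(input.info(), output.info(), 3, ArgOperation::MAX));

  CLArgOperationKernel kernel;
  kernel.configure(&input, &output, 3, ArgOperation::MAX);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill `input` here ...

  // ICLKernel objects are executed through the scheduler, which enqueues the full window.
  CLScheduler::get().enqueue(kernel);
  CLScheduler::get().sync();
}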
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
new file mode 100644
index 000000000..ab33d9d3a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to compute element-wise truth values of two input tensors for a binary logical operation */
+class CLBinaryLogicalOpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBinaryLogicalOpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input1 Source tensor1.
+ * @param[in] input2 Source tensor2.
+   * @param[out] output Output tensor.
+   * @param[in]  op     Binary logical operation to perform.
+   */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
new file mode 100644
index 000000000..16cef0b61
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLCastKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLCastKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
+#define __ARM_COMPUTE_CLCASTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define OpenCL kernel for cast operation
+ */
+class CLCastKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct CLCastKernel object
+ */
+ CLCastKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLCastKernel(const CLCastKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLCastKernel &operator=(const CLCastKernel &) = delete;
+
+ /**
+ * @brief Construct CLCastKernel object using default move constructor
+ * @param[in] CLCastKernel object to move
+ */
+ CLCastKernel(CLCastKernel &&) = default;
+
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param[in] CLCastKernel object to move
+ */
+ CLCastKernel &operator=(CLCastKernel &&) = default;
+
+ /**
+ * @brief Destruct this CLCastKernel object
+ */
+ ~CLCastKernel() = default;
+
+ /**
+ * @brief Initialise the kernel's input and output.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] input_subtype Sub data type of input.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
new file mode 100644
index 000000000..60ec7a82a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a DepthToSpace operation */
+class CLDepthToSpaceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLDepthToSpaceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default;
+ /** Default destructor */
+ ~CLDepthToSpaceKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output     Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  block_size Block size used to rearrange depth data into spatial blocks.
+   */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */
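For reference, the rearrangement itself can be written as the scalar sketch below. It uses the DCR channel ordering on an NHWC buffer, as commonly defined by TensorFlow/Android NN; the header above does not pin down this CL kernel's exact layout handling, so treat the index math as an illustration of the operation rather than of the kernel.

#include <cstddef>
#include <vector>

// DepthToSpace reference in the DCR convention: the input channel index decomposes as
// c = (di * block + dj) * C_out + c_out, and (di, dj) select the position inside the
// block_size x block_size spatial block written to the output.
void depth_to_space_reference(const std::vector<float> &in, std::vector<float> &out, size_t H,
                              size_t W, size_t C_in, size_t block)
{
  const size_t C_out = C_in / (block * block);
  out.resize(H * block * W * block * C_out);

  for (size_t h = 0; h < H; ++h)
    for (size_t w = 0; w < W; ++w)
      for (size_t c = 0; c < C_in; ++c)
      {
        const size_t di = (c / C_out) / block; // row offset inside the block
        const size_t dj = (c / C_out) % block; // column offset inside the block
        const size_t c_out = c % C_out;        // output channel
        const size_t oh = h * block + di;
        const size_t ow = w * block + dj;
        out[(oh * W * block + ow) * C_out + c_out] = in[(h * W + w) * C_in + c];
      }
}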
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
new file mode 100644
index 000000000..da075db69
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLEmbeddingLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform EmbeddingLookup operation with an OpenCL kernel
+*/
+class CLEmbeddingLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object
+ * */
+ CLEmbeddingLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLEmbeddingLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+ * @param[in] input Source tensor.
+ * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+   * @param[in]  lookups Lookups is a 1D tensor whose values are indices into the first
+ * dimension of input.
+ * Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLEmbeddingLookupKernel
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in]  output  The output tensor info. Data types supported: same as @p input.
+ * @param[in] lookups Lookups info. Data types supported: S32.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /** Source tensor */
+ ICLTensor *_output; /** Destination tensor */
+ const ICLTensor *_lookups; /** Lookups tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
new file mode 100644
index 000000000..aa81a1efa
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLGatherExKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLGatherExKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__
+#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define an interface for the gather kernel.
+ */
+class CLGatherExKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct CLGatherExKernel object
+ * */
+ CLGatherExKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
+ CLGatherExKernel(const CLGatherExKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
+ CLGatherExKernel &operator=(const CLGatherExKernel &) = delete;
+
+ /**
+ * @brief Construct CLGatherExKernel object by using default move constructor
+ * @param[in] CLGatherExKernel object to move
+ */
+ CLGatherExKernel(CLGatherExKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLGatherExKernel object to move
+ */
+ CLGatherExKernel &operator=(CLGatherExKernel &&) = default;
+
+ /**
+ * @brief Initialise the kernel's input, output and border mode.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices Indices tensor. Data types supported: S32.
+   * @param[out] output  The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative
+ * values wrap around. Defaults to 0
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLGatherExKernel
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices Indices tensor. Data types supported: S32.
+   * @param[out] output  The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative
+ * values wrap around. Defaults to 0
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis = 0);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_indices;
+ ICLTensor *_output;
+ int _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
new file mode 100644
index 000000000..8269e5a7a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLHashtableLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform HashtableLookup operation with an OpenCL kernel
+*/
+class CLHashtableLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLHashtableLookupKernel object
+ * */
+ CLHashtableLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLHashtableLookupKernel object by using default move constructor
+ * @param[in] CLHashtableLookupKernel object to move
+ * */
+ CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLHashtableLookupKernel object to move
+ * */
+ CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLHashtableLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+   * @param[in]  lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLHashtableLookupKernel
+ * @param[in] lookups The lookups tensor info. Data types supported: S32.
+ * @param[in] keys The keys tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output The output tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup
+ * hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_lookups{nullptr}; /** Lookups tensor */
+ const ICLTensor *_keys{nullptr}; /** Keys tensor */
+ const ICLTensor *_input{nullptr}; /** Source tensor */
+ ICLTensor *_output{nullptr}; /** Destination tensor */
+ ICLTensor *_hits{nullptr}; /** Hits tensor */
+ std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /** Lookup indices tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
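As a semantic reference for the kernel above, the scalar sketch below mirrors what its documentation describes: each value in lookups is searched for in keys; on a hit the matching row of input is copied to output and the hit flag is set, otherwise the row is zeroed. Row-major float rows are an assumption made purely for illustration.

#include <cstdint>
#include <cstring>
#include <vector>

// Scalar reference of the HashtableLookup semantics documented above. `input` holds
// keys.size() rows of `row_size` floats; `output` gets one row per lookup value.
void hashtable_lookup_reference(const std::vector<int32_t> &lookups,
                                const std::vector<int32_t> &keys,
                                const std::vector<float> &input, size_t row_size,
                                std::vector<float> &output, std::vector<uint8_t> &hits)
{
  output.assign(lookups.size() * row_size, 0.0f);
  hits.assign(lookups.size(), 0);

  for (size_t i = 0; i < lookups.size(); ++i)
  {
    for (size_t k = 0; k < keys.size(); ++k)
    {
      if (keys[k] == lookups[i])
      {
        // Hit: copy the matching row and flag it; missed rows stay zeroed with hit = 0.
        std::memcpy(&output[i * row_size], &input[k * row_size], row_size * sizeof(float));
        hits[i] = 1;
        break;
      }
    }
  }
}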
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 000000000..f5e147e03
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for performing an instance normalization */
+class CLInstanceNormalizationLayerKernelEx : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLInstanceNormalizationLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLInstanceNormalizationLayerKernelEx &
+ operator=(const CLInstanceNormalizationLayerKernelEx &) = delete;
+ /** Default Move Constructor. */
+ CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default;
+ /** Default move assignment operator */
+ CLInstanceNormalizationLayerKernelEx &
+ operator=(CLInstanceNormalizationLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~CLInstanceNormalizationLayerKernelEx() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported:
+ * NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+ ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLInstanceNormalizationLayerEx.
+ *
+   * @param[in] input Source tensor info. If @p output is nullptr, this tensor will store the
+   *                  result of the normalization.
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ ICLTensor *_gamma;
+ ICLTensor *_beta;
+ float _epsilon;
+ bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
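
A minimal host-side usage sketch for the kernel declared above (illustrative only, not part of the patch). It assumes the OpenCL runtime has already been initialised (e.g. via CLScheduler::get().default_init()) and that the extended kernel sources are available; the shape and epsilon are placeholders.

#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void instance_norm_ex_sketch()
{
  using namespace arm_compute;

  CLTensor input, output;
  // NCHW tensor: 8x8 spatial, 3 channels, 1 batch (placeholder shape).
  const TensorInfo info(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32);
  input.allocator()->init(info);
  output.allocator()->init(info);

  CLInstanceNormalizationLayerKernelEx kernel;
  // gamma/beta stay nullptr: no scale/offset is applied after normalization.
  kernel.configure(&input, &output, nullptr, nullptr, 1e-12f);

  input.allocator()->allocate();
  output.allocator()->allocate();

  // In practice this kernel is driven by the CLInstanceNormalizationLayerEx function;
  // enqueueing it directly also works once it has been configured.
  CLScheduler::get().enqueue(kernel);
}
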
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
new file mode 100644
index 000000000..ccbea147e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
+#define __ARM_COMPUTE_CLNEGKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a negation operation on a tensor */
+class CLNegKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLNegKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel(const CLNegKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel &operator=(const CLNegKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLNegKernel(CLNegKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLNegKernel &operator=(CLNegKernel &&) = default;
+  /** Initialize the kernel's input and output.
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */
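
Usage sketch for CLNegKernel (illustrative, not part of the patch); the F32 data type and 16x16 shape are assumptions, and the CL runtime is expected to be initialised beforehand.

#include "arm_compute/core/CL/kernels/CLNegKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void neg_kernel_sketch()
{
  using namespace arm_compute;

  CLTensor input, output;
  const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
  input.allocator()->init(info);
  output.allocator()->init(info);

  CLNegKernel neg;
  neg.configure(&input, &output); // element-wise: output = -input

  input.allocator()->allocate();
  output.allocator()->allocate();
  CLScheduler::get().enqueue(neg);
}
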
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
new file mode 100644
index 000000000..eff1b8bd5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
+#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to calculate PReLU */
+class CLPReLUKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPReLUKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPReLUKernel(const CLPReLUKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPReLUKernel(CLPReLUKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
+  /** Initialize the kernel's input and output.
+   *
+   * @param[in] input Source tensor.
+   * @param[in] alpha Alpha tensor (slope applied to negative input values).
+   * @param[out] output Output tensor.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_alpha;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
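
Usage sketch for CLPReLUKernel (illustrative, not part of the patch). The alpha tensor is given the same shape as the input for simplicity; whether and how alpha may broadcast is not asserted here.

#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void prelu_kernel_sketch()
{
  using namespace arm_compute;

  CLTensor input, alpha, output;
  const TensorInfo info(TensorShape(32U, 32U, 8U), 1, DataType::F32);
  input.allocator()->init(info);
  alpha.allocator()->init(info); // per-element negative slopes
  output.allocator()->init(info);

  CLPReLUKernel prelu;
  prelu.configure(&input, &alpha, &output);

  input.allocator()->allocate();
  alpha.allocator()->allocate();
  output.allocator()->allocate();
  CLScheduler::get().enqueue(prelu);
}
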
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
new file mode 100644
index 000000000..a26a4a7fc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperationKernel.h
+ * @brief This file defines CLReduceOperationKernel class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the reduce operation kernel
+ */
+class CLReduceOperationKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Default constructor
+ */
+ CLReduceOperationKernel();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Default destructor
+ */
+ ~CLReduceOperationKernel() = default;
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+ ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperationKernel.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReduceOperation op);
+
+  /**
+   * @brief Run CLReduceOperationKernel op
+   * @param[in] window Window to be used for in_slice
+   * @param[in] queue cl::CommandQueue
+   * @return N/A
+   */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ uint32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */
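
Usage sketch for CLReduceOperationKernel (illustrative, not part of the patch). ReduceOperation::SUM is assumed to be one of the enumerators declared in TypesEx.h; the reduced axis collapses to size 1, matching the "same number of dimensions" note above.

#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void reduce_kernel_sketch()
{
  using namespace arm_compute;

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));

  CLReduceOperationKernel reduce;
  reduce.configure(&input, &output, 0U /* axis */, ReduceOperation::SUM); // SUM is assumed

  input.allocator()->allocate();
  output.allocator()->allocate();
  CLScheduler::get().enqueue(reduce);
}
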
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
new file mode 100644
index 000000000..577e38cc4
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation */
+class CLSpaceToBatchNDKernel final : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToBatchNDKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToBatchNDKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Block size tensor. Data types supported: S32.
+ * @param[in] padding_size Padding size tensor. Data types supported: S32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+ ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input{nullptr}; /**< Source tensor */
+ const ICLTensor *_block_size{nullptr}; /**< Block size tensor */
+ const ICLTensor *_padding_size{nullptr}; /**< Padding size tensor */
+ ICLTensor *_output{nullptr}; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */
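
Usage sketch for CLSpaceToBatchNDKernel (illustrative, not part of the patch). The shapes of the block-size and padding-size tensors are assumptions based on the usual SPACE_TO_BATCH_ND definition: one S32 block size per spatial dimension and one [before, after] pad pair per spatial dimension.

#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void space_to_batch_nd_sketch()
{
  using namespace arm_compute;

  CLTensor input, block_size, padding_size, output;
  // NCHW: 4x4 spatial, 1 channel, 1 batch; a 2x2 block and zero padding yield a 2x2x1x4 output.
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));
  block_size.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
  padding_size.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::S32));

  CLSpaceToBatchNDKernel s2b;
  s2b.configure(&input, &block_size, &padding_size, &output);

  input.allocator()->allocate();
  block_size.allocator()->allocate();
  padding_size.allocator()->allocate();
  output.allocator()->allocate();
  CLScheduler::get().enqueue(s2b);
}
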
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
new file mode 100644
index 000000000..be845a549
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_DEPTH operation */
+class CLSpaceToDepthKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToDepthKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToDepthKernel() = default;
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in] block_size Size of the spatial block that is moved into the depth (channel) dimension.
+   */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
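
Usage sketch for CLSpaceToDepthKernel (illustrative, not part of the patch); shapes follow the standard space-to-depth relation, where block_size 2 turns a 4x4x1 NCHW input into a 2x2x4 output.

#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void space_to_depth_sketch()
{
  using namespace arm_compute;

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U, 1U), 1, DataType::F32));

  CLSpaceToDepthKernel s2d;
  s2d.configure(&input, &output, 2 /* block_size */);

  input.allocator()->allocate();
  output.allocator()->allocate();
  CLScheduler::get().enqueue(s2d);
}
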
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
new file mode 100644
index 000000000..8da2daecc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLTopKV2Kernel.h
+ * @brief This file defines classes for TopKV2Kernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+// these parameters can be changed
+#define _ITEMS 16 // number of items in a group
+#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS
+#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram
+#define PERMUT // store the final permutation
+////////////////////////////////////////////////////////
+
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+// Invalid result on GPU
+#if 0
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define CLTopKV2Single
+ */
+class CLTopKV2Single : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2Single();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ */
+ CLTopKV2Single(const CLTopKV2Single &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ */
+ CLTopKV2Single(CLTopKV2Single &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+ * @param[in] topk_values Values of the top k predictions
+ * @param[in] topk_indices Indices of the top k predictions
+ * @param[in] indices Indices
+ * @param[in] temp_stack Temp stack
+ * @param[in] k K of the top k predictions
+   * @param[in] n Number of elements to sort
+   * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n);
+
+  /**
+ * @brief Run CLTopKV2Single op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_topk_values;
+ ICLTensor *_topk_indices;
+};
+
+/**
+ * @brief Class to define CLTopKV2Init
+ */
+class CLTopKV2Init : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2Init();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ */
+ CLTopKV2Init(const CLTopKV2Init &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ */
+ CLTopKV2Init(CLTopKV2Init &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+ * @param[in] in_key_buf Buffer of input key
+ * @param[in] in_ind_buf Buffer of input index
+   * @param[in] n Number of elements to sort
+   * @return N/A
+ */
+ void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n);
+
+  /**
+ * @brief Run CLTopKV2Init op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+};
+
+/**
+ * @brief Class to define CLRadixSortHistogram
+ */
+class CLRadixSortHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ */
+ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ */
+ CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+   * @param[in] n Number of integers to sort
+   * @return N/A
+ */
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ /**
+ * @brief Set pass
+   * @param[in] pass Index of the current radix sort pass
+ * @param[in] in_key_buf Buffer of input key
+   * @return N/A
+ */
+ void setPass(int pass, cl::Buffer *in_key_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ }
+
+  /**
+ * @brief Run CLRadixSortHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+};
+
+/**
+ * @brief Class to define CLRadixSortScanHistogram
+ */
+class CLRadixSortScanHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortScanHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ */
+ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ */
+ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+   * @return N/A
+ */
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+  /**
+ * @brief Run CLRadixSortScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/**
+ * @brief Class to define CLRadixSortGlobalScanHistogram
+ */
+class CLRadixSortGlobalScanHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortGlobalScanHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ */
+ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ */
+ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram
+ * @param[in] bits Number of bits to be used for radix sort
+   * @return N/A
+ */
+ void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits);
+
+  /**
+ * @brief Run CLRadixSortGlobalScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/**
+ * @brief Class to define CLRadixSortPasteHistogram
+ */
+class CLRadixSortPasteHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortPasteHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ */
+ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ */
+ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+   * @return N/A
+ */
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+  /**
+ * @brief Run CLRadixSortPasteHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/**
+ * @brief Class to define CLRadixSortReorder
+ */
+class CLRadixSortReorder : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortReorder();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ */
+ CLRadixSortReorder(const CLRadixSortReorder &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ */
+ CLRadixSortReorder(CLRadixSortReorder &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+   * @param[in] n Number of integers to sort
+   * @return N/A
+ */
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ /**
+ * @brief Set pass
+   * @param[in] pass Index of the current radix sort pass
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+   * @return N/A
+ */
+ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+  /**
+ * @brief Run CLRadixSortReorder op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+/**
+ * @brief Class to define CLTopKV2FindFirstNegative
+ */
+class CLTopKV2FindFirstNegative : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2FindFirstNegative();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ */
+ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ */
+ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+   * @param[in] n Number of elements to search
+   * @return N/A
+ */
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ /**
+ * @brief Set output buffer
+ * @param[out] out_key_buf Buffer of output key
+   * @return N/A
+ */
+ void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; }
+
+  /**
+ * @brief Run CLTopKV2FindFirstNegative op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_out_key_buf;
+};
+
+/**
+ * @brief Class to define CLTopKV2ReorderNegatives
+ */
+class CLTopKV2ReorderNegatives : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2ReorderNegatives();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ */
+ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ */
+ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+   * @param[in] n Number of elements to reorder
+   * @return N/A
+ */
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ /**
+ * @brief Set buffers
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+   * @return N/A
+ */
+ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+
+  /**
+ * @brief Run CLTopKV2ReorderNegatives op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+/**
+ * @brief Class to define CLTopKV2Store
+ */
+class CLTopKV2Store : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2Store();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ */
+ CLTopKV2Store(const CLTopKV2Store &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ */
+ CLTopKV2Store(CLTopKV2Store &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] values Values tensor to store
+ * @param[out] indices Indices tensor to be used for store
+ * @param[in] k K of the top k predictions
+ * @param[in] n Number times to store
+   * @return N/A
+ */
+ void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
+
+ /**
+ * @brief Set buffers
+ * @param[out] out_key_buf Buffer of output key
+ * @param[out] out_ind_buf Buffer of output index
+   * @return N/A
+ */
+ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
+
+  /**
+ * @brief Run CLTopKV2Store op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_values;
+ ICLTensor *_indices;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+} // namespace arm_compute
+#endif // Disable GPU implementation
+#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__
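
The classes above are compiled out ("#if 0"), so no example can be built against this header. The comment-only sketch below records how the host side would presumably chain the radix-sort kernels per pass, inferred from the setPass()/setBuffers() signatures; it is an assumption, not verified behaviour.

// Hypothetical per-pass sequencing for the (currently disabled) GPU TopKV2 path:
//
//   for (int pass = 0; pass < total_passes; ++pass)
//   {
//     histogram.setPass(pass, in_key_buf);                 // count digit occurrences
//     reorder.setPass(pass, in_key_buf, out_key_buf,       // scatter keys and indices
//                     in_ind_buf, out_ind_buf);
//     // enqueue: histogram -> scan -> global scan -> paste -> reorder
//     std::swap(in_key_buf, out_key_buf);                  // ping-pong the buffers
//     std::swap(in_ind_buf, out_ind_buf);
//   }
//
// After the passes, CLTopKV2FindFirstNegative / CLTopKV2ReorderNegatives would handle
// the signed split and CLTopKV2Store would write the final top-k values and indices.
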
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
new file mode 100644
index 000000000..c5ef730b6
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL.
+ */
+class CLTransposeConvLayerUpsampleKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTransposeConvLayerUpsampleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayerUpsampleKernel &
+ operator=(const CLTransposeConvLayerUpsampleKernel &) = delete;
+ /** Default Move Constructor. */
+ CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default;
+ /** Default move assignment operator */
+ CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default;
+ /** Default destructor */
+ ~CLTransposeConvLayerUpsampleKernel() = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data types supported: same as @p input. All but
+ * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only
+ * performed within the XY-plane.
+ * @param[in] inner_border Top and right inner border sizes. These rows and columns will be
+ * filled with zero.
+ * @param[in] info Contains padding and stride information described in @ref
+ * PadStrideInfo.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+ const PadStrideInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLTransposeConvLayerUpsample
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data types supported: same as @p input. All
+ * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is
+ * only performed within the XY-plane.
+ * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled
+ * with zero.
+ * @param[in] info Contains padding and stride information described in @ref
+ * PadStrideInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const BorderSize &inner_border, const PadStrideInfo &info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ BorderSize _inner_border;
+ PadStrideInfo _info;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */
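
A validation-first usage sketch for the upsample kernel above (illustrative, not part of the patch). The output shape is a placeholder; in practice it is derived by the owning transpose-convolution function, so validate() is checked before any configure()/enqueue.

#include <cstdio>

#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
#include "arm_compute/core/TensorInfo.h"

void transpose_conv_upsample_sketch()
{
  using namespace arm_compute;

  const TensorInfo in_info(TensorShape(4U, 4U, 3U), 1, DataType::F32);
  const TensorInfo out_info(TensorShape(8U, 8U, 3U), 1, DataType::F32); // placeholder shape

  const Status status = CLTransposeConvLayerUpsampleKernel::validate(
      &in_info, &out_info, BorderSize(0), PadStrideInfo(2, 2, 0, 0));

  if (!bool(status))
  {
    // Shape/stride combination rejected: report and bail out instead of configuring.
    std::printf("%s\n", status.error_description().c_str());
  }
}
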
diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
new file mode 100644
index 000000000..d093c22cb
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
+#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** CPP kernel to perform tensor upsample.
+ *
+ */
+class CPPUpsampleKernelEx : public ICPPKernel
+{
+public:
+ const char *name() const override { return "CPPUpsampleKernelEx"; }
+ /** Default constructor */
+ CPPUpsampleKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default;
+ /** Default destructor */
+ ~CPPUpsampleKernelEx() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] info Padding info.
+ */
+ void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+ bool is_parallelisable() const override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ PadStrideInfo _info;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */
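
Usage sketch for CPPUpsampleKernelEx (illustrative, not part of the patch). The output shape is a placeholder normally computed by the owning transpose-convolution function, and the stride/padding values are assumptions.

#include "arm_compute/core/Window.h"
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
#include "arm_compute/runtime/Scheduler.h"
#include "arm_compute/runtime/Tensor.h"

void cpp_upsample_sketch()
{
  using namespace arm_compute;

  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32)); // placeholder

  CPPUpsampleKernelEx upsample;
  upsample.configure(&input, &output, PadStrideInfo(2, 2, 0, 0));

  input.allocator()->allocate();
  output.allocator()->allocate();
  Scheduler::get().schedule(&upsample, Window::DimY);
}
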
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
new file mode 100644
index 000000000..358e0ebc6
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
+#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+class ITensor;
+class Window;
+class QuantizationInfo;
+} // namespace arm_compute
+
+namespace arm_compute
+{
+
+float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
+ const float32x4_t &scale);
+
+void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset,
+ const float32x4_t &invscale);
+
+float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale);
+
+void elementwise_op_quantized(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
+ int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t,
+ float32x4_t, float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t,
+ int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t));
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ float (*scalar_func)(const float &, const float &),
+ int (*broadcast_func)(int, int, int, const float *, const float &, float *,
+ const bool),
+ int (*neon_func)(int, int, int, const float *, const float *, float *));
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
+ int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
+ uint8_t *, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *));
+} // namespace arm_compute
+#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
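
A sketch of the function-pointer triple expected by the float overload of elementwise_op() above, using element-wise maximum as the operation. The exact contract (the meaning of the int return value, of window_step_x and of the reorder flag) is inferred from the common ACL elementwise pattern and is therefore an assumption.

#include <algorithm>
#include <arm_neon.h>

#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"

namespace
{
// Scalar fallback, used for the tail elements of each row.
float scalar_max(const float &a, const float &b) { return std::max(a, b); }

// Vectorised loop for the case where one operand is a broadcast scalar. The returned x is
// assumed to be the position at which the vector loop stopped, so the caller can finish the
// tail with scalar_max. The reorder flag is ignored because max is commutative, and
// window_step_x is assumed to equal the 4-float NEON vector width.
int broadcast_max(int window_start_x, int window_end_x, int window_step_x,
                  const float *input1_ptr, const float &broadcast_value, float *output_ptr,
                  const bool /*reorder*/)
{
  int x = window_start_x;
  const float32x4_t bv = vdupq_n_f32(broadcast_value);
  for (; x <= window_end_x - window_step_x; x += window_step_x)
  {
    vst1q_f32(output_ptr + x, vmaxq_f32(vld1q_f32(input1_ptr + x), bv));
  }
  return x;
}

// Vectorised loop for the general (same-shape) case.
int neon_max(int window_start_x, int window_end_x, int window_step_x, const float *input1_ptr,
             const float *input2_ptr, float *output_ptr)
{
  int x = window_start_x;
  for (; x <= window_end_x - window_step_x; x += window_step_x)
  {
    vst1q_f32(output_ptr + x, vmaxq_f32(vld1q_f32(input1_ptr + x), vld1q_f32(input2_ptr + x)));
  }
  return x;
}
} // namespace

// A kernel's run() body could then dispatch one call over its window:
//   arm_compute::elementwise_op(in1, in2, out, window, &scalar_max, &broadcast_max, &neon_max);
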
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
new file mode 100644
index 000000000..61992bd50
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+
+class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel
+{
+public:
+ /** Default destructor */
+ ~NEBinaryLogicalOperationKernel() = default;
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] op Binary logical operation to be executed.
+ * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8.
+ * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2,
+ ITensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] op Binary logical operation to be executed.
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a Status
+ */
+ static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1,
+ const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+ // Inherited methods overridden:
+ static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
+ const ITensorInfo &output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */
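
Usage sketch for NEBinaryLogicalOperationKernel (illustrative, not part of the patch); BinaryLogicalOperation::AND is assumed to be one of the enumerators in TypesEx.h.

#include "arm_compute/core/Window.h"
#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

void binary_logical_sketch()
{
  using namespace arm_compute;

  Tensor a, b, out;
  const TensorInfo info(TensorShape(64U), 1, DataType::U8);
  a.allocator()->init(info);
  b.allocator()->init(info);
  out.allocator()->init(info);

  NEBinaryLogicalOperationKernel logical;
  logical.configure(BinaryLogicalOperation::AND, &a, &b, &out); // AND is assumed

  a.allocator()->allocate();
  b.allocator()->allocate();
  out.allocator()->allocate();
  NEScheduler::get().schedule(&logical, Window::DimY);
}
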
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
new file mode 100644
index 000000000..fd2a2ee3b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECASTKERNEL_H__
+#define __ARM_COMPUTE_NECASTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the cast layer kernel. */
+class NECastKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NECastKernel"; }
+ /** Default constructor */
+ NECastKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECastKernel(const NECastKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECastKernel &operator=(const NECastKernel &) = delete;
+ /** Default Move Constructor. */
+ NECastKernel(NECastKernel &&) = default;
+ /** Default move assignment operator */
+ NECastKernel &operator=(NECastKernel &&) = default;
+ /** Default destructor */
+ ~NECastKernel() = default;
+ /** Set input, output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
+   * @param[out] output Destination tensor with the same dimensions as @p input. Data type supported:
+ * U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] input_subtype Sub data type of input.
+ */
+ void configure(const ITensor *input, ITensor *output, SubDataType input_subtype);
+ /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] input_subtype Sub data type of input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ SubDataType _input_subtype;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */
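
Usage sketch for NECastKernel (illustrative, not part of the patch); SubDataType::NONE is assumed to be the "no sub-type" enumerator in TypesEx.h.

#include "arm_compute/core/Window.h"
#include "arm_compute/core/NEON/kernels/NECastKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

void cast_kernel_sketch()
{
  using namespace arm_compute;

  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::S32));
  output.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

  NECastKernel cast;
  cast.configure(&input, &output, SubDataType::NONE); // NONE is assumed

  input.allocator()->allocate();
  output.allocator()->allocate();
  NEScheduler::get().schedule(&cast, Window::DimY);
}
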
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
new file mode 100644
index 000000000..5b6ef6bfb
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
+#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depth to space kernel */
+class NEDepthToSpaceLayerKernelEx : public INEKernel
+{
+public:
+ const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; }
+ /** Default constructor */
+ NEDepthToSpaceLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~NEDepthToSpaceLayerKernelEx() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value.
+ */
+ void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEDepthToSpaceLayerKernelEx.
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */
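
As a quick reading of what the kernel computes shape-wise: with block size b, the channel dimension shrinks by b*b while both spatial dimensions grow by b. A small sketch of that bookkeeping (NCHW order assumed for illustration; the data movement itself is not shown):

    #include <array>
    #include <cassert>

    // [N, C, H, W] with block b  ->  [N, C / (b * b), H * b, W * b]
    std::array<int, 4> depth_to_space_shape(const std::array<int, 4> &nchw, int block)
    {
      assert(block >= 2 && nchw[1] % (block * block) == 0);
      return {{nchw[0], nchw[1] / (block * block), nchw[2] * block, nchw[3] * block}};
    }
    // e.g. depth_to_space_shape({{1, 8, 3, 3}}, 2) -> {1, 2, 6, 6}
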
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
new file mode 100644
index 000000000..d6fad1155
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
+#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for an element-wise unary operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ output(x) = OP(input(x))@f]
+ *
+ */
+class NEElementwiseUnaryKernelEx : public INEKernel
+{
+public:
+ const char *name() const override { return "NEElementwiseUnaryKernelEx"; }
+ /** Default constructor */
+ NEElementwiseUnaryKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default;
+ /** Default destructor */
+ ~NEElementwiseUnaryKernelEx() = default;
+
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] op Arithmetic operation to be executed.
+ * @param[in] input First tensor input. Data types supported: F16/F32/S32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEElementwiseUnaryKernelEx
+ *
+ * @param[in] op Arithmetic operation to be executed.
+ * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ *
+ * @return a Status
+ */
+ static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input,
+ const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+ /** Common signature for all the specialised arithmetic functions
+ *
+ * @param[in] input An input tensor. Data types supported: F16/F32/S32.
+ * @param[out] output The output tensor. Data types supported: Same as @p input.
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output,
+ const Window &window);
+
+protected:
+ // Inherited methods overridden:
+ static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output);
+
+ /** Function to use for the particular tensor types passed to configure() */
+ std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function;
+
+ const ITensor *_input;
+ ITensor *_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */
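
The _function member above follows a common pattern in these kernels: configure() selects a type- and operation-specialised routine once, and run() only forwards to it. A stripped-down sketch of that dispatch idea (illustrative only, not this kernel's actual implementation):

    #include <cstddef>
    #include <functional>

    enum class UnaryOp { NEG };

    class UnaryDispatcher
    {
    public:
      // Bind the specialised routine once, at configure time.
      void configure(UnaryOp op)
      {
        switch (op)
        {
          case UnaryOp::NEG:
            _fn = [](const float *in, float *out, std::size_t n) {
              for (std::size_t i = 0; i < n; ++i)
                out[i] = -in[i]; // output(x) = OP(input(x)) with OP = negation
            };
            break;
        }
      }
      // run() just invokes whatever configure() selected.
      void run(const float *in, float *out, std::size_t n) { _fn(in, out, n); }

    private:
      std::function<void(const float *, float *, std::size_t)> _fn;
    };
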
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
new file mode 100644
index 000000000..1490e75f2
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform EmbeddingLookup operation */
+class NEEmbeddingLookupKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEEmbeddingLookupKernel"; }
+ /** Default constructor */
+ NEEmbeddingLookupKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] lookups Lookups is a 1D tensor whose values are indices into the first dimension
+ * of @p input. Data types supported: S32.
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEEmbeddingLookupKernel
+ *
+ * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] lookups Lookups info. Data types supported: S32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ const ITensor *_lookups;
+ ITensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */
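
Semantically, EmbeddingLookup selects whole slices of @p input by the indices in @p lookups. A minimal scalar reference on a row-major 2-D table, assuming all indices are valid:

    #include <cstddef>
    #include <vector>

    // out row i is table row lookups[i]; table is row-major with num_rows * row_len elements.
    std::vector<float> embedding_lookup(const std::vector<float> &table, std::size_t row_len,
                                        const std::vector<int> &lookups)
    {
      std::vector<float> out(lookups.size() * row_len);
      for (std::size_t i = 0; i < lookups.size(); ++i)
        for (std::size_t j = 0; j < row_len; ++j)
          out[i * row_len + j] = table[static_cast<std::size_t>(lookups[i]) * row_len + j];
      return out;
    }
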
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
new file mode 100644
index 000000000..3fa9c6e9a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
+#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform the gather operation on NEON */
+class NEGatherKernelEx : public INEKernel
+{
+public:
+ /** Default constructor. */
+ NEGatherKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEGatherKernelEx(const NEGatherKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete;
+ /** Allow instances of this class to be moved. */
+ NEGatherKernelEx(NEGatherKernelEx &&) = default;
+ /** Allow instances of this class to be moved. */
+ NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default;
+ /** Default destructor */
+ ~NEGatherKernelEx() = default;
+
+ /** Name of the kernel
+ *
+ * @return Kernel name
+ */
+ const char *name() const override { return "NEGatherKernelEx"; }
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values
+ * wrap around. Defaults to 0
+ */
+ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGatherKernelEx
+ *
+ * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values
+ * wrap around. Defaults to 0
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Implementation of the gather operation for 0 axis.
+ *
+ * For gather on the 0 axis an element by element copy is performed.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info);
+
+ /** Implementation of the gather operation.
+ *
+ * For axis >= 1, a row-wise copy is performed.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info);
+
+ using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info);
+
+ const ITensor *_input;
+ const ITensor *_indices;
+ int _axis;
+ ITensor *_output;
+ kernel_ptr _func;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */
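
Gather with axis 0 behaves like the embedding lookup sketched earlier (row selection); gathering along a higher axis selects along that dimension instead, which is what the row-wise gather_n_axis() path handles. A scalar sketch for axis 1 of a row-major 2-D input with 1-D indices:

    #include <cstddef>
    #include <vector>

    // out has shape [rows][indices.size()]: out[r][k] = in[r][indices[k]].
    std::vector<int> gather_axis1(const std::vector<int> &in, std::size_t rows, std::size_t cols,
                                  const std::vector<int> &indices)
    {
      std::vector<int> out(rows * indices.size());
      for (std::size_t r = 0; r < rows; ++r)
        for (std::size_t k = 0; k < indices.size(); ++k)
          out[r * indices.size() + k] = in[r * cols + static_cast<std::size_t>(indices[k])];
      return out;
    }
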
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
new file mode 100644
index 000000000..d8976e7d0
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HashtableLookup operation */
+class NEHashtableLookupKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEHashtableLookupKernel"; }
+ /** Default constructor */
+ NEHashtableLookupKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default;
+ /** Initialize the kernel's inputs, outputs.
+ *
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * @p input. Data types supported: S32
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ */
+ void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
+ ITensor *hits);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEHashtableLookupKernel
+ *
+ * @param[in] lookups The lookups tensor info. Data types supported: S32.
+ * @param[in] keys The keys tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] output The output tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] hits The hits tensor info. A boolean tensor that indicates whether the lookup
+ * hits (True) or not (False). Data types supported: U8/QASYMM8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_lookups; /** Lookups tensor */
+ const ITensor *_keys; /** Keys tensor */
+ const ITensor *_input; /** Source tensor */
+ ITensor *_output; /** Destination tensor */
+ ITensor *_hits; /** Hits tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */
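
A scalar sketch of the lookup semantics. It assumes the usual HashtableLookup behaviour where a missing key produces a zeroed output row and a 0 in @p hits; the header above does not spell out the miss case, so treat that as an assumption:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // For each lookups[i], find the same value in keys; copy the matching values row, else zeros.
    void hashtable_lookup(const std::vector<int> &lookups, const std::vector<int> &keys,
                          const std::vector<float> &values, std::size_t row_len,
                          std::vector<float> &out, std::vector<std::uint8_t> &hits)
    {
      out.assign(lookups.size() * row_len, 0.0f);
      hits.assign(lookups.size(), 0);
      for (std::size_t i = 0; i < lookups.size(); ++i)
        for (std::size_t k = 0; k < keys.size(); ++k)
          if (keys[k] == lookups[i])
          {
            for (std::size_t j = 0; j < row_len; ++j)
              out[i * row_len + j] = values[k * row_len + j];
            hits[i] = 1;
            break;
          }
    }
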
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 000000000..76e2587af
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for performing an instance normalization */
+class NEInstanceNormalizationLayerKernelEx : public INEKernel
+{
+public:
+ const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; }
+ /** Default constructor */
+ NEInstanceNormalizationLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEInstanceNormalizationLayerKernelEx &
+ operator=(const NEInstanceNormalizationLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ NEInstanceNormalizationLayerKernelEx &
+ operator=(NEInstanceNormalizationLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~NEInstanceNormalizationLayerKernelEx() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported:
+ * NCHW
+ * In case of @p output tensor = nullptr this tensor will store the result
+ * of the normalization.
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor.
+ * Defaults to nullptr (treated as a scale of 1.0)
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor.
+ * Defaults to nullptr (treated as an offset of 0.0)
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr, ITensor *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEInstanceNormalizationLayer.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported:
+ * NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor info applied to the normalized tensor. Defaults
+ * to nullptr (treated as a scale of 1.0)
+ * @param[in] beta (Optional) The offset tensor info applied to the normalized tensor.
+ * Defaults to nullptr (treated as an offset of 0.0)
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Common signature for all the specialized instance normalization functions
+ *
+ * @param[in, out] input An input tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * @param[out] output The output tensor.
+ * @param[in] gamma The scale tensor applied to the normalized tensor (may be nullptr).
+ * @param[in] beta The offset tensor applied to the normalized tensor (may be nullptr).
+ * @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12
+ */
+ using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon, const Window &window);
+
+ NormalizationFunction *_func;
+ ITensor *_input;
+ ITensor *_output;
+ ITensor *_gamma;
+ ITensor *_beta;
+ float _epsilon;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
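
The computation behind this interface is the standard instance-normalization formula, applied independently per sample and per channel over the spatial extent: out = gamma * (x - mean) / sqrt(var + epsilon) + beta. A scalar sketch for a single H*W plane with scalar gamma/beta (the tensor-valued gamma/beta handling of this kernel is omitted):

    #include <cmath>
    #include <vector>

    // Normalises one plane in place: x <- gamma * (x - mean) / sqrt(var + eps) + beta.
    void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float eps)
    {
      float mean = 0.f;
      for (float v : plane)
        mean += v;
      mean /= static_cast<float>(plane.size());

      float var = 0.f;
      for (float v : plane)
        var += (v - mean) * (v - mean);
      var /= static_cast<float>(plane.size());

      const float inv_std = 1.f / std::sqrt(var + eps);
      for (float &v : plane)
        v = gamma * (v - mean) * inv_std + beta;
    }
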
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
new file mode 100644
index 000000000..723b14523
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
+#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the multiply scale factor kernel. */
+class NEMultiplyScaleFactorKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEMultiplyScaleFactorKernel"; }
+ /** Default constructor */
+ NEMultiplyScaleFactorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete;
+ /** Default Move Constructor. */
+ NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default;
+ /** Default move assignment operator */
+ NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default;
+ /** Default destructor */
+ ~NEMultiplyScaleFactorKernel() = default;
+ /** Set input, output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: S32.
+ * @param[in] scale_factor Scale tensor. Data type supported: F16/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor.
+ * @param[in] multiplier (Optional) Extra scalar multiplier applied to the result. Defaults to 1.f.
+ */
+ void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output,
+ float multiplier = 1.f);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEMultiplyScaleFactorKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: S32.
+ * @param[in] scale_factor Scale tensor. Data type supported: F16/F32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor.
+ * @param[in] multiplier (Optional) Extra scalar multiplier applied to the result. Defaults to 1.f.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output, float multiplier = 1.f);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ template <typename T> void multiply(const Window &window);
+
+private:
+ const ITensor *_input;
+ const ITensor *_scale_factor;
+ ITensor *_output;
+ float _multiplier;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */
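
A sketch of what this kernel is understood to compute, under the assumption (not stated in the header) that @p scale_factor holds one scale per row of a 2-D @p input, as when dequantizing symmetrically quantized matrix products:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // out[r][c] = in[r][c] * scale_factor[r] * multiplier  (S32 accumulators -> float)
    std::vector<float> multiply_scale_factor(const std::vector<std::int32_t> &in, std::size_t rows,
                                             std::size_t cols, const std::vector<float> &scale_factor,
                                             float multiplier = 1.f)
    {
      std::vector<float> out(in.size());
      for (std::size_t r = 0; r < rows; ++r)
        for (std::size_t c = 0; c < cols; ++c)
          out[r * cols + c] = static_cast<float>(in[r * cols + c]) * scale_factor[r] * multiplier;
      return out;
    }
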
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
new file mode 100644
index 000000000..79bb78661
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__
+#define __ARM_COMPUTE_NEPRELUKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform Parametric Rectified Linear Unit
+ *
+ * Result is computed by:
+ * @f[ output(x) = \begin{cases} \alpha \cdot x, & x < 0 \\ x, & x \geq 0 \end{cases} @f]
+ */
+class NEPReLUKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEPReLUKernel"; }
+ /** Default constructor */
+ NEPReLUKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPReLUKernel(const NEPReLUKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPReLUKernel &operator=(const NEPReLUKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEPReLUKernel(NEPReLUKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEPReLUKernel &operator=(NEPReLUKernel &&) = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Input tensor. Data type supported: QASYMM8/F32
+ * @param[in] alpha Alpha tensor. Data types supported: Same as @p input
+ * @param[out] output Output tensor. Data types supported: Same as @p input
+ */
+ void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEPReLUKernel
+ *
+ * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32.
+ * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ *
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *alpha,
+ const ITensorInfo *output);
+ static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
+ const ITensorInfo &output);
+
+private:
+ const ITensor *_input; /**< Source tensor */
+ const ITensor *_alpha; /**< Alpha tensor */
+ ITensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */
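
A direct scalar reading of the formula above, with an element-wise alpha tensor as in configure() (broadcasting rules are omitted):

    #include <cstddef>
    #include <vector>

    // output(x) = x for x >= 0, alpha * x otherwise.
    std::vector<float> prelu(const std::vector<float> &x, const std::vector<float> &alpha)
    {
      std::vector<float> out(x.size());
      for (std::size_t i = 0; i < x.size(); ++i)
        out[i] = x[i] >= 0.f ? x[i] : alpha[i] * x[i];
      return out;
    }
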
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
new file mode 100644
index 000000000..590b23873
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
+#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the symmetric quantization layer kernel. */
+class NEQuantizationSymmetricKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEQuantizationSymmetricKernel"; }
+ /** Default constructor */
+ NEQuantizationSymmetricKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete;
+ /** Default Move Constructor. */
+ NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default;
+ /** Default move assignment operator */
+ NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default;
+ /** Default destructor */
+ ~NEQuantizationSymmetricKernel() = default;
+ /** Set input, output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[out] output Destination tensor with the same dimensions as @p input. Data type supported:
+ * S8.
+ * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output, ITensor *scale_factor);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEQuantizationSymmetricKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: F16/F32.
+ * @param[in] output Output tensor info. Data types supported: S8.
+ * @param[in] scale_factor Scale tensor info of @p output. Data type supported: Same as @p input.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ template <typename T> void quantize(const Window &window);
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ ITensor *_scale_factor;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */
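
For orientation, one common per-row symmetric quantization scheme that matches the interface above (an F32 row in, an S8 row plus one scale factor out). The exact rounding and clamping rules of this kernel may differ, so read this as an assumption:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // scale = max|x| / 127; q = clamp(round(x / scale), -127, 127); returns the row's scale.
    float quantize_symmetric_row(const std::vector<float> &row, std::vector<std::int8_t> &q)
    {
      float max_abs = 0.f;
      for (float v : row)
        max_abs = std::max(max_abs, std::fabs(v));
      const float scale = max_abs > 0.f ? max_abs / 127.f : 1.f;

      q.resize(row.size());
      for (std::size_t i = 0; i < row.size(); ++i)
      {
        const long v = std::lround(row[i] / scale);
        q[i] = static_cast<std::int8_t>(std::min<long>(127, std::max<long>(-127, v)));
      }
      return scale;
    }
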
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
new file mode 100644
index 000000000..73991b67d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__
+#define __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a reduction operation */
+class NEReductionOperationKernelEx : public INEKernel
+{
+public:
+ const char *name() const override { return "NEReductionOperationKernelEx"; }
+ /** Default constructor */
+ NEReductionOperationKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReductionOperationKernelEx(const NEReductionOperationKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReductionOperationKernelEx &operator=(const NEReductionOperationKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ NEReductionOperationKernelEx(NEReductionOperationKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ NEReductionOperationKernelEx &operator=(NEReductionOperationKernelEx &&) = default;
+ /** Default destructor */
+ ~NEReductionOperationKernelEx() = default;
+
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported:
+ * NCHW.
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0
+ * @param[in] op Reduction operation to perform.
+ */
+ void configure(const ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEReductionOperationKernelEx.
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts
+ * supported: NCHW.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0
+ * @param[in] op Reduction operation to perform.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
+ ReduceOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ unsigned int _reduction_axis;
+ ReduceOperation _op;
+ BorderSize _border_size;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ */
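
As a concrete reading of "reduce along axis 0": axis 0 is the innermost (x) dimension, so for a row-major 2-D tensor every row collapses to a single value while the number of dimensions is preserved. A scalar sketch for the MEAN case:

    #include <cstddef>
    #include <vector>

    // out[r] = mean of in[r][0..cols): the x dimension of each row is reduced away.
    std::vector<float> reduce_mean_axis0(const std::vector<float> &in, std::size_t rows,
                                         std::size_t cols)
    {
      std::vector<float> out(rows, 0.f);
      for (std::size_t r = 0; r < rows; ++r)
      {
        for (std::size_t c = 0; c < cols; ++c)
          out[r] += in[r * cols + c];
        out[r] /= static_cast<float>(cols);
      }
      return out;
    }
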
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
new file mode 100644
index 000000000..5d697c2b2
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
+#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the space to depth kernel */
+class NESpaceToDepthLayerKernelEx : public INEKernel
+{
+public:
+ const char *name() const override { return "NESpaceToDepthLayerKernelEx"; }
+ /** Default constructor */
+ NESpaceToDepthLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~NESpaceToDepthLayerKernelEx() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value
+ */
+ void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NESpaceToDepthLayerKernelEx
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h
new file mode 100644
index 000000000..3b0902f08
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_TYPESEX_H__
+#define __ARM_COMPUTE_TYPESEX_H__
+
+namespace arm_compute
+{
+
+/** Available ArgIndex operations **/
+enum class ArgOperation
+{
+ MAX,
+ MIN,
+};
+
+/** Available reduce operations */
+enum class ReduceOperation
+{
+ MAX, /**< Max */
+ MEAN, /**< Mean */
+ SUM, /**< Sum */
+ MIN, /**< Min */
+};
+
+/** Available binary logical operations */
+enum class BinaryLogicalOperation
+{
+ AND, /**< AND */
+ OR, /**< OR */
+};
+
+enum class ComparisonOperationEx
+{
+ EQUAL, /**< EQUAL */
+ NOT_EQUAL, /**< NOT_EQUAL */
+};
+
+enum class ElementWiseUnaryEx
+{
+ NEG, /**< NEG */
+};
+
+enum class SubDataType
+{
+ NONE,
+ BOOL,
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TYPESEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
new file mode 100644
index 000000000..39026e6bb
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_UTILSEX_H__
+#define __ARM_COMPUTE_UTILSEX_H__
+
+#include <utility>
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+
+/** Returns expected width and height of the transpose convolution's output tensor.
+ *
+ * @note This function was copied in order to fix a bug that computed wrong output dimensions.
+ *
+ * @param[in] in_width Width of input tensor (Number of columns)
+ * @param[in] in_height Height of input tensor (Number of rows)
+ * @param[in] kernel_width Kernel width.
+ * @param[in] kernel_height Kernel height.
+ * @param[in] info Padding and stride info.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_top The number of zeros added to the top edge of the output.
+ *
+ * @return A pair with the new width in the first position and the new height in the second.
+ */
+const std::pair<unsigned int, unsigned int>
+transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_top);
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_UTILSEX_H__ */
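
For orientation, the textbook transposed-convolution size relationship that this helper is expected to follow; the invalid_right/invalid_top trimming is specific to this function and is deliberately left out of the sketch, so treat the exact handling of those arguments as unspecified here:

    // out = (in - 1) * stride + kernel - pad_total   (per dimension, before any "invalid" trimming)
    inline unsigned int tconv_out_dim(unsigned int in, unsigned int stride, unsigned int kernel,
                                      unsigned int pad_total)
    {
      return (in - 1) * stride + kernel - pad_total;
    }
    // e.g. in = 4, stride = 2, kernel = 3, pad_total = 0  ->  out = 9
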
diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
new file mode 100644
index 000000000..16fd40ed9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace shape_calculator
+{
+
+/** Calculate the upsampled output shape used for transpose convolution
+ *
+ * @param[in] input Input tensor info
+ * @param[in] weights Weights tensor shape
+ * @param[in] info Padding and stride info
+ * @param[in] out_dims Output shape dimensions
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ * @param[out] pad_left Padding on left
+ * @param[out] pad_right Padding on right
+ * @param[out] pad_top Padding on top
+ * @param[out] pad_bottom Padding on bottom
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_transposeconv_upsampled_shape(
+ const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
+ std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
+ unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
+ unsigned int &pad_top, unsigned int &pad_bottom)
+{
+ unsigned int sx = info.stride().first;
+ unsigned int sy = info.stride().second;
+ const DataLayout data_layout = input.data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Find the upsampled dimensions
+ // transpose conv out:
+ // tconv_out + pad = 1 + (in - 1) * stride + invalid
+ // tconv_out = 1 + (in - 1) * stride + invalid - pad
+ // upsample out:
+ // upsample_out = 1 + (in - 1) * stride
+ unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1;
+ unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1;
+
+ // Find the padding needed for the convolution with stride 1 in order to match output shape
+ // upsample+pad out:
+ // upsample_out + pad = tconv_out + kernel - 1
+ // pad = tconv_out + kernel - 1 - upsample_out
+ unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
+ unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
+ out_x += padx;
+ out_y += pady;
+
+ unsigned int padx_all_except_invalid = padx + info.pad_left() + info.pad_right() - invalid_right;
+ unsigned int pady_all_except_invalid =
+ pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
+ pad_left = (padx_all_except_invalid + 1) / 2 - info.pad_left();
+ pad_right = padx_all_except_invalid / 2 - info.pad_right() + invalid_right;
+ pad_top = (pady_all_except_invalid + 1) / 2 - info.pad_top();
+ pad_bottom = pady_all_except_invalid / 2 - info.pad_bottom() + invalid_bottom;
+
+ TensorShape scale_out_shape(input.tensor_shape());
+ scale_out_shape.set(idx_w, out_x);
+ scale_out_shape.set(idx_h, out_y);
+
+ return scale_out_shape;
+}
+
+/** Calculate the output shape of the transpose convolution layer
+ *
+ * @param[in] out_dims Output x and y shape dimensions
+ * @param[in] input Input tensor info
+ * @param[in] weights Weights tensor shape
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
+ const ITensorInfo &input, const ITensorInfo &weights)
+{
+ const TensorShape input_shape{input.tensor_shape()};
+ const TensorShape weights_shape{weights.tensor_shape()};
+
+ const DataLayout data_layout = input.data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ TensorShape out_shape{input_shape};
+ out_shape.set(width_idx, out_dims.first);
+ out_shape.set(height_idx, out_dims.second);
+ out_shape.set(channel_idx, weights_shape[batch_idx]);
+ return out_shape;
+}
+
+/** Calculate the depth to space output shape of a tensor
+ *
+ * @param[in] input Input tensor info
+ * @param[in] block Block shape value
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block)
+{
+ ARM_COMPUTE_ERROR_ON(block < 2);
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ TensorShape output_shape{input->tensor_shape()};
+ output_shape.set(idx_width, input->dimension(idx_width) * block);
+ output_shape.set(idx_height, input->dimension(idx_height) * block);
+ output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block));
+
+ return output_shape;
+}
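// Illustrative example: with block = 2, an NHWC input of shape 1x2x2x8 maps to 1x4x4x2 --
// width and height grow by the block size while the channel count shrinks by block * block.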
+
+/** Calculate the space to depth output shape of a tensor
+ *
+ * @param[in] input Input tensor info
+ * @param[in] block_shape Block shape value
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape)
+{
+ ARM_COMPUTE_ERROR_ON(block_shape < 2);
+ TensorShape output_shape{input->tensor_shape()};
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_shape);
+ output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_shape);
+ output_shape.set(idx_depth, input->tensor_shape()[idx_depth] / (block_shape * block_shape));
+
+ return output_shape;
+}
+
+/** Calculate the gather output shape of a tensor
+ *
+ * @param[in] input_shape Input tensor shape
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] actual_axis The axis to be gathered
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
+ const TensorShape &indices_shape, uint32_t actual_axis)
+{
+ ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
+ ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
+
+ TensorShape output_shape = input_shape;
+ if (indices_shape.num_dimensions() == 1)
+ {
+ output_shape[actual_axis] = indices_shape[0];
+ }
+ else if (indices_shape.num_dimensions() > 1)
+ {
+ output_shape.shift_right(indices_shape.num_dimensions() - 1);
+
+ for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
+ {
+ if (o == actual_axis)
+ {
+ ++i;
+ for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
+ {
+ output_shape[o] = indices_shape[in];
+ }
+ }
+ else
+ {
+ output_shape[o] = input_shape[i];
+ }
+ }
+ }
+ return output_shape;
+}
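// Illustrative examples: a 1-D indices tensor of length 5 gathered along axis 0 of an input of
// shape (4, 3) yields an output of shape (5, 3); a 2-D indices tensor of shape (2, 6) gathered
// along axis 1 of an input of shape (3, 4, 5) replaces the axis dimension with the indices
// dimensions, yielding (3, 2, 6, 5).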
+
+} // namespace shape_calculator
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
new file mode 100644
index 000000000..831bb5423
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
+#define __ARM_COMPUTE_CLFUNCTIONSEX_H__
+
+#include <arm_compute/runtime/CL/functions/CLArgOperation.h>
+#include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h>
+#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
+#include <arm_compute/runtime/CL/functions/CLCast.h>
+#include <arm_compute/runtime/CL/functions/CLDepthToSpace.h>
+#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
+#include <arm_compute/runtime/CL/functions/CLGatherEx.h>
+#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
+#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
+#include <arm_compute/runtime/CL/functions/CLLogicalNot.h>
+#include <arm_compute/runtime/CL/functions/CLNeg.h>
+#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h>
+#include <arm_compute/runtime/CL/functions/CLPReLU.h>
+#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
+#include <arm_compute/runtime/CL/functions/CLRNNLayerEx.h>
+#include <arm_compute/runtime/CL/functions/CLSpaceToBatchND.h>
+#include <arm_compute/runtime/CL/functions/CLSpaceToDepth.h>
+#include <arm_compute/runtime/CL/functions/CLSplit.h>
+#include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h>
+#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
+#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
+
+#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
new file mode 100644
index 000000000..d9d0d4d35
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLArgOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLARGOPERATION_H__
+#define __ARM_COMPUTE_CLARGOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute CLArgOperation operation
+ */
+class CLArgOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLArgOperation object
+ */
+ CLArgOperation();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgOperation(const CLArgOperation &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgOperation &operator=(const CLArgOperation &) = delete;
+
+ /**
+ * @brief Construct a new CLArgOperation object by using move constructor
+ * @param[in] CLArgOperation object to move
+ */
+ CLArgOperation(CLArgOperation &&) = default;
+
+ /**
+ * @brief Assign a CLArgOperation object.
+ * @param[in] CLArgOperation object to assign. This object will be moved.
+ */
+ CLArgOperation &operator=(CLArgOperation &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[out] output The result of arg operation. Data types supported: S32.
+ * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] op Arg operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, ArgOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[out] output The result of arg operation. Data types supported: S32.
+ * @param[in] op Arg operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+ const ITensorInfo *output, ArgOperation op);
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input{nullptr};
+ ICLTensor *_output{nullptr};
+ std::vector<uint32_t> _axis{};
+ ArgOperation _arg_op{ArgOperation::MAX};
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLArgOperationKernel[]> _argop_kernels{nullptr};
+ size_t _num_of_kernels{0};
+};
+}
+#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */
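A minimal usage sketch for CLArgOperation (illustrative only, not taken from this patch; the tensor shapes, the reduced output shape and the data-filling step are assumptions):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLArgOperation.h"

#include <cstdint>
#include <vector>

void run_argmax_example()
{
  using namespace arm_compute;
  CLScheduler::get().default_init(); // create the OpenCL context and command queue

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
  // Assumed result shape: axis 1 is reduced away and the arg-max indices are stored as S32
  output.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::S32));

  CLArgOperation argmax;
  std::vector<uint32_t> axis{1};
  argmax.configure(&input, &output, axis, ArgOperation::MAX);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill `input` through input.map() / input.unmap() ...
  argmax.run();
}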
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
new file mode 100644
index 000000000..d16a0762d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBatchToSpaceNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges data from the batch dimension of the input into spatial blocks of the output.
+ */
+class CLBatchToSpaceND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size A pointer to an array of integer values specifying block sizes
+ * for spatial dimension.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
new file mode 100644
index 000000000..061e34f26
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLBinaryLogicalOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in]  input1 Source tensor1. Data types supported: U8, QASYMM8.
+ * @param[in]  input2 Source tensor2. Data types supported: U8, QASYMM8.
+ * @param[out] output Output tensor. Data types supported: U8, QASYMM8.
+ * @param[in]  op     Binary logical operation to perform.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
new file mode 100644
index 000000000..36acfaed7
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLCast.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLCast class
+ */
+
+#ifndef __ARM_COMPUTE_CLCAST_H__
+#define __ARM_COMPUTE_CLCAST_H__
+
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLCastKernel.
+ * This converts the input tensor to the tensor of the output tensor's type.
+ */
+class CLCast : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's input and output
+ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] input_subtype Sub data type of input.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
+};
+}
+#endif /* __ARM_COMPUTE_CLCAST_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
new file mode 100644
index 000000000..d78a6ada4
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLDepthToSpaceKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges data from the depth (channel) dimension of the input into spatial blocks of the output.
+ */
+class CLDepthToSpace : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size (integer only)
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..257772a89
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in]  lookups Lookups 1D tensor whose values are indices into the first dimension of
+ *                     @p input.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+};
+}
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
new file mode 100644
index 000000000..fd0a65f20
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        CLFullyConnectedReshapingLayer.h
+ * @brief       This file contains CLFullyConnectedReshapingLayer class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
+#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
+
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h>
+#include <arm_compute/runtime/IMemoryManager.h>
+
+namespace arm_compute
+{
+/**
+ * @brief Class to run FullyConnected Layer after reshaping input tensor
+ */
+class CLFullyConnectedReshapingLayer : public arm_compute::IFunction
+{
+public:
+ CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+ : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
+ _cl_fc{memory_manager}, _cl_reshape{}, _needs_reshape(false)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] weights The tensor that is filled with weight values
+ * @param[in] biases The tensor that is filled with bias values
+ * @param[in] output The destination tensor
+ * @param[in] needs_reshape Whether it needs to be reshaped or not
+ * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true.
+ * @return N/A
+ */
+ void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights,
+ const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output,
+ bool needs_reshape, const arm_compute::TensorShape &reshape);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+ /**
+ * @brief Prepare the operation
+ * @return N/A
+ */
+ void prepare(void) override;
+
+private:
+ const arm_compute::ICLTensor *_input;
+ const arm_compute::ICLTensor *_weights;
+ const arm_compute::ICLTensor *_biases;
+ arm_compute::ICLTensor *_output;
+
+ // buffer for reshaping input tensor
+ arm_compute::CLTensor _cl_buffer;
+
+private:
+ arm_compute::CLFullyConnectedLayer _cl_fc;
+ // TODO Change to CLReshapeLayer
+ arm_compute::misc::GenericReshapeLayer _cl_reshape;
+ bool _needs_reshape;
+};
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
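A hedged configuration sketch for CLFullyConnectedReshapingLayer; the tensors are assumed to be already initialised and the 3136-element flattened shape is an illustrative assumption (CL runtime setup as in the CLArgOperation sketch above):

#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"

void run_fc_reshape_example(arm_compute::ICLTensor *ifm, arm_compute::ICLTensor *weights,
                            arm_compute::ICLTensor *biases, arm_compute::ICLTensor *ofm)
{
  using namespace arm_compute;
  CLFullyConnectedReshapingLayer fc; // no external memory manager
  // Flatten an assumed 7x7x64 feature map into 3136 elements before the fully connected layer
  fc.configure(ifm, weights, biases, ofm, /*needs_reshape=*/true, TensorShape(3136U));
  fc.run(); // run() is expected to trigger prepare() on first use, per the usual IFunction pattern
}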
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
new file mode 100644
index 000000000..04d227aa7
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLGatherEx.h
+ * @brief This file contains CLGatherEx class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLGATHEREX_H__
+#define __ARM_COMPUTE_CLGATHEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLGatherKernel.
+ */
+class CLGatherEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs, output and conversion policy.
+ * @param[in]  input   An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in]  indices An indices tensor. Data types supported: S32.
+ * @param[out] output  The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * of @ref CLGatherEx
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in]  indices An indices tensor. Data types supported: S32.
+ * @param[out] output  The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */
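A hedged usage sketch for CLGatherEx; the shapes are illustrative, and validate() is called before configure() as a sanity check:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGatherEx.h"

void run_gather_example()
{
  using namespace arm_compute;
  CLTensor input, indices, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
  indices.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::S32));
  output.allocator()->init(TensorInfo(TensorShape(5U, 3U), 1, DataType::F32)); // axis 0 replaced by 5 indices

  ARM_COMPUTE_ERROR_THROW_ON(CLGatherEx::validate(input.info(), indices.info(), output.info(), 0));

  CLGatherEx gather;
  gather.configure(&input, &indices, &output, /*axis=*/0);
  // allocate and fill the tensors, then:
  gather.run();
}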
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
new file mode 100644
index 000000000..65aa6cbd5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class CLHashtableLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in]  lookups Lookups 1D tensor whose values are indices into the first dimension of
+ *                     @p input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+};
+}
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
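A hedged usage sketch for CLHashtableLookup; the shapes and the interpretation of the values table's first dimension are assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"

void run_hashtable_lookup_example()
{
  using namespace arm_compute;
  CLTensor lookups, keys, values, output, hits;
  lookups.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));     // keys to look up
  keys.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::S32));       // map keys
  values.allocator()->init(TensorInfo(TensorShape(8U, 10U), 1, DataType::F32)); // one 8-element row per key (assumed layout)
  output.allocator()->init(TensorInfo(TensorShape(8U, 3U), 1, DataType::F32));
  hits.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U8));

  CLHashtableLookup lookup;
  lookup.configure(&lookups, &keys, &values, &output, &hits);
  // allocate and fill the tensors, then lookup.run();
}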
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
new file mode 100644
index 000000000..ed29db925
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to perform an Instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref CLInstanceNormalizationLayerKernelEx
+ */
+class CLInstanceNormalizationLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Default constructor */
+ CLInstanceNormalizationLayerEx();
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+ ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLInstanceNormalizationLayerEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported:
+ * NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
new file mode 100644
index 000000000..4bf203c5a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__
+#define __ARM_COMPUTE_CLLOGICALNOT_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLLogicalNot : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8.
+ * @param[out] output Output tensor. Data types supported: QASYMM8.
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
new file mode 100644
index 000000000..198a0fd4e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEG_H__
+#define __ARM_COMPUTE_CLNEG_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLNeg : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEG_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
new file mode 100644
index 000000000..622a61b5e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_H__
+#define __ARM_COMPUTE_CLPRELU_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLPReLU : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[in]  alpha  Alpha (slope) tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
new file mode 100644
index 000000000..b142d3a2e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLPixelWiseDivision.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLPixelWiseDivision class
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLPixelWiseDivisionKernel.
+ */
+class CLPixelWiseDivision : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs, output and convertion policy.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ * Note: U8 requires both inputs to be U8.
+ * @param[in] scale Scale to apply after the division.
+ * Scale must be positive and its value must be either 1/255 or
+ * 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
+ * even.
+ * @return N/A
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
+ ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLPixelWiseDivision
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info, Data types supported: same as @p input1.
+ * Note: U8 requires both inputs to be U8.
+ * @param[in] scale Scale to apply after the division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale = 1.f,
+ ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+};
+}
+#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
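A hedged usage sketch for CLPixelWiseDivision with saturation and round-to-nearest-even (shapes are illustrative):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"

void run_division_example()
{
  using namespace arm_compute;
  CLTensor numerator, denominator, quotient;
  const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
  numerator.allocator()->init(info);
  denominator.allocator()->init(info);
  quotient.allocator()->init(info);

  CLPixelWiseDivision div;
  div.configure(&numerator, &denominator, &quotient, /*scale=*/1.f, ConvertPolicy::SATURATE,
                RoundingPolicy::TO_NEAREST_EVEN);
  // allocate and fill the tensors, then div.run();
}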
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
new file mode 100644
index 000000000..7e88cb369
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__
+#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__
+
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLRNNLayerEx */
+class CLRNNLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Initialize the function
+ *
+ * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+ * types supported: F16/F32
+ * @param[in] weights Weights tensor of shape [input_size, num_units] that
+ * multiplies the input. Data types supported: Same as @p input
+ * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
+ * the current 'state'. Data types supported: Same as @p input
+ * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
+ * as @p input
+ * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in] info Activation layer parameter.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state,
+ ICLTensor *output, ActivationLayerInfo &info);
+ /** Initialize the function
+ *
+ * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+ * types supported: F16/F32
+ * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
+ * the input. Data types supported: Same as @p input
+ * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
+ * current 'state'. Data types supported: Same as @p input
+ * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
+ * input
+ * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in] info Activation layer parameter.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+ const ITensorInfo *hidden_state, const ITensorInfo *output,
+ const ActivationLayerInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ CLGEMM _gemm_state_f;
+ CLSaturatedArithmeticOperationKernel _add_kernel;
+ CLActivationLayerKernel _activation_kernel;
+ CLFullyConnectedLayer _fully_connected_kernel;
+ CLCopyKernel _copy_kernel;
+ CLTensor _fully_connected_out;
+ CLTensor _gemm_output;
+ CLTensor _add_output;
+ bool _is_prepared;
+};
+}
+#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
new file mode 100644
index 000000000..1d367d56b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLReduceOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform ReduceOperation
+ */
+class CLReduceOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new ReduceOperation object
+ */
+ CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager);
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
+ bool keep_dims, ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperation.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, bool keep_dims, const ReduceOperation &op);
+
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::set<uint32_t> _axis;
+ bool _keep_dims;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
+ CLReshapeLayer _reshape;
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
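A hedged usage sketch for CLReduceOperation; the reduced output shape and the ReduceOperation enumerator name are assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

#include <cstdint>
#include <set>

void run_reduce_example()
{
  using namespace arm_compute;
  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 8U, 4U), 1, DataType::F32));
  // Assumed output shape when axis 1 is reduced with keep_dims == false
  output.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

  CLReduceOperation reduce(nullptr); // no external memory manager
  reduce.configure(&input, &output, std::set<uint32_t>{1}, /*keep_dims=*/false,
                   ReduceOperation::MAX); // enumerator name assumed from TypesEx
  // allocate and fill the tensors, then reduce.run();
}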
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
new file mode 100644
index 000000000..7e2df8986
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToBatchNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32.
+ * @note The function divides the "spatial" dimensions of the input into a grid of blocks of shape
+ *       block_shape, and interleaves these blocks with the "batch" dimension, so that positions
+ *       within each spatial block are folded into the batch dimension of the output.
+ */
+class CLSpaceToBatchND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Tensor of integer values specifying block sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[out] output Output tensor. Data types supported: same as @p input.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+ ICLTensor *output);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
new file mode 100644
index 000000000..17f762092
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToDepthKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges spatial blocks of the input into the depth (channel) dimension of the output.
+ */
+class CLSpaceToDepth : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size (integer only)
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
new file mode 100644
index 000000000..6b26a85c8
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLStridedSliceEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLStridedSliceEx class
+ */
+
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLStridedSliceKernel
+ */
+class CLStridedSliceEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs and outputs
+ * @param[in] input Tensor input. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] beginData 'begin' vector of strided slice operation
+ * @param[in] endData 'end' vector of strided slice operation
+ * @param[in] stridesData 'strides' vector of strided slice operation
+ * @param[in] beginMask If the ith bit is set, begin[i] is ignored
+ * @param[in] endMask If the ith bit is set, end[i] is ignored
+ * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
+ * dimensionality by 1, taking on the value at index begin[i]
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+};
+}
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
new file mode 100644
index 000000000..20c749e0b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLTopKV2.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLTopKV2 class
+ */
+#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
+#define __ARM_COMPUTE_CLTOPK_V2_H__
+
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute TopKV2 operation.
+ */
+class CLTopKV2 : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLTopKV2 object
+ */
+ CLTopKV2();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2 &operator=(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Construct a new CLTopKV2 object by using copy constructor
+ * @param[in] CLTopKV2 object to move
+ */
+ CLTopKV2(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Assign a CLTopKV2 object.
+ * @param[in] CLTopKV2 object to assign. This object will be moved.
+ */
+ CLTopKV2 &operator=(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input image. Data types supported: U8/S16/F32.
+ * @param[in] k The value of `k`.
+ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
+ * input type is F32.
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
+ * is U8/S16, F32 if input type is F32.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits = 32, int bits = 4);
+
+ /**
+ * @brief Run the kernels contained in the function
+   * Depending on the value of the ACL_TOPKV2 environment variable it behaves differently:
+   * - If ACL_TOPKV2 == "GPU_SINGLE", quick sort on the GPU is used.
+   * - If ACL_TOPKV2 == "GPU", radix sort on the GPU is used.
+   * - For any other value, TopKV2 runs on the CPU.
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ void run_on_cpu();
+ void run_on_gpu();
+ void run_on_gpu_single_quicksort();
+
+ uint32_t _k;
+ uint32_t _total_bits;
+ uint32_t _bits;
+ uint32_t _radix;
+ uint32_t _hist_buf_size;
+ uint32_t _glob_sum_buf_size;
+ uint32_t _n;
+
+ ICLTensor *_input;
+ ICLTensor *_values;
+ ICLTensor *_indices;
+
+ cl::Buffer _qs_idx_buf;
+ cl::Buffer _qs_temp_buf;
+ cl::Buffer _hist_buf;
+ cl::Buffer _glob_sum_buf;
+ cl::Buffer _temp_buf;
+ cl::Buffer _first_negative_idx_buf;
+ cl::Buffer _in_key_buf;
+ cl::Buffer _out_key_buf;
+ cl::Buffer _in_ind_buf;
+ cl::Buffer _out_ind_buf;
+
+ cl::Buffer *_p_in_key_buf;
+ cl::Buffer *_p_out_key_buf;
+ cl::Buffer *_p_in_ind_buf;
+ cl::Buffer *_p_out_ind_buf;
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+// Invalid result on GPU
+#if 0
+ CLTopKV2Single _qs_kernel;
+ CLTopKV2Init _init_kernel;
+ CLRadixSortHistogram _hist_kernel;
+ CLRadixSortScanHistogram _scan_hist_kernel;
+ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
+ CLRadixSortPasteHistogram _paste_hist_kernel;
+ CLRadixSortReorder _reorder_kernel;
+ CLTopKV2FindFirstNegative _find_first_negative_kernel;
+ CLTopKV2ReorderNegatives _reorder_negatives_kernel;
+ CLTopKV2Store _store_kernel;
+#endif
+};
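+
+// A usage sketch (illustrative; assumes the CL scheduler is initialized and input/values/indices
+// are allocated CLTensor objects; the sorting backend is selected via the ACL_TOPKV2 variable):
+//   arm_compute::CLTopKV2 topk;
+//   topk.configure(&input, /*k*/ 5, &values, &indices);
+//   topk.run();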
+}
+#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
new file mode 100644
index 000000000..340a7bfe9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Function to run the transpose convolution layer.
+ *
+ * @note This layer was copied in order to fix a bug that produced wrong output dimensions.
+ *
+ * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1 convolution pass. The input stride
+ * defines how many zeroes are inserted between adjacent elements of the input, pad is the amount
+ * of padding, and a is a user-specified value with a < stride - 1 that increases the padding at
+ * the top and right of the input image.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where:
+ * width_input is the size of the first input dimension.
+ * height_input is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimensions.
+ *
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref
+ * CPPFlipWeightsKernel.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLTransposeConvLayerUpsample
+ * -# @ref CLConvolutionLayer
+ *
+ */
+class CLTransposeConvLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayer(const CLTransposeConvLayer &) = delete;
+ /** Default move constructor */
+ CLTransposeConvLayer(CLTransposeConvLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete;
+ /** Default move assignment operator */
+ CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default;
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+ * Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions
+ * as the @p input.
+ * @param[in] info Contains padding and policies to be used in the
+   * transpose convolution, this is described in @ref PadStrideInfo.
+   * @param[in] invalid_right The number of zeros added to the right edge of the output.
+   * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been
+ * reshaped with @ref CLWeightsReshapeKernel.
+ */
+ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+ * Same as @p input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions
+ * as the @p input.
+ * @param[in] info Contains padding and policies to be used in the
+   * transpose convolution, this is described in @ref PadStrideInfo.
+   * @param[in] invalid_right The number of zeros added to the right edge of the output.
+   * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                         unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ CLTransposeConvLayerUpsample _scale_f;
+ CLConvolutionLayer _conv_f;
+ CPPFlipWeightsKernel _flip_weights;
+ CLTensor _scaled_output;
+ ICLTensor *_original_weights;
+ CLTensor _weights_flipped;
+ bool _is_prepared;
+};
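+
+// A configuration sketch (illustrative; tensor allocation and shape setup are omitted, and the
+// stride/pad values are placeholders):
+//   arm_compute::CLTransposeConvLayer deconv;
+//   deconv.configure(&input, &weights, &bias, &output, PadStrideInfo(2, 2, 1, 1),
+//                    /*invalid_right*/ 0, /*invalid_bottom*/ 0);
+//   deconv.run();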
+}
+#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
new file mode 100644
index 000000000..4ae0e1830
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */
+class CLTransposeConvLayerUpsample : public IFunction
+{
+public:
+ /** Default constructor */
+ CLTransposeConvLayerUpsample();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default;
+ /** Default destructor */
+ virtual ~CLTransposeConvLayerUpsample() = default;
+
+ /** Initialize the function's source, destination, interpolation type and border_mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] inner_border The number of zeros added to right and top edges of the input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+ const PadStrideInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLTransposeConvLayerUpsample
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] inner_border The number of zeros added to right and top edges of the input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const BorderSize &inner_border, const PadStrideInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLTransposeConvLayerUpsampleKernel _upsample;
+ ICLTensor *_output;
+};
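+
+// A usage sketch (illustrative; this function is normally driven by CLTransposeConvLayer rather
+// than called directly, and the BorderSize/PadStrideInfo values below are placeholders):
+//   arm_compute::CLTransposeConvLayerUpsample upsample;
+//   upsample.configure(&input, &scaled_output, BorderSize(0, 0), PadStrideInfo(2, 2, 1, 1));
+//   upsample.run();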
+}
+#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
new file mode 100644
index 000000000..8e7e2f937
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
+#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
+
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref CPPUpsample */
+class CPPUpsampleEx : public ICPPSimpleFunction
+{
+public:
+ /** Configure the upsample CPP kernel
+ *
+ * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] info Padding information
+ */
+ void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
+};
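+
+// A usage sketch (illustrative; input/output are allocated Tensor objects and the PadStrideInfo
+// values are placeholders):
+//   arm_compute::CPPUpsampleEx upsample;
+//   upsample.configure(&input, &output, PadStrideInfo(2, 2, 1, 1));
+//   upsample.run();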
+}
+#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
new file mode 100644
index 000000000..37bccc52c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__
+#define __ARM_COMPUTE_NEFUNCTIONSEX_H__
+
+#include <arm_compute/runtime/NEON/functions/NEArgMinMax.h>
+#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
+#include <arm_compute/runtime/NEON/functions/NECast.h>
+#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
+#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
+#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEPReLU.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
+#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
+#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
+
+#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h
new file mode 100644
index 000000000..604cd93c4
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__
+#define __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce min/max operation */
+template <ReductionOperation op> class NEArgMinMaxStatic : public IFunction
+{
+public:
+ /** Constructor */
+ NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] axis Reduction axis.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ */
+ void configure(ITensor *input, int axis, ITensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMaxStatic
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] axis Reduction axis.
+ * @param[in] output Destination tensor. Data type supported: Same as @p input
+ *
+ * @return A status
+ */
+ static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEArgMinMaxLayer _reduction_kernel;
+ Tensor _reduced_out;
+ NEReshapeLayer _reshape;
+};
+
+/** Basic function to run arg max. */
+using NEArgMax = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>;
+/** Basic function to run arg min. */
+using NEArgMin = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>;
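+
+// A usage sketch (illustrative; input/output are allocated Tensor objects and axis 1 is a
+// placeholder):
+//   arm_compute::NEArgMax argmax;
+//   argmax.configure(&input, /*axis*/ 1, &output);
+//   argmax.run();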
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
new file mode 100644
index 000000000..2a624656d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBinaryLogicalOperationKernel.
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8.
+ * @note The function performs a binary logical operation between two tensors.
+ */
+class NEBinaryLogicalOperation : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8.
+ * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ * @param[in] op Binary Logical Operation to be performed.
+ */
+ void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ * @param[in] op Binary Logical Operation to be performed.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, BinaryLogicalOperation op);
+};
+
+/** Basic function to run @ref NEBinaryLogicalOperationKernel
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8.
+ * @note The function performs a binary logical operation between two tensors.
+ */
+template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8
+ * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(ITensor *input1, ITensor *input2, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+};
+
+/** Basic function to run logical AND. */
+using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+/** Basic function to run logical OR. */
+using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
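+
+// A usage sketch (illustrative; all tensors are allocated U8/QASYMM8 Tensor objects):
+//   arm_compute::NELogicalAnd logical_and;
+//   logical_and.configure(&input1, &input2, &output);
+//   logical_and.run();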
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
new file mode 100644
index 000000000..ae2f57f19
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECAST_H__
+#define __ARM_COMPUTE_NECAST_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */
+class NECast : public INESimpleFunctionNoBorder
+{
+public:
+ /** Configure the kernel.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+   * @param[out] output Destination tensor with the same dimensions as the input. Data type supported:
+ * U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] input_subtype Sub data type of input.
+ */
+ void configure(const ITensor *input, ITensor *output,
+ SubDataType input_subtype = SubDataType::NONE);
+ /** Static function to check if given info will lead to a valid configuration of @ref NECast
+ *
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] input_subtype Sub data type of input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype = SubDataType::NONE);
+};
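+
+// A usage sketch (illustrative; casts an allocated U8 tensor into an allocated F32 tensor):
+//   arm_compute::NECast cast;
+//   cast.configure(&input_u8, &output_f32, SubDataType::NONE);
+//   cast.run();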
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NECAST_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
new file mode 100644
index 000000000..90c0751b8
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
+#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */
+class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value.
+ */
+ void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEDepthToSpaceLayerEx.
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+   * @param[in] block_shape Block shape value.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+};
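+
+// A usage sketch (illustrative; a block_shape of 2 rearranges depth into 2x2 spatial blocks):
+//   arm_compute::NEDepthToSpaceLayerEx d2s;
+//   d2s.configure(&input, &output, /*block_shape*/ 2);
+//   d2s.run();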
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
new file mode 100644
index 000000000..f0c8ecdb5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
+#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform negative on an input tensor. */
+class NENegLayer : public INESimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input Input tensor. Data types supported: F16/F32/S32.
+ * @param[out] output Output tensor. Data types supported: same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+  /** Static function to check if given info will lead to a valid configuration of @ref NENegLayer
+ *
+ * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
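+
+// A usage sketch (illustrative; input/output are allocated F32 Tensor objects):
+//   arm_compute::NENegLayer neg;
+//   neg.configure(&input, &output);
+//   neg.run();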
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
new file mode 100644
index 000000000..0646f1668
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NEEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::NEEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class NEEmbeddingLookup : public INESimpleFunctionNoBorder
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+   * @param[in] lookups Lookups 1D tensor containing indices into the first dimension of
+   * input. Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
+  /** Static function to check if given info will lead to a valid configuration of @ref NEEmbeddingLookup
+ *
+ * @param[in] input Source tensor info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+   * @param[in] lookups Lookups tensor info. Data types supported: S32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+};
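+
+// A usage sketch (illustrative; lookups is an allocated 1D S32 tensor of row indices into input):
+//   arm_compute::NEEmbeddingLookup lookup;
+//   lookup.configure(&input, &output, &lookups);
+//   lookup.run();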
+}
+#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
new file mode 100644
index 000000000..42a786821
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls
+ * the following kernels:
+ *
+ * -# @ref NETransposeKernel
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedHybridLayerReshapeWeights
+ *
+ * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
+ * NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
+ * and transpose_weights is set to true ) (called once)
+ * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
+ * asymmetric)
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
+ * not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedHybridLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete;
+ /** Default move constructor */
+ NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete;
+ /** Default move assignment operator */
+ NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+   * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
+ ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedHybridLayer
+ *
+ * @param[in] input Source tensor info. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+   * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
+
+ MemoryGroup _memory_group;
+ NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
+ NEQuantizationSymmetricKernel _quant_input_kernel;
+ NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
+ NEMultiplyScaleFactorKernel _multiply_scale_kernel;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _reshape_weights_output;
+ Tensor _quantized_input;
+ Tensor _scale_factor;
+ Tensor _gemmlowp_output;
+ const ITensor *_original_weights;
+ bool _are_weights_reshaped;
+ bool _accumulate_biases;
+ bool _is_prepared;
+};
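+
+// A usage sketch (illustrative; weights is an S8 symmetric-quantized tensor while input/output
+// are F32, matching the data types documented above):
+//   arm_compute::NEFullyConnectedHybridLayer fc;
+//   fc.configure(&input, &weights_s8, &bias, &output);
+//   fc.run(); // weight reshaping happens once during prepare()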
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
new file mode 100644
index 000000000..6bd67f322
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
+ * NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and
+ * transpose_weights is set to true ) (called once)
+ * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
+ * asymmetric)
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
+ * not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ * @note The difference from NEFullyConnectedLayer is that this class supports weights as input
+ * with performance loss.
+ */
+class NEFullyConnectedLayerEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete;
+ /** Default move constructor */
+ NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete;
+ /** Default move assignment operator */
+ NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+   * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
+ ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedLayerEx
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+   * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
+
+ MemoryGroup _memory_group;
+ NEFlattenLayerKernel _flatten_kernel;
+ NEConvertFullyConnectedWeights _convert_weights;
+ NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
+ NEGEMM _mm_gemm;
+ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _flatten_output;
+ Tensor _gemmlowp_output;
+ Tensor _converted_weights_output;
+ Tensor _reshape_weights_output;
+ const ITensor *_original_weights;
+ bool _are_weights_converted;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _accumulate_biases;
+ bool _is_quantized;
+ bool _is_prepared;
+};
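+
+// A usage sketch (illustrative; unlike NEFullyConnectedLayer, weights here may be supplied as a
+// runtime input, at the documented performance cost):
+//   arm_compute::NEFullyConnectedLayerEx fc;
+//   fc.configure(&input, &weights, &bias, &output);
+//   fc.run();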
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
new file mode 100644
index 000000000..18cb61bf9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        NEFullyConnectedReshapingLayer.h
+ * @brief       This file contains NEFullyConnectedReshapingLayer class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+#include <arm_compute/runtime/IMemoryManager.h>
+#include <arm_compute/runtime/Tensor.h>
+
+namespace arm_compute
+{
+/**
+ * @brief Class to run FullyConnected Layer after reshaping input tensor
+ */
+class NEFullyConnectedReshapingLayer : public arm_compute::IFunction
+{
+public:
+ enum class KernelType
+ {
+    GENERAL,             ///< General FC
+    PREPROCESSED_WEIGHTS ///< Weights are constants so they can be preprocessed
+ };
+
+public:
+ NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+ : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
+ _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] weights The tensor that is filled with weight values
+   * @param[in] biases The tensor that is filled with bias values
+   * @param[out] output The destination tensor
+   * @param[in] needs_reshape Whether the input needs to be reshaped or not
+   * @param[in] reshape The shape the input is reshaped to. Only valid when needs_reshape is true.
+   * @param[in] kernel_type The kernel type for the actual FullyConnected layer
+ * @return N/A
+ */
+ void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
+ const arm_compute::ITensor *biases, arm_compute::ITensor *output,
+ bool needs_reshape, const arm_compute::TensorShape &reshape,
+ KernelType kernel_type);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+ /**
+ * @brief Prepare the operation
+ * @return N/A
+ */
+ void prepare(void) override;
+
+private:
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ const arm_compute::ITensor *_input;
+ const arm_compute::ITensor *_weights;
+ const arm_compute::ITensor *_biases;
+ arm_compute::ITensor *_output;
+
+ // buffer for reshaping input tensor
+ arm_compute::Tensor _neon_buffer;
+
+private:
+ std::unique_ptr<arm_compute::IFunction> _neon_fc;
+ NEReshapeLayer _neon_reshape;
+ bool _needs_reshape;
+};
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
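
For reference, a minimal usage sketch of this class (not part of the patch; the tensor shapes, the 12x1 reshape target and the GENERAL kernel type are illustrative assumptions), following the usual init/configure/allocate/run sequence:

#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
#include <arm_compute/runtime/Tensor.h>

void example_fc_reshaping()
{
  using namespace arm_compute;

  // 4D activation (W=2, H=2, C=3, N=1) is flattened to a 12x1 vector before the FC layer
  Tensor input, weights, biases, output;
  input.allocator()->init(TensorInfo(TensorShape(2U, 2U, 3U, 1U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(12U, 5U), 1, DataType::F32)); // assumed [input_size, num_units]
  biases.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(5U, 1U), 1, DataType::F32));

  NEFullyConnectedReshapingLayer fc;
  fc.configure(&input, &weights, &biases, &output,
               /*needs_reshape=*/true, TensorShape(12U, 1U),
               NEFullyConnectedReshapingLayer::KernelType::GENERAL);

  input.allocator()->allocate();
  weights.allocator()->allocate();
  biases.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input, weights and biases ...
  fc.run();
}
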
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
new file mode 100644
index 000000000..414b9f7d9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following
+ * NEON kernels if the DOT product instruction is not available:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel
+ * -# @ref NEGEMMTranspose1xWKernel
+ * -# @ref NEGEMMLowpMatrixMultiplyKernel
+ * -# @ref NEGEMMLowpOffsetContributionKernel
+ * -# @ref NEActivationLayer
+ *
+ * otherwise if the DOT product instruction is available:
+ *
+ * -# @ref NEGEMMLowpOffsetContributionKernel
+ *
+*/
+class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
+ /** Default move constructor */
+ NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
+ /** Default move assignment operator */
+ NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
+ /** Initialise the kernel's inputs, output
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ * This kernel performs the following computations:
+ *
+ * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+ * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ *
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
+ * QASYMM8/QASYMM8_SIGNED otherwise
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported:
+ * QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
+ * S32
+ * @param[out] output Output tensor. Data type supported:
+ * S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+ * and if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGEMMLowpMatrixMultiplyCoreEx
+ *
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
+ * QASYMM8/QASYMM8_SIGNED otherwise
+ *
+ * @param[in] a First input tensor info (Matrix A). Data type supported:
+ * QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
+ * supported: S32
+ * @param[in] output Output tensor info. Data type supported:
+ * S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+ * and if the reshape of matrix B should be executed only for the first run
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
+ const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEGEMMAssemblyDispatch _asm_glue;
+ std::unique_ptr<INEKernel> _mm_kernel;
+ std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
+ std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
+ NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+ NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+ NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
+ NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
+ // NEActivationLayer _activation_func;
+
+ Tensor _vector_sum_col;
+ Tensor _vector_sum_row;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _mm_result_s32;
+ Tensor _signed_a;
+ Tensor _signed_output;
+ const ITensor *_original_b;
+ int32_t _a_offset;
+ int32_t _b_offset;
+
+ bool _run_vector_matrix_multiplication;
+ bool _assembly_path;
+ bool _fused_assembly_path;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _fuse_output_stage;
+ bool _run_activation;
+ bool _flip_signedness;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
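
A minimal configure/run sketch for the function above (not part of the patch). It keeps the default GEMMInfo, so the output stays S32 as documented; the matrix shapes and quantization parameters are illustrative assumptions:

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
#include "arm_compute/runtime/Tensor.h"

void example_gemmlowp_ex()
{
  using namespace arm_compute;

  // A: MxK = 2x8, B: KxN = 8x4, output: MxN = 2x4 (ACL shapes are (x, y) = (cols, rows))
  Tensor a, b, dst;
  a.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
  b.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
  dst.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::S32));

  NEGEMMLowpMatrixMultiplyCoreEx gemm;
  gemm.configure(&a, &b, nullptr /* no matrix C */, &dst); // default GEMMInfo -> S32 output

  a.allocator()->allocate();
  b.allocator()->allocate();
  dst.allocator()->allocate();
  // ... fill a and b with quantized data ...
  gemm.run();
}
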
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
new file mode 100644
index 000000000..d95e6a81e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHEREX_H__
+#define __ARM_COMPUTE_NEGATHEREX_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEGatherKernelEx */
+class NEGatherEx : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ */
+ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGatherKernelEx
+ *
+ * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis);
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */
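
A minimal usage sketch (not part of the patch; shapes are illustrative), gathering along axis 0 of a 2-D tensor so that the gathered dimension is replaced by the indices length:

#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/runtime/Tensor.h"

void example_gather_ex()
{
  using namespace arm_compute;

  Tensor input, indices, output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
  // 4 entries gathered along axis 0 -> output shape (4, 8)
  output.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32));

  NEGatherEx gather;
  gather.configure(&input, &indices, &output, /*axis=*/0);

  input.allocator()->allocate();
  indices.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input; each index value must lie in [0, 16) ...
  gather.run();
}
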
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
new file mode 100644
index 000000000..69abf0192
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NEHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::NEHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class NEHashtableLookup : public INESimpleFunctionNoBorder
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input. Data types supported: S32
+ * @param[in] keys Keys 1D tensor. The keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
+ ITensor *hits);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEHashtableLookup
+ *
+ * @param[in] lookups Lookups 1D tensor info.
+ * Data types supported: S32
+ * @param[in] keys Keys 1D tensor info. The keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] hits Hits 1D tensor info. A boolean tensor that indicates whether the lookup
+ * hits (True) or not (False). Data types supported: U8/QASYMM8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */
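
A sketch of how this function might be wired up (not part of the patch). Only the data types follow the documentation above; the dimension ordering of the value table and the shapes are assumptions:

#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
#include "arm_compute/runtime/Tensor.h"

void example_hashtable_lookup()
{
  using namespace arm_compute;

  Tensor lookups, keys, input, output, hits;
  lookups.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));    // 2 queried keys
  keys.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));       // 3 stored keys
  input.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));  // one 4-element value per key (assumed layout)
  output.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32)); // one value per query
  hits.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U8));        // hit/miss flag per query

  NEHashtableLookup lookup;
  lookup.configure(&lookups, &keys, &input, &output, &hits);

  lookups.allocator()->allocate();
  keys.allocator()->allocate();
  input.allocator()->allocate();
  output.allocator()->allocate();
  hits.allocator()->allocate();
  // ... fill lookups, keys and input ...
  lookup.run();
}
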
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
new file mode 100644
index 000000000..521f50d2f
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform an Instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref NEInstanceNormalizationLayerKernelEx
+ */
+class NEInstanceNormalizationLayerEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor.
+ * Defaults to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEInstanceNormalizationLayer.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported:
+ * NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults
+ * to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEInstanceNormalizationLayerKernelEx _normalization_kernel;
+ bool _is_nchw;
+ NEPermute _permute_input;
+ NEPermute _permute_output;
+ Tensor _permuted_input;
+ Tensor _permuted_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */
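
A minimal sketch for the tensor-based gamma/beta entry point above (not part of the patch). The per-channel gamma/beta shapes and the NCHW layout are assumptions; the header only fixes the data types:

#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

void example_instance_norm_ex()
{
  using namespace arm_compute;

  // NCHW input: W=8, H=8, C=3, N=1; gamma/beta assumed to be per-channel vectors of length C
  Tensor input, output, gamma, beta;
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32));
  gamma.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
  beta.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));

  NEInstanceNormalizationLayerEx norm;
  norm.configure(&input, &output, &gamma, &beta, 1e-12f);

  input.allocator()->allocate();
  output.allocator()->allocate();
  gamma.allocator()->allocate();
  beta.allocator()->allocate();
  // ... fill input, gamma and beta ...
  norm.run();
}
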
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
new file mode 100644
index 000000000..5664c57cb
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPRELU_H__
+#define __ARM_COMPUTE_NEPRELU_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEPReLUKernel */
+class NEPReLU : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F32.
+ * @param[in] alpha Alpha (slope) tensor. Data types supported: Same as @p input.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEPRELU_H__ */
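
A minimal sketch (not part of the patch); the per-channel alpha shape is an assumption, since the documentation above only fixes the data types:

#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
#include "arm_compute/runtime/Tensor.h"

void example_prelu()
{
  using namespace arm_compute;

  Tensor input, alpha, output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
  alpha.allocator()->init(TensorInfo(TensorShape(1U, 1U, 8U), 1, DataType::F32)); // assumed per-channel slope
  output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

  NEPReLU prelu;
  prelu.configure(&input, &alpha, &output);

  input.allocator()->allocate();
  alpha.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input and alpha ...
  prelu.run();
}
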
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
new file mode 100644
index 000000000..17c37d806
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__
+#define __ARM_COMPUTE_NERNNLAYER_EX_H__
+
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to run @ref NERNNLayerEx */
+class NERNNLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NERNNLayerEx(const NERNNLayerEx &) = delete;
+ /** Default move constructor */
+ NERNNLayerEx(NERNNLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NERNNLayerEx &operator=(const NERNNLayerEx &) = delete;
+ /** Default move assignment operator */
+ NERNNLayerEx &operator=(NERNNLayerEx &&) = default;
+ /** Initialize the function
+ *
+ * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+ * types supported: F16/F32
+ * @param[in] weights Weights tensor of shape [input_size, num_units] that
+ * multiplies the input. Data types supported: Same as @p input
+ * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
+ * the current 'state'. Data types supported: Same as @p input
+ * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
+ * as @p input
+ * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in] info Activation layer parameter.
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights,
+ const ITensor *bias, ITensor *hidden_state, ITensor *output,
+ ActivationLayerInfo &info);
+ /** Initialize the function
+ *
+ * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
+ * types supported: F16/F32
+ * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
+ * the input. Data types supported: Same as @p input
+ * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
+ * current 'state'. Data types supported: Same as @p input
+ * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
+ * input
+ * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in] info Activation layer parameter.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+ const ITensorInfo *hidden_state, const ITensorInfo *output,
+ const ActivationLayerInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEGEMM _gemm_state_f;
+ NEArithmeticAdditionKernel _add_kernel;
+ NEActivationLayerKernel _activation_kernel;
+ NEFullyConnectedLayer _fully_connected_kernel;
+ NECopyKernel _copy_kernel;
+ Tensor _fully_connected_out;
+ Tensor _gemm_output;
+ Tensor _add_output;
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */
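
A single-step sketch following the shape conventions documented above, with input_size=4, num_units=8, batch_size=1 (not part of the patch; the TANH activation is an arbitrary choice):

#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

void example_rnn_ex()
{
  using namespace arm_compute;

  const unsigned int input_size = 4, num_units = 8, batch = 1;

  Tensor input, weights, recurrent_weights, bias, hidden_state, output;
  input.allocator()->init(TensorInfo(TensorShape(input_size, batch), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(input_size, num_units), 1, DataType::F32));
  recurrent_weights.allocator()->init(TensorInfo(TensorShape(num_units, num_units), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(num_units), 1, DataType::F32));
  hidden_state.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));

  ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);

  NERNNLayerEx rnn;
  rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);

  input.allocator()->allocate();
  weights.allocator()->allocate();
  recurrent_weights.allocator()->allocate();
  bias.allocator()->allocate();
  hidden_state.allocator()->allocate();
  output.allocator()->allocate();
  rnn.run(); // updates both output and hidden_state for the next step
}
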
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
new file mode 100644
index 000000000..7209acf19
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce operation */
+class NEReduceMeanEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ */
+ void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEReduceMeanEx
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+ * @param[in] output Destination tensor. Data type supported: Same as @p input
+ *
+ * @return A status
+ */
+ static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr};
+ std::unique_ptr<Tensor[]> _reduced_outs{nullptr};
+ NEReshapeLayer _reshape;
+ unsigned int _reduction_ops;
+ bool _keep_dims;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
new file mode 100644
index 000000000..9c558e6a2
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
+#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce operation */
+class NEReduceOperation : public IFunction
+{
+public:
+ /** Constructor */
+ NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] op Reduce operation to perform.
+ */
+ void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output,
+ ReduceOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEReduceOperation
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+ * @param[in] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] op Reduce operation to perform.
+ *
+ * @return A status
+ */
+ static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output, ReduceOperation op);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::vector<NEReductionOperationEx> _reduction_kernels;
+ std::vector<Tensor> _reduced_outs;
+ NEReshapeLayer _reshape;
+ unsigned int _reduction_ops;
+ bool _keep_dims;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */
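
A minimal sketch for this function (not part of the patch); the neighbouring NEReduceMeanEx and NEReduceSum follow the same pattern without the op argument. The ReduceOperation enumerator used here is assumed to be one of those declared in TypesEx.h, and the shapes are illustrative:

#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
#include "arm_compute/runtime/Tensor.h"

void example_reduce_operation()
{
  using namespace arm_compute;

  // Reduce a (4, 4, 2) tensor over axes 0 and 1, keeping the reduced dimensions as length 1
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 1U, 2U), 1, DataType::F32));

  Coordinates axes(0, 1);

  NEReduceOperation reduce;
  reduce.configure(&input, axes, /*keep_dims=*/true, &output,
                   ReduceOperation::MAX); // enumerator assumed from TypesEx.h

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  reduce.run();
}
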
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
new file mode 100644
index 000000000..c028ea658
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__
+#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce operation */
+class NEReduceSum : public IFunction
+{
+public:
+ /** Constructor */
+ NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ */
+ void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
+ * @param[in] output Destination tensor. Data type supported: Same as @p input
+ *
+ * @return A status
+ */
+ static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::vector<NEReductionOperation> _reduction_kernels;
+ std::vector<Tensor> _reduced_outs;
+ NEReshapeLayer _reshape;
+ unsigned int _reduction_ops;
+ bool _keep_dims;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h
new file mode 100644
index 000000000..7180742df
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
+#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to simulate a reduction operation. This function calls the following NEON
+ * kernels:
+ *
+ * -# @ref NEFillBorderKernel
+ * -# @ref NEReductionOperationKernelEx
+ *
+ */
+class NEReductionOperationEx : public IFunction
+{
+public:
+ /** Default constructor */
+ NEReductionOperationEx();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
+ * @param[in] axis Dimension along which to reduce.
+ * @param[in] op Reduction operation to perform.
+ */
+ void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEReductionOperationEx.
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] axis Dimension along which to reduce.
+ * @param[in] op Reduction operation to perform.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
+ ReduceOperation op);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEReductionOperationKernelEx _reduction_kernel;
+ NEFillBorderKernel _fill_border_kernel;
+ size_t _window_split;
+ int _reduction_axis;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */
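
A single-axis sketch for the function above (not part of the patch); shapes are illustrative and, as before, the ReduceOperation enumerator is assumed from TypesEx.h:

#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
#include "arm_compute/runtime/Tensor.h"

void example_reduction_ex()
{
  using namespace arm_compute;

  // Reduce along axis 0: (8, 4) -> (1, 4)
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));

  NEReductionOperationEx reduction;
  reduction.configure(&input, &output, /*axis=*/0,
                      ReduceOperation::MIN); // enumerator assumed from TypesEx.h

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  reduction.run();
}
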
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
new file mode 100644
index 000000000..302f9af2e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
+#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
+#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to spatially divide a tensor. This function calls the following NEON
+ * kernels/functions:
+ *
+ * -# @ref NEMemsetKernel
+ * -# @ref NESpaceToBatchLayerKernel
+ */
+class NESpaceToBatchLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ NESpaceToBatchLayerEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete;
+ /** Allow instances of this class to be moved */
+ NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default;
+ /** Allow instances of this class to be moved */
+ NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default;
+ /** Default destructor */
+ virtual ~NESpaceToBatchLayerEx() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
+ * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ */
+ void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings,
+ ITensor *output);
+ /** Set the input and output tensors. (Static block shape and paddings)
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[in] padding_left The left padding of the output tensor.
+ * @param[in] padding_right The right padding of the output tensor.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ */
+ void configure(const ITensor *input, const int block_shape_x, const int block_shape_y,
+ const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NESpaceToBatchLayerEx
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32
+ * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape,
+ const ITensorInfo *paddings, const ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NESpaceToBatchLayerEx (Static block shape and paddings)
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] block_shape_x Block shape x value.
+ * @param[in] block_shape_y Block shape y value.
+ * @param[in] padding_left The left padding of the output tensor.
+ * @param[in] padding_right The right padding of the output tensor.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y,
+ const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
+ NEMemsetKernel _memset_kernel; /**< Memset kernel to run */
+ bool _has_padding; /**< Flag to check if the output has padding */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */
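
A sketch of the static block-shape overload above, with no padding so the block/padding tensors are not needed (not part of the patch; shapes are illustrative):

#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

void example_space_to_batch_ex()
{
  using namespace arm_compute;

  // Block 2x2, no padding: (W=4, H=4, C=1, N=1) -> (2, 2, 1, 4)
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));

  NESpaceToBatchLayerEx s2b;
  s2b.configure(&input, /*block_shape_x=*/2, /*block_shape_y=*/2,
                Size2D(0, 0) /*padding_left*/, Size2D(0, 0) /*padding_right*/, &output);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  s2b.run();
}
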
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
new file mode 100644
index 000000000..117717b55
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
+#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** This function calls the following NEON kernels/functions:
+ *
+ * -# @ref NESpaceToDepthLayerKernelEx
+ */
+class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value
+ */
+ void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NESpaceToDepthLayerEx
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */
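
A minimal sketch with block shape 2 (not part of the patch; shapes are illustrative):

#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

void example_space_to_depth_ex()
{
  using namespace arm_compute;

  // Block 2: (W=4, H=4, C=1, N=1) -> (2, 2, 4, 1)
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U, 1U), 1, DataType::F32));

  NESpaceToDepthLayerEx s2d;
  s2d.configure(&input, &output, /*block_shape=*/2);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  s2d.run();
}
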
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
new file mode 100644
index 000000000..a50b9ea60
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Function to run the deconvolution layer.
+ *
+ * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
+ * input depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user-specified
+ * value where a < stride - 1 that increases the padding top and right of the input image.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where
+ * width is the size of the first input dimension.
+ * height is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimension.
+ *
+ * The weights used by Transpose convolution are supposed to be the same as the ones used for
+ * Convolution. Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref
+ * CPPFlipWeightsKernel.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsample
+ * -# @ref NEConvolutionLayer
+ *
+ */
+class NETransposeConvLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer(const NETransposeConvLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ /** Default destructor */
+ virtual ~NETransposeConvLayer() = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data types
+ * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data types
+ * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, const ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEConvolutionLayer _conv_f;
+ CPPUpsampleEx _upsample_f;
+ CPPFlipWeightsKernel _flip_weights;
+ NEPermute _permute_input;
+ NEPermute _permute_weights;
+ NEPermute _permute_output;
+ Tensor _scaled_output;
+ Tensor _weights_flipped;
+ Tensor _permuted_input;
+ Tensor _permuted_weights;
+ Tensor _permuted_output;
+ bool _is_nchw;
+ const ITensor *_original_weights;
+ ITensor *_input;
+ PadStrideInfo _info;
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
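For orientation, here is a minimal host-side sketch of configuring and running the function declared above. It is not taken from the runtime: the include path, tensor shapes and PadStrideInfo values are assumptions chosen only to illustrate the configure/run sequence.

    // Hypothetical usage sketch; shapes, strides and the include path are assumptions.
    #include <arm_compute/core/TensorInfo.h>
    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/Tensor.h>
    #include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h> // path assumed from this patch

    void run_transpose_conv_sketch()
    {
      using namespace arm_compute;

      Tensor input, weights, bias, output;
      // One 16x16 input with 32 channels, batch of 1 (W, H, C, N ordering of TensorShape).
      input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 32U, 1U), 1, DataType::F32));
      // 4D weights laid out as [width, height, IFM, OFM].
      weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U, 16U), 1, DataType::F32));
      bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
      // For stride 2, no padding and a 3x3 kernel, the deconvolved output is 33x33 here.
      output.allocator()->init(TensorInfo(TensorShape(33U, 33U, 16U, 1U), 1, DataType::F32));

      NETransposeConvLayer deconv;
      deconv.configure(&input, &weights, &bias, &output, PadStrideInfo(2, 2, 0, 0), 0, 0);

      input.allocator()->allocate();
      weights.allocator()->allocate();
      bias.allocator()->allocate();
      output.allocator()->allocate();

      deconv.run();
    }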
diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h
new file mode 100644
index 000000000..3db0c7e5e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        GenericGather.h
+ * @brief       This file contains GenericGather class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
+#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+#include <arm_compute/runtime/CL/functions/CLGatherEx.h>
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+/**
+ * @brief Class to run Gather with both CPU and GPU
+ */
+class GenericGather : public arm_compute::IFunction
+{
+public:
+ GenericGather(void)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] indices The indices tensor
+ * @param[in] output The destination tensor
+ * @param[in] axis (Optional) The axis in input to gather indices from
+ * @return N/A
+ */
+ void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices,
+ arm_compute::ITensor *output, int axis = 0);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+
+private:
+ arm_compute::ITensor *_input{nullptr};
+ arm_compute::ITensor *_indices{nullptr};
+ arm_compute::ITensor *_output{nullptr};
+ int _axis{0};
+ arm_compute::CLTensor _cl_permuted;
+
+private:
+ arm_compute::CLPermute _cl_permute;
+ arm_compute::CLGatherEx _cl_gather;
+};
+
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
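A short usage sketch of GenericGather follows. It assumes GPU mode (CL tensors); the shapes, data types and axis are illustrative only.

    // Hypothetical usage sketch for GenericGather in GPU mode; shapes and axis are illustrative.
    #include <arm_compute/core/TensorInfo.h>
    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/CL/CLScheduler.h>
    #include <arm_compute/runtime/CL/CLTensor.h>
    #include "arm_compute/runtime/misc/functions/GenericGather.h" // path assumed from this patch

    void run_generic_gather_sketch()
    {
      using namespace arm_compute;

      CLScheduler::get().default_init(); // a CL context must exist before configure()

      CLTensor input, indices, output;
      input.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
      indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
      output.allocator()->init(TensorInfo(TensorShape(10U, 3U), 1, DataType::F32));

      misc::GenericGather gather;
      gather.configure(&input, &indices, &output, /*axis=*/1);

      input.allocator()->allocate();
      indices.allocator()->allocate();
      output.allocator()->allocate();

      gather.run();
      CLScheduler::get().sync(); // wait for the enqueued CL work to finish
    }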
diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h
new file mode 100644
index 000000000..ab2fdc71d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        GenericReshapeLayer.h
+ * @brief       This file contains GenericReshapeLayer class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__
+#define __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEPermute.h>
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+/**
+ * @brief Class to run Reshape Layer with both CPU and GPU
+ */
+class GenericReshapeLayer : public arm_compute::IFunction
+{
+public:
+ GenericReshapeLayer(void)
+ : _input(nullptr), _output(nullptr), _cl_permuted{}, _neon_permuted{}, _cl_permute{},
+ _cl_reshape{}, _neon_permute{}, _neon_reshape{}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] output The destination tensor
+ * @return N/A
+ */
+ void configure(const arm_compute::ITensor *input, arm_compute::ITensor *output);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+
+private:
+ const arm_compute::ITensor *_input;
+ arm_compute::ITensor *_output;
+ arm_compute::CLTensor _cl_permuted;
+ arm_compute::Tensor _neon_permuted;
+
+private:
+ arm_compute::CLPermute _cl_permute;
+ arm_compute::CLReshapeLayer _cl_reshape;
+
+ arm_compute::NEPermute _neon_permute;
+ arm_compute::NEReshapeLayer _neon_reshape;
+};
+
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__
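A corresponding sketch for GenericReshapeLayer, assuming CPU (NEON) mode; the shapes are illustrative and only need to agree on the total element count.

    // Hypothetical usage sketch for GenericReshapeLayer in CPU mode; shapes are illustrative.
    #include <arm_compute/core/TensorInfo.h>
    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/Tensor.h>
    #include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h" // path assumed from this patch

    void run_generic_reshape_sketch()
    {
      using namespace arm_compute;

      Tensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(4U, 2U, 3U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(24U), 1, DataType::F32)); // same element count

      misc::GenericReshapeLayer reshape;
      reshape.configure(&input, &output);

      input.allocator()->allocate();
      output.allocator()->allocate();

      reshape.run();
    }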
diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h
new file mode 100644
index 000000000..53736f55f
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file utils.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains utils for arm compute library
+ */
+#ifndef __ARM_COMPUTE_MISC_UTILS_H__
+#define __ARM_COMPUTE_MISC_UTILS_H__
+
+#include <string>
+#include <cassert>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/core/Coordinates.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/Types.h>
+
+// TODO: This should be extracted into an independent module.
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace utils
+{
+
+/**
+ * @brief Check if this runtime runs on GPU or NEON
+ * @return @c true if GPU mode, otherwise @c false
+ */
+bool isGpuMode();
+
+#ifndef CAST_CL
+#define CAST_CL(tensor) static_cast<::arm_compute::CLTensor *>(tensor)
+#endif
+
+#ifndef CAST_NE
+#define CAST_NE(tensor) static_cast<::arm_compute::Tensor *>(tensor)
+#endif
+
+/**
+ * @brief Generate an arm_compute permutation vector from a runtime permutation vector
+ * @param[in] rank Tensor rank (supported up to 4)
+ * @param[in] runtime_pv Integer array holding the runtime permutation vector
+ * @return Permutation vector of arm_compute
+ */
+arm_compute::PermutationVector getARMComputePermutationVector(uint32_t rank,
+ const int32_t *runtime_pv);
+
+/**
+ * @brief Set value to arm compute tensor with casting
+ * @param[in] value Value to set
+ * @param[out] to Target tensor of arm compute
+ * @param[in] id Position of element
+ * @return N/A
+ */
+template <typename FromT>
+void copyCast(const FromT value, arm_compute::ITensor *to, const arm_compute::Coordinates &id)
+{
+ switch (to->info()->data_type())
+ {
+ case arm_compute::DataType::F32:
+ {
+ *reinterpret_cast<float *>(to->ptr_to_element(id)) = static_cast<float>(value);
+ break;
+ }
+ case arm_compute::DataType::S32:
+ {
+ *reinterpret_cast<int32_t *>(to->ptr_to_element(id)) = static_cast<int32_t>(value);
+ break;
+ }
+ case arm_compute::DataType::U32:
+ {
+ *reinterpret_cast<uint32_t *>(to->ptr_to_element(id)) = static_cast<uint32_t>(value);
+ break;
+ }
+ case arm_compute::DataType::QASYMM8:
+ {
+ float realValue = static_cast<float>(value);
+ // NOTE The rounding policy for quantization has not been decided yet,
+ // so this is set to a temporary value.
+ *(to->ptr_to_element(id)) =
+ to->info()->quantization_info().quantize(realValue, arm_compute::RoundingPolicy::TO_ZERO);
+ break;
+ }
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+}
+
+} // namespace utils
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_UTILS_H__
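The copyCast helper above dispatches on the destination tensor's data type. The snippet below is a small, self-contained sketch of writing one value into a CPU tensor with it; the shape and coordinate are illustrative.

    // Hypothetical sketch of copyCast on a CPU tensor; shape and coordinate are illustrative.
    #include <arm_compute/core/TensorInfo.h>
    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/Tensor.h>
    #include "arm_compute/runtime/misc/functions/Utils.h" // path assumed from this patch

    void copy_cast_sketch()
    {
      using namespace arm_compute;

      Tensor t;
      t.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::S32));
      t.allocator()->allocate();

      // The float 7.9f is cast to the tensor's S32 data type (stored as 7) at element (x=2, y=1).
      misc::utils::copyCast(7.9f, &t, Coordinates(2, 1));
    }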
diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py
new file mode 100644
index 000000000..b3e252892
--- /dev/null
+++ b/compute/ARMComputeEx/resolve_includes.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import collections
+import os.path
+import re
+import subprocess
+import glob
+
+
+def resolve_includes(target, source):
+ # File collection
+ FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')
+
+ # Include pattern
+ pattern = re.compile("#include \"(.*)\"")
+
+ # Get file contents
+ files = []
+ for i in range(len(source)):
+ src = source[i]
+ dst = target[i]
+ f = open(src)
+ cts = f.read()
+ f.close()
+ contents = cts.splitlines()
+ entry = FileEntry(target_name=dst, file_contents=contents)
+ files.append((os.path.basename(src), entry))
+
+ # Create dictionary of tupled list
+ files_dict = dict(files)
+
+ # Check for includes (can only be files in the same folder)
+ final_files = []
+ for file in files:
+ done = False
+ tmp_file = file[1].file_contents
+ print(file[1].target_name)
+ while not done:
+ file_count = 0
+ updated_file = []
+ for line in tmp_file:
+ found = pattern.search(line)
+ if found:
+ include_file = found.group(1)
+ data = files_dict[include_file].file_contents
+ updated_file.extend(data)
+ else:
+ updated_file.append(line)
+ file_count += 1
+
+ # Check if all include are replaced.
+ if file_count == len(tmp_file):
+ done = True
+
+ # Update temp file
+ tmp_file = updated_file
+
+ # Append and prepend string literal identifiers and add expanded file to final list
+ tmp_file.insert(0, "R\"(\n")
+ tmp_file.append("\n)\"")
+ entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file)
+ final_files.append((file[0], entry))
+
+ # Write output files
+ for file in final_files:
+ with open(file[1].target_name, 'w+') as out_file:
+ out_file.write("\n".join(file[1].file_contents))
+
+
+# Generate embed files
+cl_files = glob.glob('src/core/CL/cl_kernels/*.cl')
+cl_files += glob.glob('src/core/CL/cl_kernels/*.h')
+
+# DEBUG: print cl files
+print("cl_files:")
+print(cl_files)
+
+embed_files = [f + "embed" for f in cl_files]
+print("embed_files:")
+print(embed_files)
+
+resolve_includes(embed_files, cl_files)
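The script above only expands local includes and wraps the result in a C++ raw string literal, which is what lets a generated *.clembed file be pulled straight into an initializer with #include (as CLKernelLibrary.cpp does below). A minimal illustration of that pattern, using a hypothetical "hello.cl"/"hello.clembed" pair:

    // Illustration of the raw-string-literal pattern produced by resolve_includes.py.
    // In the real build the literal body would come from: #include "./cl_kernels/hello.clembed"
    #include <map>
    #include <string>

    static const std::map<std::string, std::string> example_program_source_map = {
        {
            "hello.cl",
            R"(
    __kernel void hello() {}
    )",
        },
    };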
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
new file mode 100644
index 000000000..7d4760600
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
+ // ARMComputeEx kernels
+ {"arg_op", "arg_operation.cl"},
+ {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
+ {"binary_logical_op", "binary_logical_op.cl"},
+ {"cast", "cast.cl"},
+ {"cast_qasymm_in", "cast.cl"},
+ {"cast_qasymm_out", "cast.cl"},
+ {"comparison_op", "comparison_op.cl"},
+ {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
+ {"depth_to_space_nchw", "depth_to_space.cl"},
+ {"depth_to_space_nhwc", "depth_to_space.cl"},
+ {"embedding_lookup", "embedding_lookup.cl"},
+ {"gather_ex", "gather_ex.cl"},
+ {"gather_ex_1d", "gather_ex.cl"},
+ {"gather_ex_1d_out", "gather_ex.cl"},
+ {"hashtable_lookup", "hashtable_lookup.cl"},
+ {"instance_normalization_ex", "instance_normalization_ex.cl"},
+ {"neg_tensor", "neg_tensor.cl"},
+ {"permute_generic", "permute_ex.cl"},
+ {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
+ {"prelu", "prelu.cl"},
+ {"prelu_qasymm8", "prelu_quantized.cl"},
+ {"reduce_min_max", "reduce_operation.cl"},
+ {"reduce_sum_mean", "reduce_operation.cl"},
+ {"topkv2_init", "topkv2.cl"},
+ {"topkv2_find_first_negative", "topkv2.cl"},
+ {"topkv2_reorder_negatives", "topkv2.cl"},
+ {"topkv2_store", "topkv2.cl"},
+ {"radixsort_histogram", "topkv2_radixsort.cl"},
+ {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
+ {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
+ {"radixsort_reorder", "topkv2_radixsort.cl"},
+ {"topkv2_quicksort", "topkv2_quicksort.cl"},
+ {"space_to_batch_4d_nchw", "space_to_batch.cl"},
+ {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
+ {"space_to_depth_nchw", "space_to_depth.cl"},
+ {"space_to_depth_nhwc", "space_to_depth.cl"},
+};
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
+#ifdef EMBEDDED_KERNELS
+ {
+ "arg_operation.cl",
+#include "./cl_kernels/arg_operation.clembed"
+ },
+ {
+ "cast.cl",
+#include "./cl_kernels/cast.clembed"
+ },
+ {
+ "embedding_lookup.cl",
+#include "./cl_kernels/embedding_lookup.clembed"
+ },
+ {
+ "depth_to_space.cl",
+#include "./cl_kernels/depth_to_space.clembed"
+ },
+ {
+ "gather_ex.cl",
+#include "./cl_kernels/gather_ex.clembed"
+ },
+ {
+ "hashtable_lookup.cl",
+#include "./cl_kernels/hashtable_lookup.clembed"
+ },
+ {
+ "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+ },
+ {
+ "helpers_asymm.h",
+#include "./cl_kernels/helpers_asymm.hembed"
+ },
+ {
+ "instance_normalization_ex.cl",
+#include "./cl_kernels/instance_normalization_ex.clembed"
+ },
+ {
+ "binary_logical_op.cl",
+#include "./cl_kernels/binary_logical_op.clembed"
+ },
+ {
+ "neg_tensor.cl",
+#include "./cl_kernels/neg_tensor.clembed"
+ },
+ {
+ "prelu.cl",
+#include "./cl_kernels/prelu.clembed"
+ },
+ {
+ "prelu_quantized.cl",
+#include "./cl_kernels/prelu_quantized.clembed"
+ },
+ {
+ "reduce_operation.cl",
+#include "./cl_kernels/reduce_operation.clembed"
+ },
+ {
+ "space_to_batch.cl",
+#include "./cl_kernels/space_to_batch.clembed"
+ },
+ {
+ "space_to_depth.cl",
+#include "./cl_kernels/space_to_depth.clembed"
+ },
+ {
+ "topkv2.cl",
+#include "./cl_kernels/topkv2.clembed"
+ },
+ {
+ "topkv2_radixsort.cl",
+#include "./cl_kernels/topkv2_radixsort.clembed"
+ },
+ {
+ "topkv2_quicksort.cl",
+#include "./cl_kernels/topkv2_quicksort.clembed"
+ },
+
+#endif /* EMBEDDED_KERNELS */
+};
+
+CLKernelLibraryEx::CLKernelLibraryEx()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+{
+ opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
+ // CLKernelLibraryEx is built
+}
+
+CLKernelLibraryEx &CLKernelLibraryEx::get()
+{
+ static CLKernelLibraryEx _kernel_library;
+ return _kernel_library;
+}
+
+Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set) const
+{
+ // Find which program contains the kernel
+ auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+ if (_kernel_program_map.end() == kernel_program_it)
+ {
+ ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+ }
+ std::string concat_str;
+
+ if (fp16_supported())
+ {
+ concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
+ }
+
+ if (get_cl_version(_device) == CLVersion::CL20)
+ {
+ concat_str += " -cl-std=CL2.0 ";
+ }
+ else if (arm_non_uniform_workgroup_supported(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
+ }
+
+ // Check if the program has been built before with same build options.
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set) + concat_str;
+
+ const std::string built_program_name = program_name + "_" + build_options;
+ auto built_program_it = _built_programs_map.find(built_program_name);
+
+ cl::Program cl_program;
+
+ if (_built_programs_map.end() != built_program_it)
+ {
+ // If program has been built, retrieve to create kernel from it
+ cl_program = built_program_it->second;
+ }
+ else
+ {
+ // Get program
+ Program program = load_program(program_name);
+
+ // Build program
+ cl_program = program.build(build_options);
+
+ // Add built program to internal map
+ _built_programs_map.emplace(built_program_name, cl_program);
+ }
+
+ // Create and return kernel
+ return Kernel(kernel_name, cl_program);
+}
+
+void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
+ cl::Program program)
+{
+ _built_programs_map.emplace(built_program_name, program);
+}
+
+bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); }
+
+bool CLKernelLibraryEx::int64_base_atomics_supported() const
+{
+ return device_supports_extension(_device, "cl_khr_int64_base_atomics");
+}
+
+const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
+{
+ const auto program_it = _programs_map.find(program_name);
+
+ if (program_it != _programs_map.end())
+ {
+ return program_it->second;
+ }
+
+ Program program;
+
+#ifdef EMBEDDED_KERNELS
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (_program_source_map.end() == program_source_it)
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ program = Program(_context, program_name, program_source_it->second);
+#else /* EMBEDDED_KERNELS */
+ // Check for binary
+ std::string source_name = _kernel_path + program_name;
+ std::string binary_name = source_name + "bin";
+
+ if (std::ifstream(binary_name).is_open())
+ {
+ const std::string program_binary = read_file(binary_name, true);
+ program = Program(_context, _device, program_name,
+ std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
+ }
+ else if (std::ifstream(source_name).is_open())
+ {
+ program = Program(_context, program_name, read_file(source_name, false));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
+ }
+#endif /* EMBEDDED_KERNELS */
+
+ // Insert program to program map
+ const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+ return new_program.first->second;
+}
+
+std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const
+{
+ std::string concat_set;
+
+#ifndef EMBEDDED_KERNELS
+ concat_set += "-I" + _kernel_path + " ";
+#endif /* EMBEDDED_KERNELS */
+
+ // Concatenate set
+ for (const auto &el : s)
+ {
+ concat_set += " " + el;
+ }
+
+ return concat_set;
+}
+
+std::string CLKernelLibraryEx::get_program_source(const std::string &program_name)
+{
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (program_source_it == _program_source_map.end())
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ return program_source_it->second;
+}
+
+size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const
+{
+ size_t result;
+
+ size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+ ARM_COMPUTE_ERROR_ON_MSG(
+ err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_UNUSED(err);
+
+ return result;
+}
+
+cl::NDRange CLKernelLibraryEx::default_ndrange() const
+{
+ // GPUTarget _target = get_target_from_device(_device);
+ cl::Device device = cl::Device::getDefault();
+ GPUTarget _target = get_target_from_device(device);
+ cl::NDRange default_range;
+
+ switch (_target)
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::T600:
+ case GPUTarget::T700:
+ case GPUTarget::T800:
+ default_range = cl::NDRange(128u, 1);
+ break;
+ default:
+ default_range = cl::NullRange;
+ }
+
+ return default_range;
+}
+
+std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); }
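To tie the two maps above together, here is a sketch of requesting one of the extension kernels. It assumes the library has already been initialised with a CL context and device, and that StringSet is a std::set<std::string>; the build options are illustrative.

    // Hypothetical sketch: building the "cast" extension kernel registered above.
    #include <set>
    #include <string>
    #include "arm_compute/core/CL/CLKernelLibraryEx.h"

    void build_cast_kernel_sketch()
    {
      using namespace arm_compute;

      // Build options must match what cast.cl expects (see the kernel documentation below).
      std::set<std::string> build_opts;
      build_opts.emplace("-DDATA_TYPE_IN=float");
      build_opts.emplace("-DDATA_TYPE_OUT=int");
      build_opts.emplace("-DVEC_SIZE=16");

      // Looks up "cast" in _kernel_program_map, loads cast.cl (embedded or from disk),
      // compiles it with the given options and caches the built program for reuse.
      Kernel cast_kernel = CLKernelLibraryEx::get().create_kernel("cast", build_opts);
      (void)cast_kernel;
    }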
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
new file mode 100644
index 000000000..2a6dfc91f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform arg_max/arg_min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
+ * e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention Operation type (code) specifying which operation to perform should be passed as a
+ * preprocessor argument using -DOP_CODE=number, e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types:
+ * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension
+ * (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension
+ * (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element
+ * in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension
+ * (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image.
+ * Supported data types: U32
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension
+ * (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ * @param[in] axis Axis along which the reduction is performed
+ * @param[in] dim Size of the dimension along that axis (number of elements reduced)
+ */
+
+__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis,
+ const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] = {
+ get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE value =
+ *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ DATA_TYPE tval = value;
+ int idx = 0;
+ for (int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+#if OP_CODE == 1 // ArgMax
+ value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+#elif OP_CODE == 2 // ArgMin
+ value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+#else
+ return;
+
+#endif
+
+ if (tval != value)
+ {
+ idx = indices[axis];
+ tval = value;
+ }
+ }
+
+ *((__global uint *)out.ptr) = idx;
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
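The kernel keeps a running max/min and records the last index at which it changed. A scalar C++ sketch of the same ArgMax loop over a single 1-D slice is shown below; the function name and use of std::vector are illustrative only.

    // Scalar sketch of the arg_op ArgMax reduction over one slice; illustrative only.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    int arg_max_1d(const std::vector<float> &values)
    {
      float best = values[0]; // running maximum, like `tval` in the kernel
      int idx = 0;
      for (std::size_t i = 1; i < values.size(); ++i)
      {
        const float next = std::max(best, values[i]);
        if (next != best) // the running maximum changed, so remember this position
        {
          idx = static_cast<int>(i);
          best = next;
        }
      }
      return idx;
    }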
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
new file mode 100644
index 000000000..77e239f55
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers_asymm.h"
+
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8
+ *
+ * The following computations will be performed:
+ *
+ * -# Add offset terms to inputs
+ * -# Get scaled value of two inputs
+ * -# Add inputs
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using
+ * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The number of bits to shift left of input tensors must be passed at compile time using
+ * -DLEFT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
+ * must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT,
+ * -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @attention The input and output data_types need to be passed at compile time using
+ * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The inputs and output scale information of qasymm8 need to be passed at compile time
+ * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
+ * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
+ * @attention The inputs and output scale offset need to be passed at compile time using
+ * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
+ * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise
+ * wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source tensor.
+ * Supported data types: QASYMM8
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types:
+ * QASYMM8
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] out_ptr Pointer to the destination tensor.
+ * Supported data types: QASYMM8
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Get scaled value of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+
+ VEC_DATA_TYPE(int, 16)
+ left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
+ VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
+ VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
+
+ VEC_DATA_TYPE(int, 16)
+ scaled_in1_val =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
+ VEC_DATA_TYPE(int, 16)
+ scaled_in2_val =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
+
+ // Add inputs and multiply with a multiplier smaller than 1
+ VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
+ VEC_DATA_TYPE(int, 16)
+ out_val =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+ // TODO: Apply min-max BOUND to support fusion with ReLU.
+ /*
+ #if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+ #endif // defined(MIN_BOUND)
+ #if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+ #endif // defined(MAX_BOUND)
+ */
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
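The kernel performs the addition entirely in fixed point via ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE. The scalar sketch below only illustrates the order of operations (apply input offsets and scales, add, re-scale and re-offset) using floats, following the sign convention of the kernel's IN*_OFFSET/RESULT_OFFSET terms; it is not bit-exact with the OpenCL code.

    // Float sketch of the requantized addition; not bit-exact with the fixed-point kernel.
    #include <algorithm>
    #include <cstdint>

    uint8_t quantized_add_sketch(uint8_t in1, uint8_t in2,
                                 int in1_offset, float in1_scale,
                                 int in2_offset, float in2_scale,
                                 int out_offset, float out_scale)
    {
      // Offsets are added with the same sign convention as IN1_OFFSET/IN2_OFFSET in the kernel.
      const float real1 = (static_cast<int>(in1) + in1_offset) * in1_scale;
      const float real2 = (static_cast<int>(in2) + in2_offset) * in2_scale;

      // Add in the real domain, then requantize with the output scale and offset.
      const int requantized = static_cast<int>((real1 + real2) / out_scale) + out_offset;

      // Clamped here for safety; the kernel currently relies on the plain convert to uchar
      // (the min/max bounds are still a TODO there).
      return static_cast<uint8_t>(std::min(255, std::max(0, requantized)));
    }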
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
new file mode 100644
index 000000000..8c875516d
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OP_CODE) && defined(DATA_TYPE)
+/** Returns the truth value of the two input tensors for a binary logical op,
+ * where the binary logical op can be AND or OR.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size.
+ * e.g. -DVEC_SIZE=16
+ * @attention Operation type (code) specifying which operation to perform should be passed as a
+ * preprocessor argument using -DOP_CODE=number, e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor.
+ * Supported data types: QASYMM8
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] input2_ptr Pointer to the source tensor.
+ * Supported data types: QASYMM8
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] output_ptr Pointer to the destination tensor.
+ * Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if OP_CODE == 1 // LOGICAL AND
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+#elif OP_CODE == 2 // LOGICAL OR
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+#else // OP NOT SUPPORTED
+  return;
+
+#endif
+}
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
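Element-wise, the kernel reduces to the C truth-value semantics of && and ||, with the result stored back in the tensor's data type as 1 or 0. A scalar sketch:

    // Scalar sketch of binary_logical_op for one QASYMM8 element; op_code mirrors -DOP_CODE.
    #include <cstdint>

    uint8_t binary_logical_sketch(uint8_t a, uint8_t b, int op_code)
    {
      switch (op_code)
      {
        case 1: // LOGICAL AND
          return static_cast<uint8_t>((a != 0) && (b != 0));
        case 2: // LOGICAL OR
          return static_cast<uint8_t>((a != 0) || (b != 0));
        default: // unsupported op code: the kernel simply returns without storing
          return 0;
      }
    }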
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..2342fda9f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef SCALE
+#define SCALE 1.0f
+#endif
+#ifndef OFFSET
+#define OFFSET 0
+#endif
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
+/** Perform a cast operation on an input tensor.
+ *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention -DBOOL_INPUT : Whether type of input is bool.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+ 0, (__global DATA_TYPE_OUT *)output.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+ res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+#if defined(BOOL_INPUT)
+ VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE));
+ VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1);
+ res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+#endif // defined(BOOL_INPUT)
+
+ VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
+
+/** Perform a cast operation on a QASYMM8 input tensor.
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of input should be given as a preprocessor argument using
+ * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+
+ VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
+ VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+ (__global DATA_TYPE_OUT *)output.ptr);
+}
+
+/** Perform a cast operation on a QASYMM8 output tensor.
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of output should be given as a preprocessor argument using
+ * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+
+ VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
+ VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+ (__global DATA_TYPE_OUT *)output.ptr);
+}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
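The two QASYMM8 variants are a plain dequantize and quantize around the element-wise copy: cast_qasymm_in computes (value - OFFSET) * SCALE, and cast_qasymm_out computes value / SCALE + OFFSET. A scalar sketch, with uint8_t/float element types fixed purely for illustration:

    // Scalar sketch of the QASYMM8 cast kernels; element types fixed for illustration.
    #include <cstdint>

    float cast_qasymm_in_sketch(uint8_t in, int offset, float scale)
    {
      // Mirrors cast_qasymm_in: subtract the offset, then apply the scale.
      return static_cast<float>(static_cast<int>(in) - offset) * scale;
    }

    uint8_t cast_qasymm_out_sketch(float in, int offset, float scale)
    {
      // Mirrors cast_qasymm_out: divide by the scale, then add the offset back.
      return static_cast<uint8_t>(in / scale + static_cast<float>(offset));
    }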
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
new file mode 100644
index 000000000..e005322f7
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform depth to space rearrangement of tensor (NCHW)
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+ * using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ * -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+
+ int out_index[4] = {0};
+ int in_index[4] = {0};
+
+ out_index[0] = get_global_id(0); // W
+ out_index[1] = get_global_id(1); // H
+ out_index[2] = get_global_id(2) % Z_OUT; // C
+ out_index[3] = get_global_id(2) / Z_OUT; // B
+
+ in_index[0] = out_index[0] / BLOCK_SIZE;
+ in_index[1] = out_index[1] / BLOCK_SIZE;
+ in_index[2] = out_index[2] +
+ ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+ in_index[3] = out_index[3];
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+ &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform depth to space rearrangement of tensor (NHWC)
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+ * using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ * -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+
+ int out_index[4] = {0};
+ int in_index[4] = {0};
+
+ out_index[0] = get_global_id(0); // C
+ out_index[1] = get_global_id(1); // W
+ out_index[2] = get_global_id(2) % Z_OUT; // H
+ out_index[3] = get_global_id(2) / Z_OUT; // B
+
+ in_index[0] = out_index[0] +
+ ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
+ in_index[1] = out_index[1] / BLOCK_SIZE;
+ in_index[2] = out_index[2] / BLOCK_SIZE;
+ in_index[3] = out_index[3];
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+ &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
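For reference, the index arithmetic of depth_to_space_nhwc above corresponds to the host-side sketch below, written in plain C. The function name, the dense NHWC float layout and the shape parameters are illustrative assumptions rather than part of the kernel interface; block plays the role of BLOCK_SIZE and c_out the role of DEPTH_OUT.

#include <stddef.h>

/* output[b][h][w][c] = input[b][h / block][w / block][c + ((h % block) * block + w % block) * c_out] */
static void depth_to_space_nhwc_ref(const float *in, float *out, size_t batch, size_t h_out,
                                    size_t w_out, size_t c_out, size_t block)
{
  const size_t h_in = h_out / block;
  const size_t w_in = w_out / block;
  const size_t c_in = c_out * block * block;

  for (size_t b = 0; b < batch; ++b)
    for (size_t h = 0; h < h_out; ++h)
      for (size_t w = 0; w < w_out; ++w)
        for (size_t c = 0; c < c_out; ++c)
        {
          /* Same arithmetic as the kernel: the intra-block offsets fold into the input channel. */
          const size_t ci = c + ((h % block) * block + (w % block)) * c_out;
          const size_t src = ((b * h_in + h / block) * w_in + w / block) * c_in + ci;
          const size_t dst = ((b * h_out + h) * w_out + w) * c_out + c;
          out[dst] = in[src];
        }
}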
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..dd8cb6d93
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention The number of input dimensions is passed as a preprocessor argument using
+ * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data
+ * types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in
+ * bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups
+ * vector
+ */
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+ // lookup ids based on the tensor dimensions
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+ : get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+ : get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+ : get_global_id(2) % DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)
+ ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ : get_global_id(2) / DEPTH_OUT;
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+ lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+ (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
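In effect the kernel above copies, for every output position, one row of the input selected by the lookups vector along the outermost dimension. A minimal host-side sketch in plain C, assuming a dense float layout and treating everything below the lookup dimension as one contiguous row (names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void embedding_lookup_ref(const float *in, float *out, const int32_t *lookups,
                                 size_t num_lookups, size_t row_elems)
{
  for (size_t i = 0; i < num_lookups; ++i)
  {
    /* The kernel offsets the input pointer by lookups[i] along the outermost
     * dimension; for a dense layout that is a plain row copy. */
    memcpy(out + i * row_elems, in + (size_t)lookups[i] * row_elems, row_elems * sizeof(float));
  }
}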
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
new file mode 100644
index 000000000..09f776156
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in] indices_ptr Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the
+ * indices tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension
+ * (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination
+ * tensor
+ */
+__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
+ TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+ const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+ const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+ const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+#elif INDICES_DIM == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+#endif
+#elif AXIS == 1
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+#elif INDICES_DIM == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+#endif
+#elif AXIS == 2
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+#endif
+#elif AXIS == 3
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif
+#endif // AXIS
+
+ *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
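The AXIS/INDICES_DIM branches above all reduce to the same idea: one coordinate of the input is replaced by a value read from the indices tensor. Two host-side sketches in plain C for the 2-D, 1-D-indices case (the row-major layout and the names are illustrative assumptions):

#include <stddef.h>
#include <stdint.h>

/* out[i][j] = in[idx[i]][j] : gather whole rows of a row-major matrix */
static void gather_rows_ref(const float *in, float *out, const uint32_t *idx, size_t n_idx,
                            size_t cols)
{
  for (size_t i = 0; i < n_idx; ++i)
    for (size_t j = 0; j < cols; ++j)
      out[i * cols + j] = in[(size_t)idx[i] * cols + j];
}

/* out[i][k] = in[i][idx[k]] : gather selected columns of every row */
static void gather_cols_ref(const float *in, float *out, const uint32_t *idx, size_t rows,
                            size_t cols, size_t n_idx)
{
  for (size_t i = 0; i < rows; ++i)
    for (size_t k = 0; k < n_idx; ++k)
      out[i * n_idx + k] = in[i * cols + (size_t)idx[k]];
}

Which of the two corresponds to a given AXIS value depends on the dimension ordering; in the kernel, dimension 0 (X) is the innermost one.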
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
new file mode 100644
index 000000000..73f29e3e5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform hashtable_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention The number of input dimensions is passed as a preprocessor argument using
+ * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data
+ * types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in
+ * bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups
+ * vector
+ */
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+ : get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+ : get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+ : get_global_id(2) % DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)
+ ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ : get_global_id(2) / DEPTH_OUT;
+
+ if (lup_id[NUM_DIMS - 1] < 0)
+ {
+ VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+ return;
+ }
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+ lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+ (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
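hashtable_lookup differs from embedding_lookup only in its handling of misses: a negative entry in the lookups vector means the key was not found, and the corresponding output row is zero-filled, which is what the early VSTORE of zeros above does. A host-side sketch in plain C under the same dense-layout assumptions (names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void hashtable_lookup_ref(const float *in, float *out, const int32_t *lookups,
                                 size_t num_lookups, size_t row_elems)
{
  for (size_t i = 0; i < num_lookups; ++i)
  {
    if (lookups[i] < 0)
      memset(out + i * row_elems, 0, row_elems * sizeof(float)); /* miss: zero-fill the row */
    else
      memcpy(out + i * row_elems, in + (size_t)lookups[i] * row_elems, row_elems * sizeof(float));
  }
}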
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 000000000..0e123ae0a
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+ defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define EXPAND(x) x
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
+ uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
+ uint name##_step_w, uint name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, name##_step_x, name##_stride_y, \
+ name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
+ mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+ int stride_w; /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ *
+ * @return A vector object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x)
+{
+ Vector vector = {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ };
+ vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+ return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr +=
+ img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y,
+ uint step_y, uint stride_z, uint step_z)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z)
+{
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return tensor;
+}
+
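+/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data. get_global_id(2) is split by @p mod_size into the Z coordinate (remainder)
+ * and the W coordinate (quotient).
+ *
+ * @return A 4D tensor object
+ */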
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+ (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+ return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+ return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ * @param[in] w Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ w * tensor->stride_w;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
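The structures and offset helpers above all implement the same stride-based addressing: an element at coordinates (x, y, z, w) lives at the first-element offset plus the stride-weighted coordinates, all in bytes. A host-side sketch in plain C (struct and field names mirror the header; the rest is illustrative):

#include <stddef.h>
#include <stdint.h>

struct tensor4d_view
{
  uint8_t *ptr;                                  /* base pointer of the buffer       */
  size_t offset_first_element_in_bytes;          /* byte offset of element (0,0,0,0) */
  size_t stride_x, stride_y, stride_z, stride_w; /* strides in bytes                 */
};

static uint8_t *tensor4d_element(const struct tensor4d_view *t, size_t x, size_t y, size_t z,
                                 size_t w)
{
  return t->ptr + t->offset_first_element_in_bytes + x * t->stride_x + y * t->stride_y +
         z * t->stride_z + w * t->stride_w;
}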
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..c39138caa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ mask = (1 << exponent) - 1; \
+ const VEC_DATA_TYPE(int, size) zero = 0; \
+ const VEC_DATA_TYPE(int, size) one = 1; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+ }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ overflow = a == b && a == INT_MIN; \
+ VEC_DATA_TYPE(long, size) \
+ a_64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b_64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ ab_64 = a_64 * b_64; \
+ /* COMPMID-907 */ \
+ VEC_DATA_TYPE(int, size) \
+ ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+ return select(ab_x2_high32, INT_MAX, overflow); \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+ a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = \
+ ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + \
+ ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+ }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result containing bits from @p then_val or from @p else_val depending on whether the
+ * corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+ VEC_DATA_TYPE(int, size) then_val, \
+ VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a == 0); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a != 0); \
+ }
+
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+ VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK( \
+ ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+ a_mod_quarter_minus_one_quarter_scaled, size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+ size); \
+ result = \
+ EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+ }
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
+ }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, sum >= 0); \
+ return convert_int##size((sum + sign) / 2); \
+ }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+ VEC_DATA_TYPE(int, size) \
+ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+ for (int i = 0; i < 3; i++) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+ VEC_DATA_TYPE(int, size) \
+ tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+ x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+ } \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+ }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value
+ * accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
+ int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+ }
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+ asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
+ asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
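The vector macros above are built from two scalar fixed-point primitives: a round-to-nearest division by a power of two and a saturating "doubling high" multiply for Q-format products. Scalar sketches in plain C, mirroring the arithmetic of ASYMM_ROUNDING_DIVIDE_BY_POW2 and ASYMM_MULT (function names are illustrative; an arithmetic right shift for signed values is assumed, as OpenCL guarantees):

#include <stdint.h>

static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
  const int32_t mask = (int32_t)((1u << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); /* rounds half away from zero */
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

static int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN) /* the only overflowing case: saturate */
    return INT32_MAX;
  const int64_t ab = (int64_t)a * (int64_t)b;
  /* Keep the high 32 bits of 2*a*b, rounded, exactly as asymm_mult does. */
  return (int32_t)((ab + (1 << 30)) >> 31);
}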
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
new file mode 100644
index 000000000..1d96150f8
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
+ defined(DIM_Y) && defined(DIM_Z)
+/** This function performs instance normalization, normalizing each spatial plane of the input
+ * tensor (per channel and per batch) with respect to its own mean and standard deviation.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g.
+ * -DDATA_TYPE=float
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with
+ * -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value,
+ * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported
+ * data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension
+ * (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first
+ * source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor.
+ * Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the destination tensor
+ * @param[in] gamma_ptr (Optional) Pointer to the gamma tensor.
+ * Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x (Optional) Stride of the gamma tensor in X
+ * dimension (in bytes)
+ * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the gamma tensor
+ * @param[in] beta_ptr (Optional) Pointer to the beta tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] beta_stride_x (Optional) Stride of the beta tensor in X
+ * dimension (in bytes)
+ * @param[in] beta_step_x (Optional) beta_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the beta tensor
+ */
+__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+#ifdef GAMMA
+ ,
+ VECTOR_DECLARATION(gamma)
+#endif // GAMMA
+#ifdef BETA
+ ,
+ VECTOR_DECLARATION(beta)
+#endif // BETA
+ )
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+#ifndef IN_PLACE
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+#endif /* IN_PLACE */
+
+ float sum = 0.f;
+ float sum_sq = 0.f;
+
+#if defined(NHWC)
+
+ const int ch = get_global_id(0); // Current channel
+ const int batch = get_global_id(2); // Current batch
+ const int elements_plane = DIM_Y * DIM_Z;
+
+ for (int i_w = 0; i_w < DIM_Y; ++i_w)
+ {
+ for (int i_h = 0; i_h < DIM_Z; ++i_h)
+ {
+ float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+ sum += data;
+ sum_sq += data * data;
+ }
+ }
+
+#else // !defined(NHWC)
+ const int ch = get_global_id(2) % DIM_Z; // Current channel
+ const int batch = get_global_id(2) / DIM_Z; // Current batch
+ const int elements_plane = DIM_X * DIM_Y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ part_sum = 0.f;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ part_sum_sq = 0.f;
+ // Calculate partial sum
+ for (int y = 0; y < DIM_Y; ++y)
+ {
+ int x = 0;
+ for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+ {
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+ part_sum += data;
+ part_sum_sq += data * data;
+ }
+ // Left-overs loop
+ for (; x < DIM_X; ++x)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+ part_sum.s0 += data;
+ part_sum_sq.s0 += data * data;
+ }
+ }
+// Perform reduction
+#if VEC_SIZE > 8
+ part_sum.s01234567 += part_sum.s89abcdef;
+ part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+ part_sum.s0123 += part_sum.s4567;
+ part_sum_sq.s0123 += part_sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+ part_sum.s01 += part_sum.s23;
+ part_sum_sq.s01 += part_sum_sq.s23;
+#endif // VEC_SIZE > 2
+ part_sum.s0 += part_sum.s1;
+ part_sum_sq.s0 += part_sum_sq.s1;
+
+ sum = (float)part_sum.s0;
+ sum_sq = (float)part_sum_sq.s0;
+
+#endif // defined(NHWC)
+
+ const float mean_float = (sum / elements_plane);
+ const DATA_TYPE mean = (DATA_TYPE)mean_float;
+ const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float);
+#if defined(GAMMA)
+ const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON);
+ const DATA_TYPE multip = (DATA_TYPE)multip_float;
+#else // !defined(GAMMA)
+ const DATA_TYPE multip = (DATA_TYPE)(1.f / sqrt(var_float + EPSILON));
+#endif // defined(GAMMA)
+#if defined(BETA)
+ const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch);
+#else // !defined(BETA)
+ const DATA_TYPE beta = 0;
+#endif // defined(BETA)
+
+#if defined(NHWC)
+
+ for (int i_w = 0; i_w < DIM_Y; ++i_w)
+ {
+ for (int i_h = 0; i_h < DIM_Z; ++i_h)
+ {
+ __global DATA_TYPE *input_address =
+ (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address =
+ (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+#endif /* IN_PLACE */
+ *(output_address) = (*(input_address)-mean) * multip + beta;
+ }
+ }
+
+#else // !defined(NHWC)
+ for (int y = 0; y < DIM_Y; ++y)
+ {
+ int x = 0;
+ for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+ {
+ __global DATA_TYPE *input_address =
+ (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address =
+ (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, input_address);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = (data - mean) * multip + beta;
+ VSTORE(VEC_SIZE)
+ (res, 0, output_address);
+ }
+ // Left-overs loop
+ for (; x < DIM_X; ++x)
+ {
+ __global DATA_TYPE *input_address =
+ (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address =
+ (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+ *(output_address) = (*(input_address)-mean) * multip + beta;
+ }
+ }
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
+ defined(DIM_Y) && defined(DIM_Z) */
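
For reference, the kernel above reduces one H x W plane per (batch, channel) pair to a mean and a variance, then applies (x - mean) * gamma / sqrt(var + EPSILON) + beta. A minimal scalar C sketch of the same math for the NCHW layout, useful when validating the OpenCL output (function and parameter names are illustrative, not part of ARMComputeEx; gamma/beta may be NULL to model the optional tensors):

    #include <math.h>
    #include <stddef.h>

    /* Scalar NCHW instance-normalization reference (illustrative only). */
    static void instance_norm_ref(const float *in, float *out, const float *gamma,
                                  const float *beta, int N, int C, int H, int W, float eps)
    {
        for (int n = 0; n < N; ++n)
            for (int c = 0; c < C; ++c)
            {
                const size_t base = ((size_t)n * C + c) * H * W;
                float sum = 0.f, sum_sq = 0.f;
                for (int i = 0; i < H * W; ++i)
                {
                    const float v = in[base + i];
                    sum += v;
                    sum_sq += v * v;
                }
                const float mean = sum / (H * W);
                const float var = sum_sq / (H * W) - mean * mean;
                const float g = gamma ? gamma[c] : 1.f; /* default gamma = 1 */
                const float b = beta ? beta[c] : 0.f;   /* default beta  = 0 */
                const float multip = g / sqrtf(var + eps);
                for (int i = 0; i < H * W; ++i)
                    out[base + i] = (in[base + i] - mean) * multip + b;
            }
    }
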
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
new file mode 100644
index 000000000..4aa7883c3
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Performs a negation of the input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types:
+ * S16/S32/F16/F32.
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed
+ * per work item (in bytes)
+ * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed
+ * per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination
+ * image
+ *
+ */
+__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
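
Everything this kernel needs is fixed at build time through -D options. As a rough illustration (not how ARMComputeEx wires its kernels, which go through its own kernel-library wrapper), a host could assemble the options for the plain OpenCL API like this:

    #include <stdio.h>
    #include <CL/cl.h>

    /* Hypothetical host-side sketch: build neg_tensor with the -D options documented above. */
    static cl_int build_neg_tensor(cl_program program, cl_device_id device)
    {
        char options[128];
        snprintf(options, sizeof(options), "-DDATA_TYPE=%s -DVEC_SIZE=%d", "float", 16);
        return clBuildProgram(program, 1, &device, options, NULL, NULL);
    }
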
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
new file mode 100644
index 000000000..2074d3ceb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers_asymm.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** Performs a pixelwise multiplication of two quantized (QASYMM8) inputs and requantizes the
+ * result to QASYMM8
+ *
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add the input offset terms to both inputs
+ * -# Multiply the offset-corrected inputs
+ * -# Multiply each entry of the result by result_mult_int and shift right by result_shift
+ * -# Add the output offset term to the result
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using
+ * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and
+ * -DIN2_OFFSET
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and
+ * -DRESULT_SHIFT
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out), const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Perform multiplication of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+ VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
+
+ // Multiply with a multiplier smaller than 1
+ out_val =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+ // TODO: Apply min/max bounds to support fusing with ReLU.
+ /*
+ #if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+ #endif // defined(MIN_BOUND)
+ #if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+ #endif // defined(MAX_BOUND)
+ */
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
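
In effect the kernel computes out = (in1 + IN1_OFFSET) * (in2 + IN2_OFFSET), scales the product by the fixed-point multiplier (RESULT_MULT_INT, RESULT_SHIFT) and adds RESULT_OFFSET before casting back to 8 bits. A scalar C sketch that folds the fixed-point multiplier into a single float (an approximation of ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, for intuition only; the clamp mirrors the documented [0..255] range):

    #include <math.h>
    #include <stdint.h>

    /* Hypothetical scalar model of pixelwise_mul_qasymm8; 'multiplier' stands in for
       RESULT_MULT_INT * 2^-(31 + RESULT_SHIFT). */
    static uint8_t pixelwise_mul_q8_ref(uint8_t in1, uint8_t in2, int in1_offset, int in2_offset,
                                        float multiplier, int result_offset)
    {
        const int32_t prod = ((int32_t)in1 + in1_offset) * ((int32_t)in2 + in2_offset);
        int32_t out = (int32_t)lroundf((float)prod * multiplier) + result_offset;
        if (out < 0)
            out = 0;
        if (out > 255)
            out = 255;
        return (uint8_t)out;
    }
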
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
new file mode 100644
index 000000000..62a8901f6
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data
+ * types : F16/F32
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] alpha_ptr Pointer to the alpha tensor. Supported Data
+ * types : F16/F32
+ * @param[in] alpha_stride_x Stride of the alpha tensor in X dimension (in
+ * bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the alpha tensor in Y dimension (in
+ * bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the alpha tensor in Z dimension (in
+ * bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
+ ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) *
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr)
+ : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
+ 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
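
The element-wise rule is simple enough to state as a scalar reference (a sketch; the name is illustrative):

    /* Scalar PReLU reference matching the select in the kernel above. */
    static float prelu_ref(float x, float alpha) { return x < 0.f ? alpha * x : x; }
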
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
new file mode 100644
index 000000000..5e0abd585
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) ((x) - (y))
+
+#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \
+ defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define SELECT_TYPE VEC_INT
+
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g.
+ * -DDATA_TYPE_IN=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note Can only take uchar data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data
+ * types : QASYMM8
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] alpha_ptr Pointer to the alpha tensor. Supported Data
+ * types : QASYMM8
+ * @param[in] alpha_stride_x Stride of the alpha tensor in X dimension (in
+ * bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the alpha tensor in Y dimension (in
+ * bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the alpha tensor in Z dimension (in
+ * bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+ VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+
+ in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN));
+ alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA));
+
+ const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN);
+ const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA);
+ const VEC_FLOAT outf32 =
+ select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE));
+ const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+ const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global uchar *)output.ptr);
+}
+
+#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) &&
+ // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
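
The quantized variant follows the usual dequantize, apply, requantize pattern. A scalar C sketch under the same OFF_*/SCALE_* conventions as the kernel (names are illustrative, and the saturation that CONVERT_SAT provides is written out explicitly):

    #include <math.h>
    #include <stdint.h>

    /* Hypothetical scalar model of prelu_qasymm8. */
    static uint8_t prelu_q8_ref(uint8_t in, uint8_t alpha, int off_in, int off_alpha, int off_out,
                                float scale_in, float scale_alpha, float scale_out)
    {
        const float x = ((int32_t)in - off_in) * scale_in;          /* dequantize input */
        const float a = ((int32_t)alpha - off_alpha) * scale_alpha; /* dequantize alpha */
        const float y = x < 0.f ? x * a : x;                        /* PReLU in float   */
        int32_t q = (int32_t)lroundf(y / scale_out) + off_out;      /* requantize       */
        if (q < 0)
            q = 0;
        if (q > 255)
            q = 255;
        return (uint8_t)q;
    }
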
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
new file mode 100644
index 000000000..d7ea2e2c4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform reduce max/min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
 * @attention Operation type (code) specifying which operation to perform should be passed as a
 * preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ * @param[in] axis Axis along which the reduction is performed
+ * @param[in] dim Size of the input dimension along the reduction axis.
+ */
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ const int axis, const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] = {
+ get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE value =
+ *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ for (int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+#if OP_CODE == 1 // REDUCE_MAX
+ value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+
+#elif OP_CODE == 2 // REDUCE_MIN
+ value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+
+#else // OP NOT SUPPORTED
+ return;
+
+#endif
+ }
+
+ *((__global DATA_TYPE *)out.ptr) = value;
+}
+
+/** Perform reduce sum/mean
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
 * @attention Operation type (code) specifying which operation to perform should be passed as a
 * preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ * @param[in] axis Axis along which the reduction is performed
+ * @param[in] dim Size of the input dimension along the reduction axis.
+ */
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ const int axis, const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] = {
+ get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE sum_value = (DATA_TYPE)0;
+ for (int i = 0; i < dim; ++i)
+ {
+ indices[axis] = i;
+ sum_value += *(
+ (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ }
+
+#if OP_CODE == 3 // REDUCE_SUM
+ *((__global DATA_TYPE *)out.ptr) = sum_value;
+
+#elif OP_CODE == 4 // REDUCE_MEAN
+ *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+
+#else // OP NOT SUPPORTED
+ return;
+
+#endif
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
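
Both kernels follow the same pattern: each work item owns one output element, fixes the three non-reduced coordinates and walks dim steps along axis, accumulating according to OP_CODE (1 = max, 2 = min, 3 = sum, 4 = mean). A host-side C sketch of that traversal for a dense 4D array with the X dimension fastest (illustrative only, not the library's reference implementation):

    /* Hypothetical reference: reduce one output element of a dense 4D tensor along 'axis'.
       dims[4] = {X, Y, Z, W} sizes; idx[] holds the fixed coordinates of the output element
       (its entry at 'axis' is overwritten); op_code mirrors -DOP_CODE. */
    static float reduce_one_ref(const float *in, const int dims[4], int idx[4], int axis, int op_code)
    {
        const int sy = dims[0], sz = dims[0] * dims[1], sw = dims[0] * dims[1] * dims[2];
        idx[axis] = 0;
        float acc = in[idx[0] + idx[1] * sy + idx[2] * sz + idx[3] * sw];
        if (op_code == 3 || op_code == 4) /* REDUCE_SUM / REDUCE_MEAN accumulate from zero */
            acc = 0.f;
        for (int i = (op_code == 1 || op_code == 2) ? 1 : 0; i < dims[axis]; ++i)
        {
            idx[axis] = i;
            const float v = in[idx[0] + idx[1] * sy + idx[2] * sz + idx[3] * sw];
            if (op_code == 1)
                acc = acc > v ? acc : v; /* REDUCE_MAX */
            else if (op_code == 2)
                acc = acc < v ? acc : v; /* REDUCE_MIN */
            else
                acc += v;                /* REDUCE_SUM / REDUCE_MEAN */
        }
        return (op_code == 4) ? acc / dims[axis] : acc;
    }
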
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 000000000..7367da7fb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \
+ defined(WIDTH_IN) && defined(ZERO_VALUE)
+/** Performs space-to-batch on a 4D input tensor in NCHW format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size.
+ * e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ * e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size.
+ * e.g. -DWIDTH_IN=16
+ * @attention The pad value should be given as a preprocessor argument using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along
+ * X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along
+ * Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along
+ * Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W
+ * dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along
+ * W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the
+ * source tensor
+ * @param[out] output_ptr Pointer to the destination tensor.
+ * Supported data types: same as @p
+ * input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements
+ * along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] block_size_ptr Pointer to the block size tensor. Supported
+ * data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the
+ * block size tensor
+ * @param[in] padding_size_ptr Pointer to the padding size tensor. Supported
+ * data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of
+ * elements along X processed per workitem
+ * (in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of
+ * elements along Y processed per workitem
+ * (in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the
+ * padding size tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(block_size),
+ IMAGE_DECLARATION(padding_size))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int block_size_x = *((__global int *)(block_size_ptr));
+ int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+ int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+ int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+ int in_index[4] = {
+ 0,
+ };
+ in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+ in_index[1] = get_global_id(1) * block_size_y + shift_y -
+ *((__global int *)(padding_size_ptr + padding_size_stride_y));
+ in_index[2] = get_global_id(2) % DEPTH_OUT;
+ in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+ if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+ {
+ *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+ }
+ else
+ {
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+ &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+ }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) &&
+ // defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \
+ defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Performs space-to-batch on a 4D input tensor in NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using
+ * -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size.
+ * e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ * e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size.
+ * e.g. -DWIDTH_IN=16
+ * @attention The pad value should be given as a preprocessor argument using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along
+ * X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along
+ * Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along
+ * Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W
+ * dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along
+ * W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the
+ * source tensor
+ * @param[out] output_ptr Pointer to the destination tensor.
+ * Supported data types: same as @p
+ * input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements
+ * along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] block_size_ptr Pointer to the block size tensor. Supported
+ * data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the
+ * block size tensor
+ * @param[in] padding_size_ptr Pointer to the padding size tensor. Supported
+ * data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of
+ * elements along X processed per workitem
+ * (in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of
+ * elements along Y processed per workitem
+ * (in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the
+ * padding size tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(block_size),
+ IMAGE_DECLARATION(padding_size))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+ int block_size_x = *((__global int *)(block_size_ptr));
+ int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+ int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+ int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+ int in_index[4] = {
+ 0,
+ };
+ in_index[0] = get_global_id(0) * VEC_SIZE;
+ in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+ in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y -
+ *((__global int *)(padding_size_ptr + padding_size_stride_y));
+ in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+ if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+ {
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+ }
+ else
+ {
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1],
+ in_index[2], in_index[3])),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)out.ptr);
+ }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) &&
+ // defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
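
Both variants reduce to the same coordinate arithmetic: the block position encoded in the output batch index is turned into a (shift_x, shift_y) offset, the padding is subtracted, and anything that falls outside the input becomes ZERO_VALUE. A C sketch of the NCHW output-to-input mapping (illustrative names; pad_left/pad_top correspond to the two values read from padding_size):

    /* Hypothetical sketch of the space-to-batch output->input mapping for NCHW.
       Returns 1 if the output element maps to a real input element, 0 if it is padding. */
    static int s2b_map_nchw(int w_out, int h_out, int b_out, int batch_in, int block_x, int block_y,
                            int pad_left, int pad_top, int width_in, int height_in,
                            int *w_in, int *h_in, int *b_in)
    {
        const int shift_x = (b_out / batch_in) % block_x; /* column inside the block */
        const int shift_y = (b_out / batch_in) / block_x; /* row inside the block    */
        *w_in = w_out * block_x + shift_x - pad_left;
        *h_in = h_out * block_y + shift_y - pad_top;
        *b_in = b_out % batch_in;
        return *w_in >= 0 && *w_in < width_in && *h_in >= 0 && *h_in < height_in;
    }
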
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
new file mode 100644
index 000000000..a26e762e8
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+/** Performs space-to-depth rearrangement of a tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+ * e.g. -DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+ * argument using -DZ_IN=size. e.g. -DZ_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ * -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ int out_index[4] = {0};
+ int in_index[4] = {0};
+
+ in_index[0] = get_global_id(0); // W
+ in_index[1] = get_global_id(1); // H
+ in_index[2] = get_global_id(2) % Z_IN; // C
+ in_index[3] = get_global_id(2) / Z_IN; // B
+
+ out_index[0] = in_index[0] / BLOCK_SIZE;
+ out_index[1] = in_index[1] / BLOCK_SIZE;
+ out_index[2] =
+ in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+ out_index[3] = in_index[3];
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
+ out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+/** Performs space-to-depth rearrangement of a tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+ * e.g. -DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+ * argument using -DZ_IN=size. e.g. -DZ_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ * -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ int out_index[4] = {0};
+ int in_index[4] = {0};
+
+ in_index[0] = get_global_id(0); // C
+ in_index[1] = get_global_id(1); // W
+ in_index[2] = get_global_id(2) % Z_IN; // H
+ in_index[3] = get_global_id(2) / Z_IN; // B
+
+ out_index[0] =
+ in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
+ out_index[1] = in_index[1] / BLOCK_SIZE;
+ out_index[2] = in_index[2] / BLOCK_SIZE;
+ out_index[3] = in_index[3];
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
+ out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
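
The index arithmetic is the same for both layouts: spatial coordinates are divided by the block size and the position inside the block selects one of BLOCK_SIZE * BLOCK_SIZE channel groups. A C sketch of the NCHW input-to-output mapping (illustrative only):

    /* Hypothetical sketch of the space-to-depth input->output mapping for NCHW. */
    static void s2d_map_nchw(int w_in, int h_in, int c_in, int block, int depth_in,
                             int *w_out, int *h_out, int *c_out)
    {
        *w_out = w_in / block;
        *h_out = h_in / block;
        /* the (h_in % block, w_in % block) position inside the block selects the channel group */
        *c_out = c_in + ((h_in % block) * block + w_in % block) * depth_in;
    }
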
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
new file mode 100644
index 000000000..50472e4f9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
+ __global int *in_ind_buf, const int n)
+{
+ int gid = get_global_id(0);
+ int lws = get_local_size(0);
+ int groups = get_num_groups(0);
+ int gws = lws * groups;
+ int iter = n / gws;
+
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+ for (int i = 0; i < iter; ++i)
+ {
+ int idx = i * gws + gid;
+ in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
+ in_ind_buf[idx] = idx;
+ }
+}
+
+__kernel void topkv2_find_first_negative(__global float *out_key_buf,
+ __global int *first_negative_idx, int n)
+{
+ int gid = get_global_id(0);
+
+ if (gid == n - 1)
+ {
+ // if the last item is positive, the first negative index is n.
+ if (out_key_buf[gid] > 0.f)
+ *first_negative_idx = n;
+ }
+ else if (gid == 0)
+ {
+ // if the first item is negative, the first negative index is 0.
+ if (out_key_buf[gid] < 0.f)
+ *first_negative_idx = 0;
+ }
+ else
+ {
+ // if its left neighbour is positive and it is negative, it is the first negative item.
+ if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f)
+ *first_negative_idx = gid;
+ }
+}
+
+__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
+ __global int *in_ind_buf, __global int *out_ind_buf,
+ __global int *first_negative_idx, int n)
+{
+ int gid = get_global_id(0);
+
+ int num_negs = n - *first_negative_idx;
+ int in_idx;
+
+ if (gid < num_negs)
+ {
+ in_idx = n - 1 - gid;
+ }
+ else
+ {
+ in_idx = gid - num_negs;
+ }
+
+ out_key_buf[gid] = in_key_buf[in_idx];
+ out_ind_buf[gid] = in_ind_buf[in_idx];
+}
+
+__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
+ __global float *out_key_buf, __global int *out_ind_buf, int n)
+{
+ int gid = get_global_id(0);
+
+ Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
+ Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+
+ int idx = n - 1 - gid;
+
+ *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+ *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+}
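
These kernels are the glue around the sorts in topkv2_quicksort.cl and topkv2_radixsort.cl: topkv2_init snapshots the keys and their indices, the radix sort orders the keys by their raw unsigned bit patterns (so positives end up ascending at the front and negatives at the back in reversed order), topkv2_find_first_negative and topkv2_reorder_negatives turn that into a fully ascending array, and topkv2_store walks it from the back to emit the k largest entries. A host-side C sketch of the reorder permutation (illustrative only):

    /* Hypothetical sketch of topkv2_reorder_negatives: move the reversed block of negative
       keys from the back of the bit-pattern-sorted array to the front, restoring a true
       ascending float order. */
    static void reorder_negatives_ref(const float *in, float *out, int first_negative_idx, int n)
    {
        const int num_negs = n - first_negative_idx;
        for (int i = 0; i < n; ++i)
        {
            const int src = (i < num_negs) ? (n - 1 - i) : (i - num_negs);
            out[i] = in[src];
        }
    }
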
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
new file mode 100644
index 000000000..9594daf19
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+__global inline float *get_vec_elem(Vector *vec, int idx)
+{
+ return (__global float *)(vec->ptr + idx * vec->stride_x);
+}
+
+__global inline int *get_vec_elem_int(Vector *vec, int idx)
+{
+ return (__global int *)(vec->ptr + idx * vec->stride_x);
+}
+
+// A utility function to swap two elements
+void swap(__global float *a, __global float *b)
+{
+ float t = *a;
+ *a = *b;
+ *b = t;
+}
+
+void swap_idx(__global int *a, __global int *b)
+{
+ int t = *a;
+ *a = *b;
+ *b = t;
+}
+
+/* This function is the same in both the iterative and recursive versions */
+int partition(Vector *arr, __global int *indices, int l, int h)
+{
+ float x = *get_vec_elem(arr, h);
+ int i = (l - 1);
+
+ for (int j = l; j <= h - 1; j++)
+ {
+ if (*get_vec_elem(arr, j) >= x)
+ {
+ i++;
+ swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
+ swap_idx(&indices[i], &indices[j]);
+ }
+ }
+ swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
+ swap_idx(&indices[i + 1], &indices[h]);
+ return (i + 1);
+}
+
+/* A[] --> Array to be sorted,
+ l --> Starting index,
+ h --> Ending index */
+void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h)
+{
+ // Create an auxiliary stack
+
+ // initialize top of stack
+ int top = -1;
+
+ // push initial values of l and h to stack
+ stack[++top] = l;
+ stack[++top] = h;
+
+ // Keep popping from the stack while it is not empty
+ while (top >= 0)
+ {
+ // Pop h and l
+ h = stack[top--];
+ l = stack[top--];
+
+ // Set pivot element at its correct position
+ // in sorted array
+ int p = partition(arr, indices, l, h);
+
+ // If there are elements on left side of pivot,
+ // then push left side to stack
+ if (p - 1 > l)
+ {
+ stack[++top] = l;
+ stack[++top] = p - 1;
+ }
+
+ // If there are elements on right side of pivot,
+ // then push right side to stack
+ if (p + 1 < h)
+ {
+ stack[++top] = p + 1;
+ stack[++top] = h;
+ }
+ }
+}
+
+__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values),
+ VECTOR_DECLARATION(topk_indices), __global int *indices,
+ __global int *temp_stack, int k, int n)
+{
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+ Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
+ Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
+
+ for (int i = 0; i < n; ++i)
+ {
+ indices[i] = i;
+ }
+
+ quickSortIterative(&input, indices, temp_stack, 0, n - 1);
+
+ // extract k items.
+ for (int i = 0; i < k; ++i)
+ {
+ *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
+ *get_vec_elem_int(&topk_indices, i) = indices[i];
+ }
+}
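
The quicksort variant sorts the whole input in descending order (the partition keeps elements greater than or equal to the pivot on the left) and then copies the first k entries. A rough host-side equivalent using std::partial_sort over an index array, for reference only:

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

void topk_reference(const std::vector<float> &input, int k, std::vector<float> &topk_values,
                    std::vector<int> &topk_indices)
{
  std::vector<int> indices(input.size());
  std::iota(indices.begin(), indices.end(), 0); // mirrors "indices[i] = i;"

  // Descending order, matching the ">= pivot" partition used by the kernel.
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int a, int b) { return input[a] > input[b]; });

  topk_values.resize(k);
  topk_indices.resize(k);
  for (int i = 0; i < k; ++i)
  {
    topk_indices[i] = indices[i];
    topk_values[i] = input[indices[i]];
  }
}
```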
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
new file mode 100644
index 000000000..f6830d229
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software useful you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
+
+// compute the histogram for each radix and each virtual processor for the pass
+__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
+ const int pass, __local int *loc_histo, const int n)
+{
+ int it = get_local_id(0); // i local number of the processor
+ int ig = get_global_id(0); // global number = i + g I
+
+ int gr = get_group_id(0); // g group number
+
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ // set the local histograms to zero
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ loc_histo[ir * items + it] = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // range of keys that are analyzed by the work item
+ int size = n / groups / items; // size of the sub-list
+ int start = ig * size; // beginning of the sub-list
+
+ unsigned int key;
+ int shortkey, k;
+
+ // compute the index
+ // the computation depends on the transposition
+ for (int j = 0; j < size; j++)
+ {
+#ifdef TRANSPOSE
+ k = groups * items * j + ig;
+#else
+ k = j + start;
+#endif
+
+ key = *((__global unsigned int *)(in_key_buf + k));
+
+ // extract the group of _BITS bits of the pass
+ // the result is in the range 0.._RADIX-1
+ shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+ // increment the local histogram
+ loc_histo[shortkey * items + it]++;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // copy the local histogram to the global one
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// initial transpose of the list to improve
+// coalesced memory access
+__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
+ const int nbrow, const __global int *inperm, __global int *outperm,
+ __local int *blockmat, __local int *blockperm, const int tilesize)
+{
+
+ int i0 = get_global_id(0) * tilesize; // first row index
+ int j = get_global_id(1); // column index
+
+ int jloc = get_local_id(1); // local column index
+
+ // fill the cache
+ for (int iloc = 0; iloc < tilesize; iloc++)
+ {
+ int k = (i0 + iloc) * nbcol + j; // position in the matrix
+ blockmat[iloc * tilesize + jloc] = invect[k];
+#ifdef PERMUT
+ blockperm[iloc * tilesize + jloc] = inperm[k];
+#endif
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // first row index in the transpose
+ int j0 = get_group_id(1) * tilesize;
+
+ // write the cached tile back to its transposed location
+ for (int iloc = 0; iloc < tilesize; iloc++)
+ {
+ int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
+ outvect[kt] = blockmat[jloc * tilesize + iloc];
+#ifdef PERMUT
+ outperm[kt] = blockperm[jloc * tilesize + iloc];
+#endif
+ }
+}
+
+// each virtual processor reorders its data using the scanned histogram
+__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
+ __global int *d_Histograms, const int pass,
+ __global int *indices_in, __global int *indices_out,
+ __local int *loc_histo, const int n)
+{
+
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+
+ int gr = get_group_id(0);
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ int start = ig * (n / groups / items);
+ int size = n / groups / items;
+
+ // load the histogram into the local cache
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int newpos, shortkey, k, newpost;
+ unsigned int key;
+
+ for (int j = 0; j < size; j++)
+ {
+#ifdef TRANSPOSE
+ k = groups * items * j + ig;
+#else
+ k = j + start;
+#endif
+ float org_value = in_key[k];
+ key = *(__global unsigned int *)(in_key + k);
+ shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+ newpos = loc_histo[shortkey * items + it];
+
+#ifdef TRANSPOSE
+ int ignew, jnew;
+ ignew = newpos / (n / groups / items);
+ jnew = newpos % (n / groups / items);
+ newpost = jnew * (groups * items) + ignew;
+#else
+ newpost = newpos;
+#endif
+
+ // d_outKeys[newpost]= key; // killing line !!!
+ out_key[newpost] = org_value;
+
+#ifdef PERMUT
+ indices_out[newpost] = indices_in[k];
+#endif
+
+ newpos++;
+ loc_histo[shortkey * items + it] = newpos;
+ }
+}
+
+// perform a parallel prefix sum (a scan) on the local histograms
+// (see Blelloch 1990); each work item handles two elements
+// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
+ __global int *globsum)
+{
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+ int decale = 1;
+ int n = get_local_size(0) * 2;
+ int gr = get_group_id(0);
+
+ // load input into local memory
+ // up sweep phase
+ temp[2 * it] = histo[2 * ig];
+ temp[2 * it + 1] = histo[2 * ig + 1];
+
+ // parallel prefix sum (algorithm of Blelloch 1990)
+ for (int d = n >> 1; d > 0; d >>= 1)
+ {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (it < d)
+ {
+ int ai = decale * (2 * it + 1) - 1;
+ int bi = decale * (2 * it + 2) - 1;
+ temp[bi] += temp[ai];
+ }
+ decale *= 2;
+ }
+
+ // store the last element in the global sum vector
+ // (may be used in the next step to construct the global scan)
+ // clear the last element
+ if (it == 0)
+ {
+ globsum[gr] = temp[n - 1];
+ temp[n - 1] = 0;
+ }
+
+ // down sweep phase
+ for (int d = 1; d < n; d *= 2)
+ {
+ decale >>= 1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (it < d)
+ {
+ int ai = decale * (2 * it + 1) - 1;
+ int bi = decale * (2 * it + 2) - 1;
+
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // write results to device memory
+
+ histo[2 * ig] = temp[2 * it];
+ histo[2 * ig + 1] = temp[2 * it + 1];
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// use the global sum for updating the local histograms
+// each work item updates two values
+__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
+{
+ int ig = get_global_id(0);
+ int gr = get_group_id(0);
+
+ int s;
+
+ s = globsum[gr];
+
+ // write results to device memory
+ histo[2 * ig] += s;
+ histo[2 * ig + 1] += s;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
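
radixsort_scanhistograms is the classic Blelloch work-efficient exclusive scan: an up-sweep builds partial sums in a binary tree, the root (the total) is saved and cleared, and a down-sweep propagates the prefixes back. A sequential C++ sketch of the same indexing, assuming n is a power of two as it is for the histogram layout:

```cpp
#include <cstddef>
#include <vector>

void blelloch_exclusive_scan(std::vector<int> &temp, int &total)
{
  const std::size_t n = temp.size();

  // Up-sweep (reduce) phase: build partial sums in a binary tree.
  std::size_t decale = 1;
  for (std::size_t d = n >> 1; d > 0; d >>= 1)
  {
    for (std::size_t i = 0; i < d; ++i)
    {
      const std::size_t ai = decale * (2 * i + 1) - 1;
      const std::size_t bi = decale * (2 * i + 2) - 1;
      temp[bi] += temp[ai];
    }
    decale *= 2;
  }

  // The last element now holds the total; save it and clear it (exclusive scan).
  total = temp[n - 1];
  temp[n - 1] = 0;

  // Down-sweep phase: propagate the prefixes back down the tree.
  for (std::size_t d = 1; d < n; d *= 2)
  {
    decale >>= 1;
    for (std::size_t i = 0; i < d; ++i)
    {
      const std::size_t ai = decale * (2 * i + 1) - 1;
      const std::size_t bi = decale * (2 * i + 2) - 1;
      const int t = temp[ai];
      temp[ai] = temp[bi];
      temp[bi] += t;
    }
  }
}
```

For example, scanning {1, 2, 3, 4} leaves {0, 1, 3, 6} in place and sets total to 10, exactly the values the kernel writes back to the histogram and the global sum vector.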
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
new file mode 100644
index 000000000..7f4b5b0df
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(axis, 1);
+
+ return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ArgOperation /*op*/)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
+ DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
+ output->tensor_shape().num_dimensions(),
+ "Output's rank must be one less than input's rank");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+ "Output's total size does not match the inferred output shape");
+
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
+ return Status{};
+}
+
+} // namespace
+
+CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+ ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+ // Construct the kernel, choosing op_code from the requested ArgOperation
+ std::string kernel_name = "arg_op";
+ int op_code = 0;
+ if (op == ArgOperation::MAX)
+ {
+ op_code = 1;
+ }
+ else if (op == ArgOperation::MIN)
+ {
+ op_code = 2;
+ }
+ else
+ throw std::runtime_error("Operation not supported, yet");
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+ build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t axis, ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+ return Status{};
+}
+
+void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ _kernel.setArg<cl_int>(idx++, _axis);
+ _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Save the output's shape so it can be restored at the end of this method
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+ // Restore the output tensor's original shape
+ _output->info()->set_tensor_shape(shape_out);
+}
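
inferOutputShape keeps the reduced axis with size 1, so its total element count matches an output whose rank is one less than the input's, which is what validate_arguments checks. A small sketch of that rule, with std::vector standing in for arm_compute::TensorShape:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::size_t> infer_arg_op_shape(std::vector<std::size_t> input_shape, uint32_t axis)
{
  assert(axis < input_shape.size());
  input_shape[axis] = 1; // mirrors out_shape.set(axis, 1)
  return input_shape;
}

// Example: an arg-max over axis 1 of a (2, 3, 4) tensor gives (2, 1, 4) -- 8
// elements, the same total as a rank-2 output of shape (2, 4).
```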
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..c14e73634
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, BinaryLogicalOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "binary_logical_op";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+ int op_code = 0;
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ op_code = 1;
+ break;
+ case BinaryLogicalOperation::OR:
+ op_code = 2;
+ break;
+ default:
+ throw std::runtime_error("Operation not supported, yet");
+ }
+
+ build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
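
The run() method only collapses dimensions at or above Window::DimZ when neither input broadcasts there. A sketch of that collapse test, with plain vectors standing in for TensorShape (which pads missing dimensions with 1):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

bool can_collapse_above_z(const std::vector<std::size_t> &in1, const std::vector<std::size_t> &in2,
                          const std::vector<std::size_t> &out)
{
  const std::size_t dim_z = 2; // Window::DimZ

  auto total = [](const std::vector<std::size_t> &s) {
    std::size_t t = 1;
    for (std::size_t d : s)
      t *= d;
    return t;
  };
  auto dim = [](const std::vector<std::size_t> &s, std::size_t d) {
    return d < s.size() ? s[d] : std::size_t{1}; // TensorShape pads missing dims with 1
  };

  // A single-element input broadcasts everywhere, so collapsing is always safe.
  if (std::min(total(in1), total(in2)) <= 1)
    return true;

  // Otherwise both inputs must extend beyond Z and agree on every higher dimension.
  bool can_collapse = std::min(in1.size(), in2.size()) > dim_z;
  for (std::size_t d = dim_z; can_collapse && d < out.size(); ++d)
    can_collapse = (dim(in1, d) == dim(in2, d));
  return can_collapse;
}
```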
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
new file mode 100644
index 000000000..35f607bd0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set kernel build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" +
+ get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ // Create kernel
+ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ const float scale_in = input->info()->quantization_info().scale;
+ const int offset_in = input->info()->quantization_info().offset;
+ build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options()));
+ }
+ else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
+ {
+ const float scale_in = output->info()->quantization_info().scale;
+ const int offset_in = output->info()->quantization_info().offset;
+ build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options()));
+ }
+ else
+ {
+ build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT");
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast", build_opts.options()));
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
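
For the quantized paths, SCALE and OFFSET are taken from the tensor's QuantizationInfo. The cast_qasymm_in/cast_qasymm_out kernel bodies are not part of this hunk, but the options correspond to the usual asymmetric mapping, sketched here for reference:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// cast_qasymm_in direction: QASYMM8 value to real number.
float dequantize_qasymm8(uint8_t q, float scale, int offset)
{
  return scale * (static_cast<int>(q) - offset);
}

// cast_qasymm_out direction: real number to QASYMM8, clamped to [0, 255].
uint8_t quantize_qasymm8(float value, float scale, int offset)
{
  const int q = static_cast<int>(std::lround(value / scale)) + offset;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}
```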
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..2a3433c2b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// TODO Use this validation function
+#if 0
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const int32_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+ "Block size should be greater than or equal to 1.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+ "Output width should be equal to (Input width * block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+ "Output height should be equal to (Input height * block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+ "Input depth should be divisible by (block size * block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ output->dimension(2) != input->dimension(2) / (block_size * block_size),
+ "Output depth should be equal to (Input depth / (block size * block size))");
+
+ return Status{};
+}
+#endif
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t block_size)
+{
+ // TODO Add validation of data_layout
+ _input = input;
+ _output = output;
+
+ // Set kernel build options
+ auto layout_out = output->info()->data_layout();
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+ auto depth = output->info()->dimension(index_depth);
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
+ build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+ "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
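
The disabled validation above pins down the shape contract of depth-to-space. The same relations as a small stand-alone helper (shapes given as {width, height, depth}; this is a sketch, not part of the ARM Compute API):

```cpp
#include <array>

bool depth_to_space_shapes_ok(const std::array<int, 3> &in, const std::array<int, 3> &out,
                              int block_size)
{
  return block_size >= 1 && out[0] == in[0] * block_size // width grows by block_size
         && out[1] == in[1] * block_size                 // height grows by block_size
         && in[2] % (block_size * block_size) == 0       // depth must be divisible
         && out[2] == in[2] / (block_size * block_size); // depth shrinks by block_size^2
}
```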
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..0862b78bf
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "embedding_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_in);
+ add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
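
The kernel implements the usual embedding-lookup semantics: output slice i along the outermost dimension is a copy of input slice lookups[i]. A reference sketch on a flat row-major buffer, illustrative only:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> embedding_lookup(const std::vector<float> &table, std::size_t row_size,
                                    const std::vector<int32_t> &lookups)
{
  std::vector<float> out(lookups.size() * row_size);
  for (std::size_t i = 0; i < lookups.size(); ++i)
  {
    const std::size_t src = static_cast<std::size_t>(lookups[i]) * row_size;
    std::copy(table.begin() + src, table.begin() + src + row_size, out.begin() + i * row_size);
  }
  return out;
}
```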
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
new file mode 100644
index 000000000..718f615f9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/core/UtilsEx.h"
+
+using namespace arm_compute;
+
+namespace
+{
+
+inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices,
+ ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ std::unique_ptr<ITensorInfo> output_info = input->clone();
+ output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis));
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+
+} // namespace
+
+CLGatherExKernel::CLGatherExKernel()
+ : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+{
+}
+
+void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices,
+ ICLTensor *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), indices->info(), output->info(), axis));
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input = input;
+ _output = output;
+ _indices = indices;
+ _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions()));
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DOUTPUT_DIM_Z=" +
+ support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+ build_opts.add_option("-DINDICES_DIM=" +
+ support::cpp11::to_string(indices->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ indices->clone().get(),
+ output->clone().get(), axis)
+ .first);
+ return Status{};
+}
+
+void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4);
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window_collapsed);
+ add_3D_tensor_argument(idx, _indices, window_collapsed);
+ add_4D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
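
validate_arguments allows a negative axis via wrap_around and expects output rank = input rank + indices rank - 1. The shape rule behind compute_gather_shape_ex, sketched with plain vectors (illustrative, not the library code):

```cpp
#include <cstddef>
#include <vector>

int wrap_axis(int axis, int rank) { return axis < 0 ? axis + rank : axis; }

std::vector<std::size_t> gather_output_shape(const std::vector<std::size_t> &input,
                                             const std::vector<std::size_t> &indices, int axis)
{
  const int actual_axis = wrap_axis(axis, static_cast<int>(input.size()));
  std::vector<std::size_t> out;
  // The axis dimension of the input is replaced by the full shape of the indices tensor.
  out.insert(out.end(), input.begin(), input.begin() + actual_axis);
  out.insert(out.end(), indices.begin(), indices.end());
  out.insert(out.end(), input.begin() + actual_axis + 1, input.end());
  return out; // rank = input rank + indices rank - 1, as the validation above expects
}
```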
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
new file mode 100644
index 000000000..31e98c9a8
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+{
+ // DO NOTHING
+}
+
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output's shape was not set");
+
+ ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+ output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+ _lookups = lookups;
+ _keys = keys;
+ _input = input;
+ _output = output;
+ _hits = hits;
+
+ // Make _lookup_indices tensor
+ _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+ _lookup_indices->allocator()->init(
+ TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+ _lookup_indices->allocator()->allocate();
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "hashtable_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const_cast<ICLTensor *>(_lookups)->map(queue);
+ const_cast<ICLTensor *>(_keys)->map(queue);
+ _hits->map(queue);
+ _lookup_indices->map(queue);
+
+ // Set values of hits
+ const int32_t *lookups_buf =
+ reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+ const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+ uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+ int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
+ std::map<int32_t, size_t> key_map;
+ const size_t keys_num = _keys->info()->dimension(0);
+ for (size_t key_index = 0; key_index < keys_num; key_index++)
+ {
+ key_map[keys_buf[key_index]] = key_index;
+ }
+
+ const size_t lookups_num = _lookups->info()->dimension(0);
+ for (size_t i = 0; i < lookups_num; ++i)
+ {
+ const auto lookup_value = lookups_buf[i];
+ const auto it = key_map.find(lookup_value);
+ if (it != key_map.end())
+ {
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ if (it->second >= lookups_num)
+ ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+ lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+ hits_buf[i] = static_cast<uint8_t>(1);
+ }
+ else
+ {
+ lookup_indices_buf[i] = -1;
+ hits_buf[i] = static_cast<uint8_t>(0);
+ }
+ }
+
+ const_cast<ICLTensor *>(_lookups)->unmap(queue);
+ const_cast<ICLTensor *>(_keys)->unmap(queue);
+ _hits->unmap(queue);
+ _lookup_indices->unmap(queue);
+
+ Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, win);
+ add_4D_tensor_argument(idx, _output, win);
+ add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+ enqueue(queue, *this, win);
+ } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
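
The host-side pre-pass in run() is the part that makes this a hashtable lookup: it maps every lookup key to an input row index (or -1) and a hit flag before the GPU gather is enqueued. The same logic, restated compactly for reference:

```cpp
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

void resolve_lookups(const std::vector<int32_t> &keys, const std::vector<int32_t> &lookups,
                     std::vector<int32_t> &lookup_indices, std::vector<uint8_t> &hits)
{
  // Build key -> position map once.
  std::map<int32_t, std::size_t> key_map;
  for (std::size_t i = 0; i < keys.size(); ++i)
    key_map[keys[i]] = i;

  lookup_indices.assign(lookups.size(), -1);
  hits.assign(lookups.size(), 0);
  for (std::size_t i = 0; i < lookups.size(); ++i)
  {
    const auto it = key_map.find(lookups[i]);
    if (it != key_map.end())
    {
      lookup_indices[i] = static_cast<int32_t>(it->second);
      hits[i] = 1;
    }
  }
}
```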
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
new file mode 100644
index 000000000..5db414f62
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_UNUSED(gamma);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+
+ if (output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // We handle the planes manually
+ Window win = calculate_max_window(*input, Steps(1));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+
+ // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be
+ // skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
+ _run_in_place(false)
+{
+}
+
+void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output,
+ ICLTensor *gamma, ICLTensor *beta,
+ float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _input = input;
+ _output = output == nullptr ? input : output;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+ _run_in_place = (output == nullptr) || (output == input);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
+ gamma ? gamma->info() : nullptr,
+ beta ? beta->info() : nullptr, epsilon));
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
+ build_opts.add_option_if(gamma, "-DGAMMA");
+ build_opts.add_option_if(beta, "-DBETA");
+ build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+ build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ ICLKernel::configure_internal(std::get<1>(win_config));
+}
+
+Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *gamma,
+ const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ return Status{};
+}
+
+void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window collapsed_window = window.collapse(window, Window::DimZ);
+
+ // We will process the planes together
+ if (_input->info()->data_layout() == DataLayout::NCHW)
+ {
+ collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ }
+ else
+ {
+ collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1));
+ }
+
+ Window vec_window;
+ vec_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, collapsed_window);
+ if (!_run_in_place)
+ {
+ add_4D_tensor_argument(idx, _output, collapsed_window);
+ }
+ if (_gamma)
+ {
+ add_1D_tensor_argument(idx, _gamma, vec_window);
+ }
+ if (_beta)
+ {
+ add_1D_tensor_argument(idx, _beta, vec_window);
+ }
+
+ enqueue(queue, *this, collapsed_window, lws_hint());
+}
+} // namespace arm_compute
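
The gamma, beta and epsilon arguments follow the standard instance-normalization definition: each (sample, channel) plane is normalized over its spatial extent. The instance_normalization_ex kernel body is not shown in this hunk; a reference sketch of the formula those arguments correspond to:

```cpp
#include <cmath>
#include <vector>

// Normalize one H*W plane in place: y = gamma * (x - mean) / sqrt(var + epsilon) + beta.
void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
  float mean = 0.f;
  for (float v : plane)
    mean += v;
  mean /= static_cast<float>(plane.size());

  float var = 0.f;
  for (float v : plane)
    var += (v - mean) * (v - mean);
  var /= static_cast<float>(plane.size());

  const float inv_stddev = 1.f / std::sqrt(var + epsilon);
  for (float &v : plane)
    v = gamma * (v - mean) * inv_stddev + beta;
}
```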
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
new file mode 100644
index 000000000..ecfe05a51
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ return Status{};
+}
+
+} // namespace
+
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
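+ // Update the window and request padding so each work item can safely load/store a full
+ // 16-element vector along X.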
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
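+ // Collapse every dimension above Z and enqueue the kernel once per 3D slice.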
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
new file mode 100644
index 000000000..e7d587029
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+
+void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
+
+ _input = input;
+ _alpha = alpha;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "prelu";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
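+ // For QASYMM8, bake the quantization offsets and scales of input, alpha and output into the
+ // build options so the kernel can dequantize, apply PReLU and requantize.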
+ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ build_opts.emplace("-DOFF_IN=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ build_opts.emplace("-DOFF_ALPHA=" +
+ support::cpp11::to_string(alpha->info()->quantization_info().offset));
+ build_opts.emplace("-DOFF_OUT=" +
+ support::cpp11::to_string(output->info()->quantization_info().offset));
+ build_opts.emplace("-DSCALE_IN=" +
+ support::cpp11::to_string(input->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_ALPHA=" +
+ support::cpp11::to_string(alpha->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_OUT=" +
+ support::cpp11::to_string(output->info()->quantization_info().scale));
+ kernel_name += "_qasymm8";
+ }
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), out_shape);
+
+ if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if (input->info()->data_type() == DataType::F32 ||
+ alpha->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
+
+ AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input->info()->tensor_shape();
+ const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
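+ // Dimensions above Z can only be collapsed when both inputs agree on them; otherwise the
+ // broadcast slices would lose their per-dimension offsets.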
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_input1);
+ add_3D_tensor_argument(idx, _alpha, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
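+ // When one input is broadcast along X, the output is wider than that input, so up to
+ // VEC_SIZE - 1 elements of right border are reported for padding.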
+BorderSize CLPReLUKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..24e89db28
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+namespace
+{
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+// are the same.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(axis, 1);
+
+ return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32, DataType::S32);
+ if (op == ReduceOperation::SUM)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+ "QASYMM8 is not supported yet");
+ }
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output shape must not be empty");
+
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than the input's rank.");
+
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+ "output shape's total size does not match the shape inferred from axis");
+
+ return Status{};
+}
+} // namespace
+
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const uint32_t axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+ // Construct kernel name
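+ // op_code is passed as -DOP_CODE so a single CL kernel covers two operations each:
+ // reduce_min_max handles max(1)/min(2), reduce_sum_mean handles sum(3)/mean(4).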
+ std::string kernel_name;
+ int op_code = 0;
+ if (op == ReduceOperation::MAX)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 1;
+ }
+ else if (op == ReduceOperation::MIN)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 2;
+ }
+ else if (op == ReduceOperation::SUM)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 3;
+ }
+ else if (op == ReduceOperation::MEAN)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 4;
+ }
+ else
+ throw std::runtime_error("Operation not supported yet");
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+ build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+ return Status{};
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ _kernel.setArg<cl_int>(idx++, _axis);
+ _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+ // Support dimensions up to 4
+ Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Copy output's shape in order to use for recovering at end of this method
+ // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
+ // of input and output are the same
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+ idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
+
+ // Recover output's shape of output tensor
+ _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644
index 000000000..f7836b6cd
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+ const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+ "The number of dimensions of input should be equal to that of output");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+ "The input and output layouts are different!");
+
+ // TODO Support other cases
+ if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+ "Input Depth should be equal to Output Depth");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+ padding_size->dimension(1) != 2,
+ "Only 2-dimensional spatial block's size was wrong");
+ }
+ else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+ "Input Depth should be equal to Output Depth");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+ padding_size->dimension(1) != 2,
+ "Only 2-dimensional spatial block's size was wrong");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4,
+ "CLSpaceToBatchNDKernel supports only 2 to 4 dimensions");
+
+ if (input->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+ "The input and output quantization info are different!");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel()
+{
+ // DO NOTHING
+}
+
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+ _input = input;
+ _block_size = block_size;
+ _padding_size = padding_size;
+ _output = output;
+
+ // Set kernel build options
+ // TODO Support other cases
+ std::string kernel_name = "space_to_batch_4d";
+ std::set<std::string> build_opts;
+ Window win;
+
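+ // The NCHW variant processes one element per work item, while the NHWC variant vectorizes
+ // along channels, so the window (and padding) is configured per layout.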
+ if (input->info()->data_layout() == DataLayout::NCHW)
+ {
+ kernel_name += "_nchw";
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+ win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+ }
+ else if (input->info()->data_layout() == DataLayout::NHWC)
+ {
+ kernel_name += "_nhwc";
+ build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+
+ win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->info()->valid_region());
+
+ if (window_changed)
+ {
+ ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported layout");
+ }
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+ if (input->info()->data_type() == DataType::QASYMM8)
+ {
+ build_opts.emplace("-DZERO_VALUE=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ }
+ else
+ {
+ build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ }
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
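+ // Debug-only sanity checks: map the block size and padding tensors to the host and verify
+ // that the output shape matches the values they contain.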
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ const_cast<ICLTensor *>(_block_size)->map(queue);
+ const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+ const size_t num_dimensions = _input->info()->num_dimensions();
+ const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
+ uint32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+ for (size_t i = 0; i < num_spacial_dimensions; ++i)
+ {
+ const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+ const int32_t padding_size_pre =
+ *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+ const int32_t padding_size_post =
+ *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+ ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+ ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 || padding_size_post < 0,
+ "Padding size should be greater than or equal to 0");
+
+ if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _output->info()->dimension(i) !=
+ (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
+ "Dimension value of spatial block does not match output's dimension value");
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
+ (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
+ padding_size_pre + padding_size_post) /
+ block_size,
+ "Dimension value of spatial block does not match output's dimension value");
+ }
+
+ batch_size *= block_size;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _output->info()->dimension(num_dimensions - 1) != batch_size,
+ "Output batch size should be equal to input batch size * (multiplication of all block size)");
+
+ const_cast<ICLTensor *>(_block_size)->unmap(queue);
+ const_cast<ICLTensor *>(_padding_size)->unmap(queue);
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+
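+ // The output slice drives the iteration; the input is passed with a degenerate window so the
+ // kernel derives its read offsets from the block size and padding values.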
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup output slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Set block size window
+ Window win_block = calculate_max_window(*_block_size->info(), Steps());
+
+ // Set padding size window
+ Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ add_1D_tensor_argument(idx, _block_size, win_block);
+ add_2D_tensor_argument(idx, _padding_size, win_padding);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
new file mode 100644
index 000000000..b085192a2
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const int32_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+ "Block size should be greater than or equal to 1.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+ "Input batch should be equal to Output batch");
+
+ auto layout_out = input->data_layout();
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+
+ auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+ auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
+ auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
+ "Output depth should be equal to (input depth * block size *block size)");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
+ (input->dimension(index_height) % block_size),
+ "Input height and width should be divisible by block size");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
+ (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
+ "Output height and width should be equal to "
+ "input_height/blocksize and input_width/blocksize respectively");
+
+ return Status{};
+}
+
+} // namespace
+
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t block_size)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+ _input = input;
+ _output = output;
+
+ // Set kernel build options
+ auto layout_out = input->info()->data_layout();
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+ auto depth = input->info()->dimension(index_depth);
+ build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
+ build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+ "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
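+ // Iterate over the input; the output is passed with a degenerate window and addressed by the
+ // kernel using BLOCK_SIZE and DEPTH_IN.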
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup output slice
+ Window slice_out(slice_in);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
new file mode 100644
index 000000000..4f2b388c9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+// Invalid result on GPU
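+// The disabled implementation performs TopKV2 with a GPU radix sort: init, per-pass histogram,
+// scan, global scan, paste and reorder kernels, plus helpers to fix up negative keys and store
+// the final k values and indices.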
+#if 0
+namespace arm_compute
+{
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
+
+void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+ _topk_values = topk_values;
+ _topk_indices = topk_indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
+
+ unsigned int idx = 3 * num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *indices);
+ _kernel.setArg(idx++, *temp_stack);
+ _kernel.setArg<cl_int>(idx++, k);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, 1, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+ add_1D_tensor_argument(idx, _topk_values, window);
+ add_1D_tensor_argument(idx, _topk_indices, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
+
+void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
+ int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
+
+ unsigned int idx = num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *in_key_buf);
+ _kernel.setArg(idx++, *in_ind_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This kernel makes a histogram of radix for each work item.
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
+
+void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
+
+ int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 3;
+ _kernel.setArg(idx++, loc_histo_size, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg<cl_int>(2, _pass);
+
+ cl::NDRange lws = cl::NDRange(_ITEMS, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
+
+void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
+
+void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
+ int bits)
+{
+ ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *glob_sum_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *temp_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
+
+void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortReorder::CLRadixSortReorder()
+ : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
+ _out_ind_buf(nullptr)
+{
+}
+
+void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
+
+ unsigned int idx = 2;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 6;
+ _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
+ cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg(1, *_out_key_buf);
+ _kernel.setArg<cl_int>(3, _pass);
+ _kernel.setArg(4, *_in_ind_buf);
+ _kernel.setArg(5, *_out_ind_buf);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
+
+void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_out_key_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
+ : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
+
+ unsigned int idx = 4;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_in_key_buf);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_in_ind_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Store::CLTopKV2Store()
+ : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(k == 0);
+ ARM_COMPUTE_ERROR_ON(k > n);
+
+ _values = values;
+ _indices = indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
+
+ unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, k, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
+{
+ _out_key_buf = out_key_buf;
+ _out_ind_buf = out_ind_buf;
+}
+
+void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _values, window);
+ add_1D_tensor_argument(idx, _indices, window);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+} // namespace arm_compute
+#endif // Disable GPU implementation
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
new file mode 100644
index 000000000..6cc8d9d13
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel()
+ : _input(nullptr), _output(nullptr), _inner_border(), _info()
+{
+}
+
+Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
+ for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1,
+ "inner_border_right must be smaller that stride_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1,
+ "inner_border_top must be smaller that stride_y");
+
+ return Status{};
+}
+
+void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _input = input;
+ _output = output;
+ _inner_border = inner_border;
+ _info = info;
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate(
+ input->info(), output->info(), inner_border, info));
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const DataLayout data_layout = _input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
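+ // The output window skips the padding (and inner border) and steps by the stride, so each
+ // input element is written to one strided position of the upsampled output.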
+ const int out_start_x = _info.pad_left();
+ const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right -
+ _info.pad_right() + _info.stride().first - 1;
+ const int out_step_x = _info.stride().first;
+
+ const int out_start_y = _inner_border.top + _info.pad_top();
+ const int out_end_y =
+ _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
+ const int out_step_y = _info.stride().second;
+
+ switch (data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ Window slice_out = collapsed.first_slice_window_3D();
+ slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (collapsed.slide_window_slice_3D(slice_in) &&
+ collapsed.slide_window_slice_3D(slice_out));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ // NOTE: not collapsing in NHWC
+ Window slice_out = window.first_slice_window_3D();
+ slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data layout");
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
new file mode 100644
index 000000000..8ac667ceb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {}
+
+bool CPPUpsampleKernelEx::is_parallelisable() const { return false; }
+
+void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _input = input;
+ _output = output;
+ _info = info;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICPPKernel::configure(win);
+}
+
+void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ // Initialize _scaled_output buffer
+ const int width_scaled = _output->info()->dimension(0);
+ const int height_scaled = _output->info()->dimension(1);
+ const int stride_x = _info.stride().first;
+ const int stride_y = _info.stride().second;
+ const int start_x = _info.pad_left();
+ const int start_y = _info.pad_top();
+ const int end_y = height_scaled - _info.pad_bottom();
+ const int end_x = width_scaled - _info.pad_right();
+ const size_t element_size = _input->info()->element_size();
+
+ // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset
+ const uint8_t fill_value =
+ _output->info()->data_type() == DataType::QASYMM8
+ ? utility::clamp<uint8_t>(_output->info()->quantization_info().offset)
+ : 0;
+ // Filling a value different from 0 only works for QASYMM8, since we are filling 1-byte values
+ // in a buffer of uint8_ts
+ std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value);
+
+ // Create window
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x));
+ window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y));
+
+ // Create iterators
+ Iterator in(_input, window);
+ Iterator out(_output, window_out);
+
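+  // Copy each input element to its stride-spaced position inside the padded output region.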
+ execute_window_loop(
+ window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
new file mode 100644
index 000000000..4508f5800
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+
+#include <algorithm>
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+namespace
+{
+void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out)
+{
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
+ vst1q_u8(output_ptr, vcombine_u8(pa, pb));
+}
+
+using namespace arm_compute;
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op_templ(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &,
+ OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *,
+ OutputScalarType *))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
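+  // An X-axis step of 0 means that input is broadcast along X; that case is handled in the first branch.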
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
+ non_broadcast_input_ptr, broadcast_value,
+ output_ptr, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) =
+ (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+ !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr =
+ reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr =
+ reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
+ input1_ptr, input2_ptr, output_ptr);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+} // namespace
+
+namespace arm_compute
+{
+
+float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
+ const float32x4_t &scale)
+{
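+  // Widen u8 -> u16 -> s32, subtract the quantization offset and multiply by the scale to dequantize 16 values.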
+ qasymm8x16_t x = vld1q_u8(input1_ptr);
+ const float32x4x4_t out = {{
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ }};
+ return out;
+}
+
+void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset,
+ const float32x4_t &invscale)
+{
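+  // Requantize: offset + value * invscale, converted to s32 and saturate-narrowed to u8.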
+ int32x4x4_t out = {{
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }};
+ store_quantized_int32(output_ptr, out);
+}
+
+float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale)
+{
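+  // Broadcast the quantized scalar into a full vector and dequantize it into four float32x4 lanes.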
+ const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+
+ const float32x4x4_t broadcast_vector = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(
+ vmovl_u8(vget_low_u8(broadcast_value_vec))))),
+ voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(
+ vmovl_u8(vget_low_u8(broadcast_value_vec))))),
+ voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(
+ vmovl_u8(vget_high_u8(broadcast_value_vec))))),
+ voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(
+ vmovl_u8(vget_high_u8(broadcast_value_vec))))),
+ voffset)),
+ vscale),
+ }};
+ return broadcast_vector;
+}
+
+void elementwise_op_quantized(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
+ int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t,
+ float32x4_t, float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t,
+ int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
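+  // Process 16 quantized elements per vector iteration; leftovers fall back to the scalar path below.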
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+ const float output_scale = out->info()->quantization_info().scale;
+ const int output_offset = out->info()->quantization_info().offset;
+
+  // Output quantization info (add 0.5 so that truncation rounds to the nearest integer;
+  // 0.5 rounds away from zero)
+ const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
+
+ if (is_broadcast_across_x)
+ {
+ // Select the broadcast input on the X axis
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info();
+ const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info();
+
+ const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
+ const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector =
+ dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale);
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
+ non_broadcast_input_ptr, broadcast_vector, output_ptr,
+ voffset_non_broadcast, vscale_non_broadcast, voffseto,
+ invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs =
+ scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale,
+ non_broadcast_qinfo.offset);
+ const float bfs =
+ scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset);
+ *(output_ptr + x) =
+ (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs,
+ out->info()->quantization_info());
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Input1 quantization info
+ const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
+ const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
+
+ // Input2 quantization info
+ const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
+ const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
+ const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x =
+ (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr,
+ output_ptr, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs =
+ scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset);
+ const float bfs =
+ scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ float (*scalar_func)(const float &, const float &),
+ int (*broadcast_func)(int, int, int, const float *, const float &, float *,
+ const bool),
+ int (*neon_func)(int, int, int, const float *, const float *, float *))
+{
+ elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func,
+ broadcast_func, neon_func);
+}
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
+ int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
+ uint8_t *, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *))
+{
+ elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func,
+ broadcast_func, neon_func);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
new file mode 100644
index 000000000..d2f42de53
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace arm_compute
+{
+
+template <BinaryLogicalOperation op, typename ScalarType>
+inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ res = a & b;
+ break;
+ case BinaryLogicalOperation::OR:
+ res = a | b;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <BinaryLogicalOperation op, typename VectorType>
+inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b)
+{
+ VectorType res = {0, 0, 0, 0};
+
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ res = wrapper::vand(a, b);
+ break;
+ case BinaryLogicalOperation::OR:
+ res = wrapper::vorr(a, b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <BinaryLogicalOperation op>
+inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b)
+{
+ uint8x16x4_t out = {{
+ elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]),
+ elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]),
+ }};
+ return out;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_logic_op_broadcast(const VectorType &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
+{
+ VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr, const ScalarType *input2_ptr,
+ ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b));
+ }
+ return x;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x,
+ elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out,
+ const Window &window)
+{
+ elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>,
+ &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>,
+ &elementwise_logic_op_loop<op, ScalarType, VectorType>);
+}
+
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func(
+ const ITensor *input1, const ITensor *input2, ITensor *output,
+ std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
+{
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
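+  // Keys have the form "op_<in1>_<in2>_<out>", e.g. "op_U8_U8_U8"; unknown combinations return nullptr.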
+ auto it = map_function.find(function_to_call);
+
+ if (it != map_function.end())
+ {
+ auto func = it->second;
+ return [func](const ITensor *input1, const ITensor *input2, ITensor *output,
+ const Window &window) { func(input1, input2, output, window); };
+ }
+ return nullptr;
+}
+
+template <BinaryLogicalOperation op>
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = {
+ {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
+ {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
+
+ return configure_func(input1, input2, output, map_function);
+}
+
+void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1,
+ const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+ configure_common(input1, input2, output);
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output);
+ break;
+ case BinaryLogicalOperation::OR:
+ _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output)
+{
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8,
+ DataType::QASYMM8);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+
+ const TensorShape out_shape =
+ TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op,
+ const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+ return Status{};
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
new file mode 100644
index 000000000..7e4fc129b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECastKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8,
+ DataType::QASYMM8, DataType::U32,
+ DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL &&
+ input->data_type() != DataType::U8);
+
+ if (output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+ DataType::QASYMM8, DataType::U32,
+ DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
+
+ // NECastKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+ return std::make_tuple(Status{}, win);
+}
+
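+// bool tensors are stored as uint8_t; this wrapper type gives them dedicated vcast/load_input specializations.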
+typedef struct bool8x16
+{
+ uint8x16_t val;
+} bool8x16_t;
+
+static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; }
+
+template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; }
+template <> inline uint8x16_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ return vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+}
+
+template <> inline uint32x4x4_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+
+ const uint32x4x4_t ret = {{
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))),
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))),
+ }};
+
+ return ret;
+}
+
+template <> inline int32x4x4_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+
+ const int32x4x4_t ret = {{
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const bool8x16_t &v)
+{
+ const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
+ const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
+ uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
+ uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
+
+ const float32x4x4_t ret = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
+ }};
+
+ return ret;
+}
+
+template <> inline uint32x4x4_t vcast(const uint8x16_t &v)
+{
+ const uint32x4x4_t ret = {{
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))),
+ vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))),
+ vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))),
+ }};
+
+ return ret;
+}
+
+template <> inline int32x4x4_t vcast(const uint8x16_t &v)
+{
+ const int32x4x4_t ret = {{
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const uint8x16_t &v)
+{
+ const float32x4x4_t ret = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
+ }};
+
+ return ret;
+}
+
+template <> inline uint8x16_t vcast(const int32x4x4_t &v)
+{
+ // Saturate cast
+ return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))),
+ vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3]))));
+}
+
+template <> inline uint32x4x4_t vcast(const int32x4x4_t &v)
+{
+ // Saturate cast
+ const uint32x4x4_t ret = {{
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))),
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))),
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))),
+ vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))),
+ vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const int32x4x4_t &v)
+{
+ const float32x4x4_t ret = {{
+ vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]),
+ vcvtq_f32_s32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <> inline uint8x16_t vcast(const uint32x4x4_t &v)
+{
+ return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))),
+ vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3]))));
+}
+
+template <> inline int32x4x4_t vcast(const uint32x4x4_t &v)
+{
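+  // Widen each u32 lane to 64-bit and narrow back to s32; values above INT32_MAX wrap rather than saturate.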
+ const int32x4x4_t ret = {{
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))),
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))),
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))),
+ vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))),
+ vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))),
+ }};
+
+ return ret;
+}
+
+template <> inline float32x4x4_t vcast(const uint32x4x4_t &v)
+{
+ const float32x4x4_t ret = {{
+ vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]),
+ vcvtq_f32_u32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <> inline uint8x16_t vcast(const float32x4x4_t &v)
+{
+ // Saturate cast
+ return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])),
+ vqmovun_s32(vcvtq_s32_f32(v.val[1])))),
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])),
+ vqmovun_s32(vcvtq_s32_f32(v.val[3])))));
+}
+
+template <> inline uint32x4x4_t vcast(const float32x4x4_t &v)
+{
+ const uint32x4x4_t ret = {{
+ vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]),
+ vcvtq_u32_f32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <> inline int32x4x4_t vcast(const float32x4x4_t &v)
+{
+ const int32x4x4_t ret = {{
+ vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]),
+ vcvtq_s32_f32(v.val[3]),
+ }};
+
+ return ret;
+}
+
+template <typename T> struct cast_vector;
+template <> struct cast_vector<bool>
+{
+ using type = bool8x16_t;
+};
+template <> struct cast_vector<uint8_t>
+{
+ using type = uint8x16_t;
+};
+template <> struct cast_vector<uint32_t>
+{
+ using type = uint32x4x4_t;
+};
+template <> struct cast_vector<int32_t>
+{
+ using type = int32x4x4_t;
+};
+template <> struct cast_vector<float>
+{
+ using type = float32x4x4_t;
+};
+
+template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+ wrapper::vstore(ptr + 8, v.val[2]);
+ wrapper::vstore(ptr + 12, v.val[3]);
+}
+
+template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v)
+{
+ wrapper::vstore(ptr, v);
+}
+
+inline bool8x16_t vloadq(const bool *ptr)
+{
+ bool8x16_t ret;
+ ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr));
+ return ret;
+}
+
+template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr)
+{
+ return wrapper::vloadq(ptr);
+}
+
+template <> inline typename cast_vector<bool>::type load_input(const bool *ptr)
+{
+ return vloadq(ptr);
+}
+
+template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr)
+{
+ return vld4q_u32(ptr);
+}
+
+template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr)
+{
+ return vld4q_s32(ptr);
+}
+
+template <> inline typename cast_vector<float>::type load_input(const float *ptr)
+{
+ return vld4q_f32(ptr);
+}
+
+template <typename T> inline T get_value(const T *ptr) { return *ptr; }
+
+template <> inline bool get_value(const bool *ptr)
+{
+ bool ret = (*ptr != 0);
+ return ret;
+}
+
+template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
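+  // The rounding policy is only used by the scalar QASYMM8 leftover path; it presumably
+  // mirrors the NEON float-to-int conversion behaviour on each architecture.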
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &) {
+ const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ using from_vector = typename cast_vector<FromT>::type;
+ const from_vector vin = load_input(in_ptr + x);
+
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ using to_vector = typename cast_vector<uint8_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const QuantizationInfo &qinfo_out = output->info()->quantization_info();
+ const auto vf = vcast<to_vector, from_vector>(vin);
+ const auto vout = vquantize(vf, qinfo_out);
+ store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::U32:
+ {
+ using to_vector = typename cast_vector<uint32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::S32:
+ {
+ using to_vector = typename cast_vector<int32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::F32:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ FromT val = get_value(in_ptr + x);
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ const QuantizationInfo &qinfo_out = output->info()->quantization_info();
+ const auto qval = qinfo_out.quantize(static_cast<float>(val), rounding_policy);
+ *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval;
+ break;
+ }
+ case DataType::U32:
+ {
+ *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
+ break;
+ }
+ case DataType::S32:
+ {
+ *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
+ break;
+ }
+ case DataType::F32:
+ {
+ *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+ },
+ in, out);
+}
+
+void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+ const auto &qinfo_in = input->info()->quantization_info();
+ const auto &qinfo_out = output->info()->quantization_info();
+
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &) {
+ const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ using from_vector = typename cast_vector<float>::type;
+ const auto vf = wrapper::vloadq(in_ptr + x);
+ const auto vin = vdequantize(vf, qinfo_in);
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ using to_vector = typename cast_vector<uint8_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const auto vf = vcast<to_vector, from_vector>(vin);
+ const auto vout = vquantize(vf, qinfo_out);
+ store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::U32:
+ {
+ using to_vector = typename cast_vector<uint32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::S32:
+ {
+ using to_vector = typename cast_vector<int32_t>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
+ break;
+ }
+ case DataType::F32:
+ {
+ using to_vector = typename cast_vector<float>::type;
+ const to_vector vout = vcast<to_vector, from_vector>(vin);
+ store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ qasymm8_t qval_in = *(in_ptr + x);
+ const auto val = qinfo_in.dequantize(qval_in);
+
+ switch (output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ const auto qval_out = qinfo_out.quantize(val, rounding_policy);
+ *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out;
+ break;
+ }
+ case DataType::U32:
+ {
+ *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
+ break;
+ }
+ case DataType::S32:
+ {
+ *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
+ break;
+ }
+ case DataType::F32:
+ {
+ *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+ }
+ },
+ in, out);
+}
+} // namespace
+
+NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE)
+{
+}
+
+void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype));
+
+ _input = input;
+ _output = output;
+ _input_subtype = input_subtype;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ return Status{};
+}
+
+void NECastKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::U8:
+ if (_input_subtype == SubDataType::BOOL)
+ {
+ run_cast<bool>(_input, _output, window);
+ }
+ else
+ {
+ run_cast<uint8_t>(_input, _output, window);
+ }
+ break;
+ case DataType::QASYMM8:
+ run_cast_qasymm8(_input, _output, window);
+ break;
+ case DataType::U32:
+ run_cast<uint32_t>(_input, _output, window);
+ break;
+ case DataType::S32:
+ run_cast<int32_t>(_input, _output, window);
+ break;
+ case DataType::F32:
+ run_cast<float>(_input, _output, window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
new file mode 100644
index 000000000..8a2223c26
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2);
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_channel =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
+ 0);
+ // Validate output if initialized
+ if (output->total_size() != 0)
+ {
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _block_shape()
+{
+}
+
+void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape);
+  // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
+
+ _input = input;
+ _output = output;
+ _block_shape = block_shape;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ ICPPKernel::configure(win);
+}
+
+Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
+ return Status{};
+}
+
+void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
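+  // Each group of block_shape * block_shape input channels produces one output channel, so r is the output depth.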
+ const int idx_channel =
+ get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const int depth_size = _input->info()->dimension(idx_channel);
+ const int r = (depth_size / (_block_shape * _block_shape));
+ const int element_size = _input->info()->element_size();
+
+ Window slice_out = window.first_slice_window_3D();
+
+  // The output slice does not move; output coordinates are computed explicitly in the loops below
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Main loop for NCHW and NHWC
+ if (_input->info()->data_layout() == DataLayout::NCHW)
+ {
+ Window slice_in = window.first_slice_window_2D();
+ do
+ {
+ Iterator in(_input, slice_in);
+ execute_window_loop(slice_in,
+ [&](const Coordinates &id) {
+ const int x = id.x();
+ const int y = id.y();
+
+ const int z = id.z() % r;
+ const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
+ Coordinates output_coords{out_x, out_y, z, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_2D(slice_in));
+ }
+ else
+ {
+ Window slice_in = window.first_slice_window_3D();
+ do
+ {
+ Iterator in(_input, slice_in);
+ execute_window_loop(slice_in,
+ [&](const Coordinates &id) {
+ const int x = id.y();
+ const int y = id.z();
+
+ const int z = id.x() % r;
+ const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
+ Coordinates output_coords{z, out_x, out_y, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_3D(slice_in));
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
new file mode 100644
index 000000000..cebd614df
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+template <ElementWiseUnaryEx op, typename ScalarType>
+inline ScalarType elementwise_op_scalar(const ScalarType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnaryEx::NEG:
+ return -a;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <ElementWiseUnaryEx op, typename VectorType>
+inline VectorType elementwise_op(const VectorType &a)
+{
+ switch (op)
+ {
+ case ElementWiseUnaryEx::NEG:
+ return wrapper::vneg(a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <ElementWiseUnaryEx op, typename ScalarType>
+void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
+{
+ const int window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_ptr + x,
+ elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
+ }
+ },
+ input, output);
+}
+
+template <ElementWiseUnaryEx op>
+std::function<void(const ITensor *input, ITensor *output, const Window &window)>
+configure_func(const ITensor *input, ITensor *output)
+{
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(input->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
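+  // Keys look like "op_F32_F32"; the F16 entry is only registered when FP16 vector arithmetic is available.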
+ static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *>
+ map_function = {
+ {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>},
+ };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+ auto it = map_function.find(function_to_call);
+
+ if (it != map_function.end())
+ {
+ auto func = it->second;
+ return [func](const ITensor *input, ITensor *output, const Window &window) {
+ func(input, output, window);
+ };
+ }
+ return nullptr;
+}
+} // namespace
+
+NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx()
+ : _function(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Configure kernel window
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input->info());
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+
+ Window win = calculate_max_window(valid_region);
+
+ _input = input;
+ _output = output;
+
+ INEKernel::configure(win);
+
+ switch (op)
+ {
+ case ElementWiseUnaryEx::NEG:
+ _function = configure_func<ElementWiseUnaryEx::NEG>(input, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input,
+ const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32,
+ DataType::S32);
+
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+ }
+
+ return Status{};
+}
+
+Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
+ return Status{};
+}
+
+void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_function == nullptr);
+ _function(_input, _output, window);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..5401afea0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEEmbeddingLookupKernel::NEEmbeddingLookupKernel()
+ : _input(nullptr), _lookups(nullptr), _output(nullptr)
+{
+}
+
+void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output,
+ const ITensor *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Auto initialize output if not initialized
+ auto out_shape = input->info()->tensor_shape();
+ out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->dimension(0));
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ const arm_compute::ITensorInfo *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ }
+
+ return Status{};
+}
+
+void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t lookup_dim = _output->info()->num_dimensions() - 1;
+
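+ // Step DimX by the full row width so that each loop iteration copies one complete innermost
+ // row; the source row is selected by the lookup value at the outermost output coordinate.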
+ Window output_window{window};
+ output_window.set(Window::DimX,
+ Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
+
+ Window out_slice = output_window.first_slice_window_4D();
+ do
+ {
+ Iterator output_it(_output, out_slice);
+
+ execute_window_loop(out_slice,
+ [&](const Coordinates &id) {
+ const int32_t lookup = *reinterpret_cast<int32_t *>(
+ _lookups->ptr_to_element(Coordinates{id[lookup_dim]}));
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ },
+ output_it);
+
+ } while (window.slide_window_slice_4D(out_slice));
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
new file mode 100644
index 000000000..ce2413dc1
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Validate the indices
+ *
+ * Validate that indices are not negative
+ *
+ * @param[in] indices Indices tensor info.
+ */
+template <typename U> void validate_indices(const ITensor *indices)
+{
+ for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0);
+ }
+}
+
+} // namespace
+
+NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {}
+
+template <typename U>
+inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Iterator output_it(_output, window);
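+ // Axis-0 gather copies one element per iteration: the leading output coordinates (one per
+ // indices dimension) select the gather index, which then replaces coordinate 0 of the input.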
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices->info()->num_dimensions(), 0);
+
+ U new_index;
+ switch (_indices->info()->num_dimensions())
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
+ break;
+ case 2:
+ new_index =
+ *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1]))));
+ break;
+ case 3:
+ new_index = *(
+ reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(0, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(),
+ output_it.ptr());
+ },
+ output_it);
+}
+
+template <typename U>
+void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator output_it(_output, output_window);
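+ // For a non-zero axis, whole innermost rows are copied: DimX is fixed to a single step and
+ // the gather index replaces the coordinate at _axis.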
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices->info()->num_dimensions(), _axis);
+
+ U new_index;
+ switch (_indices->info()->num_dimensions())
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
+ break;
+ case 2:
+ new_index = *(reinterpret_cast<U *>(
+ _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1]))));
+ break;
+ case 3:
+ new_index = *(reinterpret_cast<U *>(
+ _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(_axis, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id),
+ _input->info()->dimension(0) * _output->info()->element_size(),
+ output_it.ptr());
+ },
+ output_it);
+}
+
+void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output,
+ int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ _input = input;
+ _indices = indices;
+ _output = output;
+ _axis = axis;
+
+ if (_axis < 0)
+ {
+ _axis += input->info()->num_dimensions();
+ }
+ ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
+
+ if (0 == _axis)
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernelEx::gather_0_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernelEx::gather_0_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernelEx::gather_n_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernelEx::gather_n_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output->info(), Steps());
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
+
+ if (axis < 0)
+ {
+ axis += input->num_dimensions();
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->tensor_shape(), indices->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window, info);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
new file mode 100644
index 000000000..391337bfb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <unordered_map>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr size_t NOT_HIT = 0xFFFFFFFF;
+} // namespace
+
+NEHashtableLookupKernel::NEHashtableLookupKernel()
+ : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
+{
+}
+
+void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys,
+ const ITensor *input, ITensor *output, ITensor *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+ _lookups = lookups;
+ _keys = keys;
+ _input = input;
+ _output = output;
+ _hits = hits;
+
+ // Auto initialize output if not initialized
+ auto out_shape{input->info()->tensor_shape()};
+ out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->dimension(0), false);
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ // Auto initialize hits if not initialized
+ auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8);
+
+ INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1));
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ }
+
+ // Validate in case of configured hits
+ if (hits->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1));
+ ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0));
+ ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+ }
+
+ return Status{};
+}
+
+void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t lookup_dim = _output->info()->num_dimensions() - 1;
+ const int const_0 = _output->info()->data_type() == DataType::QASYMM8
+ ? _output->info()->quantization_info().offset
+ : 0;
+
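+ // Resolve every lookup to an input row index up front: build a key -> row-index map from the
+ // keys tensor, then translate each lookup (NOT_HIT on a miss, recorded in _hits). The copy
+ // loop below either copies the matching input row or fills the output row with const_0.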
+ std::unordered_map<int32_t, size_t> key_index_map;
+ for (size_t n = 0; n < _keys->info()->dimension(0); ++n)
+ {
+ const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n}));
+ key_index_map[key] = n;
+ }
+ std::vector<size_t> lookup_indices;
+ for (size_t k = 0; k < _lookups->info()->dimension(0); ++k)
+ {
+ const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k}));
+ const auto it = key_index_map.find(key);
+ if (it == key_index_map.end())
+ {
+ lookup_indices.emplace_back(NOT_HIT);
+ *_hits->ptr_to_element({k}) = 0;
+ }
+ else
+ {
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ if (it->second >= _keys->info()->dimension(0))
+ ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds.");
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+ lookup_indices.emplace_back(it->second);
+ *_hits->ptr_to_element({k}) = 1;
+ }
+ }
+
+ Window output_window{window};
+ output_window.set(Window::DimX,
+ Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
+
+ Window out_slice = output_window.first_slice_window_4D();
+ do
+ {
+ Iterator output_it(_output, out_slice);
+
+ execute_window_loop(out_slice,
+ [&](const Coordinates &id) {
+ const auto lookup = lookup_indices.at(id[lookup_dim]);
+ if (lookup == NOT_HIT)
+ {
+ memset(output_it.ptr(), const_0,
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+ else
+ {
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+ },
+ output_it);
+
+ } while (window.slide_window_slice_4D(out_slice));
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
new file mode 100644
index 000000000..1ea77fb5c
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon, const Window &window)
+{
+ /** NEON vector tag type. */
+ using ExactTagType =
+ typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Clear X/Y dimensions on execution window as we handle the planes manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const unsigned int elements_plane = input->info()->dimension(0) * input->info()->dimension(1);
+ const auto channel_idx =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+
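+ // Two passes per (channel, batch) plane: the first accumulates the sum and sum of squares to
+ // derive the plane's mean and variance, the second applies
+ // (x - mean) * gamma / sqrt(var + epsilon) + beta.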
+ Iterator input_it(input, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id) {
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<T>(0.f);
+ auto sum_squares_h_w = static_cast<T>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val);
+ vec_sum_squares_h_w =
+ wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
+ }
+
+ auto vec2_sum_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w),
+ wrapper::vgetlow(vec_sum_squares_h_w));
+ for (int i = 0; i < window_step_x / 4; ++i)
+ {
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+ }
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = *(input_ptr + x);
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ auto gamma_val = 1.0f;
+ if (gamma != nullptr)
+ {
+ gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]}));
+ }
+ const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
+ auto beta_val = 0.0f;
+ if (beta != nullptr)
+ {
+ beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]}));
+ }
+ const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ vec_val = wrapper::vloadq(input_ptr + x);
+ vec_val = wrapper::vadd(
+ wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
+ wrapper::vstore(output_ptr + x, vec_val);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val;
+ }
+ },
+ input_plane_it, output_plane_it);
+ },
+ input_it);
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+ "NHWC data layout is not supported by the kernel directly");
+
+ if (output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
+ }
+
+ if (gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ gamma->dimension(0),
+ "Gamma's size must be the same as size of input's channel");
+ }
+
+ if (beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ beta->dimension(0),
+ "Beta's size must be the same as size of input's channel");
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // We handle the planes manually
+ Window win = calculate_max_window(*input, Steps(1));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+
+ // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be
+ // skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr),
+ _epsilon(1e-12)
+{
+}
+
+void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output,
+ ITensor *gamma, ITensor *beta, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _input = input;
+ _output = output == nullptr ? input : output;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
+ gamma != nullptr ? gamma->info() : nullptr,
+ beta != nullptr ? beta->info() : nullptr, epsilon));
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ _func = &instance_normalization_nchw<float>;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if (_input->info()->data_type() == DataType::F16)
+ {
+ _func = &instance_normalization_nchw<float16_t>;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *gamma,
+ const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ return Status{};
+}
+
+void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
new file mode 100644
index 000000000..de218d489
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, scale_factor, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+ // Checks performed when output is configured
+ if ((output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+inline int32x4x4_t load_value(const int32_t *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline const float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+ ARM_COMPUTE_UNUSED(ptr, v);
+}
+
+template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+ wrapper::vstore(ptr + 8, v.val[2]);
+ wrapper::vstore(ptr + 12, v.val[3]);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+ wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale)
+{
+ const float32x4_t vscale = vdupq_n_f32(scale);
+
+ const float32x4x4_t ret = {{
+ vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale),
+ vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale),
+ }};
+ return ret;
+}
+} // namespace
+
+NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel()
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+
+void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor,
+ ITensor *output, float multiplier)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+ _input = input;
+ _scale_factor = scale_factor;
+ _output = output;
+ _multiplier = multiplier;
+
+ // Configure kernel window
+ Window win_config = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win_config);
+}
+
+Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *scale_factor,
+ const ITensorInfo *output, float multiplier)
+{
+ ARM_COMPUTE_UNUSED(multiplier);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+
+ return Status{};
+}
+
+template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window)
+{
+ constexpr auto window_step = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ // Only 2D input is supported
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
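+ // Each row y of the S32 input is converted to float and multiplied by
+ // scale_factor[y] * _multiplier, 16 elements per vectorized iteration plus a scalar tail.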
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &id) {
+ auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
+ scale *= _multiplier;
+
+ const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ store_result<T>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = input_ptr[x] * scale;
+ }
+ },
+ input, output);
+}
+
+void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_output->info()->data_type())
+ {
+ case DataType::F32:
+ NEMultiplyScaleFactorKernel::multiply<float>(window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ NEMultiplyScaleFactorKernel::multiply<float16_t>(window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
new file mode 100644
index 000000000..ad1bb9051
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+namespace
+{
+
+/** Conditional element-wise operations */
+enum class ConditionalOperation
+{
+ PRELU, /**< (x * y) for x < 0, x for x >= 0 */
+};
+
+template <ConditionalOperation op, typename ScalarType>
+inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch (op)
+ {
+ case ConditionalOperation::PRELU:
+ res = a < 0 ? a * b : a;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <ConditionalOperation op>
+inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
+ QuantizationInfo qinfo)
+{
+ return qinfo.quantize(elementwise_conditional_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP);
+}
+
+template <ConditionalOperation op, typename VectorType>
+inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
+{
+ VectorType res = {0, 0, 0, 0};
+ VectorType const_0 = {0, 0, 0, 0};
+
+ switch (op)
+ {
+ case ConditionalOperation::PRELU:
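+ // Keep a where a > 0, otherwise use a * b (b holds the learned slope)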
+ res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <ConditionalOperation op>
+inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ float32x4x4_t out = {{
+ elementwise_conditional_op<op>(a.val[0], b.val[0]),
+ elementwise_conditional_op<op>(a.val[1], b.val[1]),
+ elementwise_conditional_op<op>(a.val[2], b.val[2]),
+ elementwise_conditional_op<op>(a.val[3], b.val[3]),
+ }};
+ return out;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
+{
+ VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
+ reorder ? a : broadcast_vector);
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr,
+ const ScalarType *input2_ptr, ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b));
+ }
+ return x;
+}
+
+template <ConditionalOperation op>
+inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x,
+ int window_step_x, const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr, uint8_t *output_ptr,
+ int32x4_t voffset1, int32x4_t voffset2,
+ float32x4_t vscale1, float32x4_t vscale2,
+ float32x4_t voffseto, float32x4_t invvscaleo)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get inputs and compute output
+ const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+ const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x,
+ elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <ConditionalOperation op>
+inline int elementwise_conditional_op_quantized_broadcast_loop(
+ int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af,
+ reorder ? af : broadcast_vector);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out,
+ const Window &window)
+{
+ elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>,
+ &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>,
+ &elementwise_conditional_op_loop<op, ScalarType, VectorType>);
+}
+
+template <ConditionalOperation op>
+void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out,
+ const Window &window)
+{
+ elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>,
+ &elementwise_conditional_op_quantized_broadcast_loop<op>,
+ &elementwise_conditional_op_quantized_loop<op>);
+}
+} // namespace
+
+NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+
+void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info()));
+
+ // Configure kernel window
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+
+ Window win = calculate_max_window(valid_region);
+
+ _input = input;
+ _alpha = alpha;
+ _output = output;
+ INEKernel::configure(win);
+}
+
+void NEPReLUKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha,
+ _output, window);
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output,
+ window);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Wrong Type");
+ }
+}
+
+Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
+ const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output);
+
+ const TensorShape out_shape =
+ TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ // Checks performed when output is configured
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output));
+
+ return Status{};
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
new file mode 100644
index 000000000..acf0092eb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, scale_factor);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+ return Status{};
+}
+
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline const float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+inline float32x4_t round(const float32x4_t &fv)
+{
+ const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f);
+ const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f);
+ // If value < 0, mask = -1, else mask = 0
+ const int32x4_t mask_less_zero_s32x4 = vreinterpretq_s32_u32(vcltq_f32(fv, zero_f32x4));
+ return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_s32x4), point5_f32x4));
+}
+
+inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale)
+{
+ const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv);
+ const int32x4_t vposend = vdupq_n_s32(max_scale);
+ const int32x4_t vnagend = vdupq_n_s32(-max_scale);
+
+ const int32x4x4_t rf = {{
+#ifdef __aarch64__
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#else //__aarch64__
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#endif //__aarch64__
+ }};
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+ return vcombine_s8(pa, pb);
+}
+} // namespace
+
+NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel()
+ : _input(nullptr), _output(nullptr), _scale_factor(nullptr)
+{
+}
+
+void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output,
+ ITensor *scale_factor)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, scale_factor);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), output->info(), scale_factor->info()));
+
+ _input = input;
+ _output = output;
+ _scale_factor = scale_factor;
+
+ // Configure kernel window
+ Window win_config = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win_config);
+}
+
+Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor));
+
+ return Status{};
+}
+
+template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window)
+{
+ constexpr auto window_step = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP;
+#endif //__aarch64__
+
+ // Reset the first dimension to handle tail calculations manually; only 2D input is
+ // supported, so no window collapsing is needed
+ Window win_collapsed = window;
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+ const auto dim_x = _input->info()->dimension(0);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
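+  // Each row is quantized independently with a symmetric scheme: scale = max(|min|, |max|) / 127
+  // and q = clamp(round(value / scale), -127, 127). For example, a row whose largest magnitude is
+  // 2.54 gets scale 0.02, so an input of 1.27 maps to round(1.27 * 50) = 64.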
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &id) {
+ const auto start = reinterpret_cast<const T *>(input.ptr());
+ const auto min_max = std::minmax_element(start, start + dim_x);
+ const auto int8_scale = 127;
+ auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second));
+ if (range == 0)
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1;
+ range = 1;
+ }
+ else
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale;
+ }
+ const auto scale_factor_inv = int8_scale / range;
+
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x],
+ vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy);
+ quantized = std::min(int8_scale, std::max(quantized, -int8_scale));
+ output_ptr[x] = static_cast<int8_t>(quantized);
+ }
+ },
+ input, output);
+}
+
+void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::F32:
+ NEQuantizationSymmetricKernel::quantize<float>(window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ NEQuantizationSymmetricKernel::quantize<float16_t>(window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
new file mode 100644
index 000000000..59e7d9beb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
@@ -0,0 +1,677 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
+float32x2_t calculate_min(float32x4_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmin(pmin, pmin);
+}
+
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+float32x2_t calculate_max(float32x4_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmax(pmax, pmax);
+}
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
+int32x2_t calculate_min(int32x4_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmin(pmin, pmin);
+}
+
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+int32x2_t calculate_max(int32x4_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ return wrapper::vpmax(pmax, pmax);
+}
+
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
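+// Reducing 16 lanes takes log2(16) = 4 pairwise steps: the first combines the two halves and the
+// remaining three fold the 8-lane result until every lane holds the reduced value.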
+inline uint8x8_t calculate_min(uint8x16_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ return wrapper::vpmin(pmin, pmin);
+}
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+inline uint8x8_t calculate_max(uint8x16_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ return wrapper::vpmax(pmax, pmax);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// Helper function to calculate the minimum value of the input vector. All the elements in the
+// output vector contain the min value.
+inline float16x4_t calculate_min(float16x8_t in)
+{
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmin = wrapper::vpmin(pmin, pmin);
+ return wrapper::vpmin(pmin, pmin);
+}
+// Helper function to calculate the maximum value of the input vector. All the elements in the
+// output vector contain the max value.
+inline float16x4_t calculate_max(float16x8_t in)
+{
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ pmax = wrapper::vpmax(pmax, pmax);
+ return wrapper::vpmax(pmax, pmax);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
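+// Drives a reduction functor F over the window: for axis 0 it slides 1D slices along X, while for
+// the Y/Z/W axes the reduced dimension is collapsed to a single step and the functor walks it
+// explicitly via in_info.offset_element_in_bytes().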
+template <class F> class Reducer
+{
+public:
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), op);
+ } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ }
+ static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set in window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1),
+ output->info()->dimension(1)));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 1, op);
+ } while (in_window.slide_window_slice_2D(in_slice) &&
+ out_window.slide_window_slice_2D(out_slice));
+ }
+ static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set in window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2),
+ output->info()->dimension(2)));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_3D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 2, op);
+ } while (in_window.slide_window_slice_3D(in_slice) &&
+ out_window.slide_window_slice_3D(out_slice));
+ }
+ static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f,
+ const ReduceOperation op)
+ {
+ // Set in/out window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(3, Window::Dimension(0, 1, 1));
+ out_window.set(3, Window::Dimension(0, 1, 1));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_4D();
+ Window out_slice = out_window.first_slice_window_4D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 3, op);
+ } while (in_window.slide_window_slice_4D(in_slice) &&
+ out_window.slide_window_slice_4D(out_slice));
+ }
+};
+
+template <typename T, int S> struct RedOpX
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ ARM_COMPUTE_UNUSED(in_info);
+ auto init_res_value = static_cast<T>(0.f);
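+    // MIN and MAX seed the accumulator with the first input element; seeding with zero would bias
+    // the result for all-positive (MIN) or all-negative (MAX) data.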
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ case ReduceOperation::MAX:
+ {
+ init_res_value = *reinterpret_cast<T *>(input.ptr());
+ break;
+ }
+ default:
+ break;
+ }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+
+ execute_window_loop(in_slice,
+ [&](const Coordinates &) {
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ },
+ input);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+};
+
+struct RedOpX_qasymm8
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ ARM_COMPUTE_UNUSED(in_info);
+
+ uint8x16_t vec_res_value = {0};
+
+ if (op == ReduceOperation::MIN || op == ReduceOperation::MAX)
+ {
+ vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
+ }
+
+ execute_window_loop(in_slice,
+ [&](const Coordinates &) {
+ const auto vec_elements = wrapper::vloadq(input.ptr());
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ },
+ input);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+ }
+};
+
+template <typename T, int S> struct RedOpYZW
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
+
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, int axis, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+
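+    // For each vector of X elements, walk along the reduced axis (Y, Z or W) and fold every
+    // loaded vector into the running minimum/maximum.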
+ execute_window_loop(
+ in_slice,
+ [&](const Coordinates &) {
+ neon_vector vec_res_value = {0};
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
+ break;
+ }
+ default:
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ break;
+ }
+ }
+
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr;
+ switch (axis)
+ {
+ case 1:
+ in_ptr = reinterpret_cast<T *>(
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
+ break;
+ case 2:
+ in_ptr = reinterpret_cast<T *>(
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
+ break;
+ case 3:
+ in_ptr = reinterpret_cast<T *>(
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
+ },
+ input, output);
+ }
+};
+
+struct RedOpYZW_qasymm8
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
+ const TensorInfo &in_info, int axis, const ReduceOperation op)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+
+ execute_window_loop(
+ in_slice,
+ [&](const Coordinates &) {
+ auto vec_res_value = wrapper::vloadq(input.ptr());
+
+ for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
+ {
+ uint8_t *in_ptr;
+ switch (axis)
+ {
+ case 1:
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
+ break;
+ case 2:
+ in_ptr =
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
+ break;
+ case 3:
+ in_ptr =
+ input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+ wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value);
+ },
+ input, output);
+ }
+};
+
+void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis,
+ const ReduceOperation op)
+{
+ const bool is_complex = (input->info()->num_channels() == 2);
+ if (is_complex)
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ switch (axis)
+ {
+ case 0:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output,
+ RedOpX<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+ case DataType::S32:
+ return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(),
+ op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 1:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output,
+ RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(),
+ op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output,
+ RedOpYZW<int32_t, 4>(), op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 2:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output,
+ RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(),
+ op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output,
+ RedOpYZW<int32_t, 4>(), op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 3:
+ switch (input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output,
+ RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(),
+ op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output,
+ RedOpYZW<int32_t, 4>(), op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+
+ if (input->num_channels() == 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
+ DataType::F16, DataType::F32);
+ }
+ else
+ {
+    ARM_COMPUTE_RETURN_ERROR_MSG("Complex (2-channel) input is not supported");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
+
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+ const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+
+ // Calculate output shape and set if empty
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+
+ // Output auto initialization if not yet initialized
+ DataType output_data_type = input->data_type();
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
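+  // One 128-bit NEON register is processed per iteration: 16 QASYMM8, 8 F16 or 4 F32/S32 elements.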
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+
+ return std::make_tuple(err, win);
+}
+} // namespace
+
+NEReductionOperationKernelEx::NEReductionOperationKernelEx()
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX),
+ _border_size()
+{
+}
+
+BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; }
+
+void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ unsigned int num_elems_processed_per_iteration =
+ 16 / data_size_from_type(input->info()->data_type());
+
+ _input = input;
+ _output = output;
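+  // Reducing along X reads whole vectors, so a right border pads dimension 0 up to the next
+  // multiple of the vector width; reductions along other axes need no border.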
+ _border_size =
+ (axis == 0)
+ ? BorderSize(0, num_elems_processed_per_iteration -
+ (input->info()->dimension(0) % num_elems_processed_per_iteration),
+ 0, 0)
+ : BorderSize();
+ _op = op;
+ _reduction_axis = axis;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
+
+ return Status{};
+}
+
+void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ reduce_op(window, _input, _output, _reduction_axis, _op);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
new file mode 100644
index 000000000..36a2f55a9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
+
+ // Validate output if initialized
+ if (output->total_size() != 0)
+ {
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
+ output->tensor_shape()[idx_batch]);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
+ 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
+ output->tensor_shape().total_size());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _block_shape()
+{
+}
+
+void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
+
+ _input = input;
+ _block_shape = block_shape;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ INEKernel::configure(win);
+}
+
+Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
+ return Status{};
+}
+
+void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ const DataLayout data_layout = _input->info()->data_layout();
+ const int channel_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int element_size = _input->info()->element_size();
+
+ const size_t channel_size = _input->info()->dimension(channel_idx);
+
+ Window slice_out = window.first_slice_window_3D();
+
+ int batch_id = 0;
+
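+  // Each output element at spatial position (x, y) and channel c gathers the input element at
+  // (x * block + bx, y * block + by, c % C), where (bx, by) = ((c / C) % block, (c / C) / block)
+  // and C is the input channel count; the NHWC branch applies the same mapping with the channel
+  // index in dimension 0.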
+ // Main loop for NCHW and NHWC
+ if (_output->info()->data_layout() == DataLayout::NCHW)
+ {
+ do
+ {
+ Iterator out(_output, slice_out);
+ execute_window_loop(slice_out,
+ [&](const Coordinates &id) {
+ const size_t channel_id = id.z();
+ const size_t in_x =
+ id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y =
+ id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{in_x, in_y, z, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
+ ++batch_id;
+ } while (window.slide_window_slice_3D(slice_out));
+ }
+ else
+ {
+ do
+ {
+ Iterator out(_output, slice_out);
+ execute_window_loop(slice_out,
+ [&](const Coordinates &id) {
+ const size_t channel_id = id.x();
+ const size_t in_x =
+ id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y =
+ id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{z, in_x, in_y, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
+ ++batch_id;
+ } while (window.slide_window_slice_3D(slice_out));
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp
new file mode 100644
index 000000000..94242b56b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+const std::pair<unsigned int, unsigned int>
+arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom)
+{
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+ const unsigned int padx = info.pad_left() + info.pad_right();
+ const unsigned int pady = info.pad_top() + info.pad_bottom();
+
+ ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
+ ARM_COMPUTE_ERROR_ON(kernel_width <= padx);
+ ARM_COMPUTE_ERROR_ON(kernel_height <= pady);
+
+  // Find the transpose conv output dimensions:
+  //    tconv_out + pad = (in - 1) * stride + kernel + invalid
+  //    tconv_out       = (in - 1) * stride + kernel + invalid - pad
+ const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right;
+ const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom;
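+  // Example: in = 4, stride = 2, kernel = 3 with no padding and no invalid border gives
+  // out = (4 - 1) * 2 + 3 = 9.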
+
+ return std::make_pair<unsigned int, unsigned int>(w, h);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
new file mode 100644
index 000000000..158fe0b0c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This empty translation unit only checks that "CLFunctionsEx.h" compiles on its own.
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
new file mode 100644
index 000000000..ae64a6edd
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgOperation::CLArgOperation()
+{
+ // DO NOTHING
+}
+
+void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+ ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+ _input = input;
+ _output = output;
+ _axis = axis;
+ _arg_op = op;
+  // NOTE The axis vector must not contain duplicate values.
+ _num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = _num_of_kernels - 1;
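+  // The reduction is applied one axis at a time: kernel i reduces axis[i] and writes into an
+  // intermediate tensor that feeds kernel i + 1; only the last kernel writes to the user output.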
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _argop_kernels =
+ arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(_axis[i], 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
+ .set_data_layout(input->info()->data_layout()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+  // Build a vector of ICLTensors ordered from the input through the intermediates to the output.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+  // Configure the arg operation for every kernel in the chain
+ for (size_t i = 0; i < _num_of_kernels; i++)
+ {
+ _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
+ }
+}
+
+Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+ const ITensorInfo *output, ArgOperation op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(axis[i], 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+  // Build a vector of ITensorInfo objects ordered from input through intermediates to output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+  // Validate the arg operation for every kernel in the chain
+ for (size_t i = 0; i < num_of_kernels; i++)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
+ }
+
+ return Status{};
+}
+
+void CLArgOperation::run()
+{
+ for (size_t i = 0; i < _num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_argop_kernels[i]);
+ }
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
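+  // When one input is broadcast along dimension 0, replicate its border so the kernel can safely
+  // read a full vector from the single element.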
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 000000000..742fc6f59
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+using namespace arm_compute;
+
+void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
+ k->configure(input, output, input_subtype);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
new file mode 100644
index 000000000..c2e4ca9ff
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+using namespace arm_compute;
+
+void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..2781784ca
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..c6b166163
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"
+
+using namespace arm_compute;
+
+void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input,
+ const arm_compute::ICLTensor *weights,
+ const arm_compute::ICLTensor *biases,
+ arm_compute::ICLTensor *output, bool needs_reshape,
+ const arm_compute::TensorShape &reshape)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
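+  // When a reshape is required, the input is first copied into an intermediate buffer of shape
+  // `reshape` and the fully connected layer consumes that buffer instead of the raw input.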
+ if (_needs_reshape)
+ {
+    // Reshape the input into the requested shape via the intermediate CL buffer
+ auto_init_if_empty(*_cl_buffer.info(),
+ _input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
+ _input->info()->data_layout()));
+ _cl_reshape.configure(_input, &_cl_buffer);
+
+ _cl_fc.configure(&_cl_buffer, _weights, _biases, _output);
+
+ // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_buffer.allocator()->allocate();
+ }
+ else
+ {
+ _cl_fc.configure(_input, _weights, _biases, _output);
+ }
+}
+
+void CLFullyConnectedReshapingLayer::run(void)
+{
+ if (_needs_reshape)
+ _cl_reshape.run();
+
+ _cl_fc.run();
+}
+
+void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
new file mode 100644
index 000000000..6cad9bd2e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+using namespace arm_compute;
+
+void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
+ int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ return CLGatherExKernel::validate(input, indices, output, axis);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..7180e9356
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..86ea5a66d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
+
+void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ ICLTensor *gamma, ICLTensor *beta, float epsilon)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ k->configure(input, output, gamma, beta, epsilon);
+ _kernel = std::move(k);
+}
+
+Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta,
+ float epsilon)
+{
+ return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..be35ea732
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
new file mode 100644
index 000000000..38adedd10
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPReLU.h"
+
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
new file mode 100644
index 000000000..2a34c0664
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+ _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+ _gemm_output(), _add_output(), _is_prepared(false)
+{
+}
+
+Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+ const ITensorInfo *hidden_state, const ITensorInfo *output,
+ const ActivationLayerInfo &info)
+{
+ const int idx_width = 0;
+ const int idx_height = 1;
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+ output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+ recurrent_weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+ recurrent_weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ hidden_state->tensor_shape());
+
+ auto shape_info =
+ TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+ input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
+ ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+ return Status{};
+}
+
+void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *recurrent_weights, const ICLTensor *bias,
+ ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
+ recurrent_weights->info(), bias->info(),
+ hidden_state->info(), output->info(), info));
+
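+ // This function builds the RNN cell as: hidden_state = activation(FC(input, weights, bias)
+ // + GEMM(hidden_state, recurrent_weights)), and then copies hidden_state to output.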
+ const int idx_height = 1;
+ TensorShape shape =
+ compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+ _is_prepared = false;
+
+ _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+ // Manage intermediate buffers and configure
+ _memory_group.manage(&_fully_connected_out);
+ _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+ _memory_group.manage(&_gemm_output);
+ _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+ _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_add_output);
+
+ _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
+ &_add_output, ConvertPolicy::SATURATE);
+
+ _fully_connected_out.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+
+ _activation_kernel.configure(&_add_output, hidden_state, info);
+ _add_output.allocator()->allocate();
+
+ _copy_kernel.configure(hidden_state, output);
+}
+
+void CLRNNLayerEx::run()
+{
+ prepare();
+
+ _memory_group.acquire();
+
+ _fully_connected_kernel.run();
+ _gemm_state_f.run();
+ CLScheduler::get().enqueue(_add_kernel);
+ CLScheduler::get().enqueue(_activation_kernel);
+
+ // Copy the hidden state to the output
+ CLScheduler::get().enqueue(_copy_kernel);
+
+ _memory_group.release();
+}
+
+void CLRNNLayerEx::prepare()
+{
+ if (!_is_prepared)
+ {
+ _fully_connected_kernel.prepare();
+ _gemm_state_f.prepare();
+
+ _is_prepared = true;
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..13a25c901
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+ _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+{
+}
+
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ const ReduceOperation &op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
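+ // One kernel is created per reduction axis. When keep_dims is true the last kernel writes
+ // directly to the output, so one fewer intermediate tensor is needed; otherwise a final
+ // reshape maps the last intermediate tensor to the output.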
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1, false);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ interm_tensors[i].set_data_layout(input->data_layout());
+ interm_tensors[i].set_quantization_info(input->quantization_info());
+ }
+
+ // Build a vector of ITensorInfo pointers ordered sequentially: input, intermediates, output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate ReduceOperation on all kernels
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ }
+
+ if (!keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+ }
+
+ return Status{};
+}
+
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+ _keep_dims = keep_dims;
+
+ // NOTE The axis set must not contain duplicates.
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels =
+ arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+ // Build a vector of ICLTensor pointers ordered sequentially: input, intermediates, output.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ReduceOperation on all kernels
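+ // The kernels are chained: kernel i reads tensors[i] and writes tensors[i + 1], with the
+ // reduced axis of the intermediate shape set to 1 at each step.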
+ TensorShape shape{input->info()->tensor_shape()};
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ shape.set(*it, 1, false);
+ if (!keep_dims || i != (num_of_kernels - 1))
+ {
+ _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _memory_group.manage(&_interm_tensors[i]);
+ }
+ _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+ if (i != 0)
+ {
+ _interm_tensors[i - 1].allocator()->allocate();
+ }
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output);
+ _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate();
+ }
+}
+
+void CLReduceOperation::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ const size_t num_of_kernels = _axis.size();
+ for (size_t i = 0; i < num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_reduce_kernels[i]);
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>();
+ k->configure(input, block_size, padding_size, output);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
new file mode 100644
index 000000000..0f455f96f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
new file mode 100644
index 000000000..80d50ad94
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "../../topk_v2.h"
+
+namespace arm_compute
+{
+
+CLTopKV2::CLTopKV2()
+ : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+ _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+ _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+ _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
+ _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
+ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+ _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+ _reorder_negatives_kernel(), _store_kernel()*/
+{
+}
+
+void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits, int bits)
+{
+ _total_bits = total_bits;
+ _bits = bits;
+ _n = input->info()->tensor_shape()[0];
+
+ // _total_bits must be divisible by _bits.
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
+
+ _k = k;
+ _radix = 1 << bits;
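+ // The radix sort processes _bits bits per pass over _radix (= 2^bits) buckets, so it needs
+ // _total_bits / _bits passes (e.g., 8 passes over 16 buckets for 32-bit keys with bits = 4).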
+
+ _input = input;
+ _values = values;
+ _indices = indices;
+
+ std::string topk_env;
+
+// The GPU implementation is disabled: it currently produces invalid results.
+// TODO Enable the GPU implementation after verification, or remove this code.
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
+ }
+ else if (topk_env == "GPU")
+ {
+ // _n must be divisible by (_GROUPS * _ITEMS)
+ ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
+
+ _hist_buf_size = _radix * _GROUPS * _ITEMS;
+ _glob_sum_buf_size = _HISTOSPLIT;
+
+ _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _hist_buf_size);
+ _glob_sum_buf =
+ cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
+ _in_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _out_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _p_in_key_buf = &_in_key_buf;
+ _p_out_key_buf = &_out_key_buf;
+ _p_in_ind_buf = &_in_ind_buf;
+ _p_out_ind_buf = &_out_ind_buf;
+
+ _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
+ _hist_kernel.configure(&_hist_buf, bits, _n);
+ _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
+ _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _reorder_kernel.configure(&_hist_buf, bits, _n);
+ _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
+ _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
+ _store_kernel.configure(values, indices, k, _n);
+ }
+ else
+#endif // Disable GPU implementation
+ {
+ // DO NOTHING for CPU.
+ }
+}
+
+void CLTopKV2::run()
+{
+ std::string topk_env;
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ run_on_gpu_single_quicksort();
+ }
+ else if (topk_env == "GPU")
+ {
+ run_on_gpu();
+ }
+ else
+#endif
+ {
+ run_on_cpu();
+ }
+}
+
+#if 0
+void CLTopKV2::run_on_gpu_single_quicksort()
+{
+ // This is a single threaded quick sort implementation.
+ CLScheduler::get().enqueue(_qs_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+void CLTopKV2::run_on_gpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // 1. CLTopKV2Init sets the key buffer and the index buffer.
+ // - The key buffer is filled with the values of the layer's input
+ // - The index buffer is filled with the corresponding indices.
+ CLScheduler::get().enqueue(_init_kernel, false);
+
+ int n_passes = _total_bits / _bits;
+
+ // 2. Repeat (total_bits/bits) times.
+ // - total_bits is the number of bits of the data type (e.g., 32 for float)
+ // - bits defines the number of buckets (e.g., 16 buckets when bits is 4)
+ for (int pass = 0; pass < n_passes; ++pass)
+ {
+ arm_compute::CLScheduler::get().sync();
+
+ // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
+ _hist_kernel.setPass(pass, _p_in_key_buf);
+ CLScheduler::get().enqueue(_hist_kernel, false);
+
+ // 2.2. Calculate prefix sum locally with multiple threads
+ CLScheduler::get().enqueue(_scan_hist_kernel, false);
+ // 2.3. Calculate prefix sum within a work group
+ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
+ // 2.4. Calculate global prefix sum
+ CLScheduler::get().enqueue(_paste_hist_kernel, false);
+
+ // 2.5. Reorder keys and indices based on the global prefix sum
+ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_kernel, false);
+
+ cl::Buffer *tmp;
+ // swap key buffers
+ tmp = _p_in_key_buf;
+ _p_in_key_buf = _p_out_key_buf;
+ _p_out_key_buf = tmp;
+
+ // swap index buffers
+ tmp = _p_in_ind_buf;
+ _p_in_ind_buf = _p_out_ind_buf;
+ _p_out_ind_buf = tmp;
+ }
+
+ // 3. Get the first negative index
+ // Because the in and out buffers are swapped at the end of the loop above,
+ // the sorted output now resides in the 'in' buffers.
+ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
+ CLScheduler::get().enqueue(_find_first_negative_kernel, false);
+
+ // 4. Correct the ordering of negatives
+ // - Since radix sort does not take the sign into account, negatives end up sorted as if
+ // they were larger than positives.
+ // The reordered data is stored in _p_out_key_buf and _p_out_ind_buf.
+ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
+ _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
+
+ // 5. Extract top k values from sorted keys and indices.
+ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_store_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+
+#if 0
+ // The code below is left for debugging.
+ int first_neg;
+ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
+ std::cout << "first neg = " << first_neg << std::endl;
+
+ float in_key[_n];
+ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
+ }
+
+ float out_key[_n];
+ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
+ }
+
+ int in_ind[_n];
+ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
+ }
+
+ int out_ind[_n];
+ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
+ }
+
+ int hist_buf[_hist_buf_size];
+ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
+ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
+ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
+ }
+
+ int glob_sum_buf[_glob_sum_buf_size];
+ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
+ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
+ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
+ }
+
+#endif
+}
+#endif // Disable GPU implementation
+
+void CLTopKV2::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ // const Window& w = _topkv2_kernel.window();
+
+ _input->map(q);
+ _values->map(q);
+ _indices->map(q);
+
+ // int row_size = (w[0].end() - w[0].start()) / w[0].step();
+ int row_size = _input->info()->tensor_shape()[0];
+ int rank = _input->info()->num_dimensions();
+
+ if (rank > 2)
+ throw std::runtime_error("Unsupported rank: only 1D or 2D inputs are supported.");
+
+ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
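+ // The input is treated as row_num rows of row_size elements; TopK is applied row-wise on the
+ // CPU after mapping the CL buffers.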
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(), (float *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::S32)
+ {
+ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (int32_t *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (uint8_t *)_values->buffer());
+ }
+ else
+ {
+ throw std::runtime_error("Unsupported data type.");
+ }
+
+ _input->unmap(q);
+ _values->unmap(q);
+ _indices->unmap(q);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
new file mode 100644
index 000000000..40e21671d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _is_prepared(false)
+{
+}
+
+Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+ const unsigned int kernel_x = weights->dimension(idx_w);
+ const unsigned int kernel_y = weights->dimension(idx_h);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
+ "invalid_right must be smaller than kernel_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
+ "invalid_bottom must be smaller than kernel_y");
+
+ // NOTE Compared to the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added.
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+ weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ if (bias != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+ "Output's depth is invalid.");
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, weights_info));
+
+ return Status{};
+}
+
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped);
+
+ // NOTE Compared to the existing CLDeconvolutionLayer, invalid_right and invalid_bottom
+ // were added.
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+ invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(
+ *output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
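+ // The transposed convolution is implemented as an upsample of the input (values spaced by the
+ // stride, plus the computed padding) followed by a stride-1 convolution with flipped weights.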
+ _is_prepared = weights_info.retain_internal_weights();
+
+ _memory_group.manage(&_scaled_output);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+ // to match output shape
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ // Configure the upsample (scale) function
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+ _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
+
+ // Set up the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+ _scaled_output.allocator()->allocate();
+}
+
+void CLTransposeConvLayer::run()
+{
+ prepare();
+
+ _memory_group.acquire();
+
+ _scale_f.run();
+ _conv_f.run();
+
+ _memory_group.release();
+}
+
+void CLTransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _weights_flipped.map(true);
+ _original_weights->map(CLScheduler::get().queue(), true);
+ CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _weights_flipped.unmap();
+ _original_weights->unmap(CLScheduler::get().queue());
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ if (!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
new file mode 100644
index 000000000..0ce3e6700
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
+ : _upsample(),
+ _output(nullptr)
+{
+}
+
+Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
+}
+
+void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _output = output;
+ _upsample.configure(input, _output, inner_border, info);
+}
+
+void CLTransposeConvLayerUpsample::run()
+{
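+ // Pre-fill the whole output with zero (the quantization offset for QASYMM8) so that the
+ // positions not written by the upsample kernel keep the correct zero value.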
+ _output->map(CLScheduler::get().queue(), true);
+ if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
+ {
+ const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+ std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+ }
+ else
+ {
+ memset(_output->buffer(), 0, _output->info()->total_size());
+ }
+ _output->unmap(CLScheduler::get().queue());
+
+ CLScheduler::get().enqueue(_upsample, false);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
new file mode 100644
index 000000000..f8e0ef8a6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>();
+ k->configure(input, output, info);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
new file mode 100644
index 000000000..80fbf359d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/NEFunctionsEx.h"
+
+// NOTE This otherwise-empty file exists to verify that "NEFunctionsEx.h" compiles on its own.
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
new file mode 100644
index 000000000..5ba465b61
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+
+template <ReductionOperation OP>
+NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape()
+{
+}
+
+template <ReductionOperation OP>
+Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+
+ TensorShape out_shape = input->tensor_shape();
+ const int input_dims = input->num_dimensions();
+ int axis_local = axis;
+
+ // Convert negative axis
+ axis_local = wrap_around(axis_local, input_dims);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1);
+ out_shape.remove_dimension(axis_local);
+
+ const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+template <ReductionOperation OP>
+void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ int axis_local = axis;
+ const int input_dims = input->info()->num_dimensions();
+
+ // Convert negative axis
+ axis_local = wrap_around(axis_local, input_dims);
+
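+ // The reduction kernel writes the arg index along 'axis' into an intermediate tensor whose
+ // 'axis' dimension is 1; the reshape below then removes that dimension from the output.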
+ // Perform reduction for axis
+ TensorShape intermediate_shape = input->info()->tensor_shape();
+ intermediate_shape.set(axis_local, 1);
+ auto in = input;
+
+ _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(),
+ output->info()->data_type(),
+ output->info()->quantization_info()));
+ _memory_group.manage(&_reduced_out);
+ _reduction_kernel.configure(in, axis_local, &_reduced_out, OP);
+
+ // Allocate intermediate tensor
+ _reduced_out.allocator()->allocate();
+
+ // Configure reshape layer if we want to drop the dimensions
+ TensorShape out_shape = input->info()->tensor_shape();
+ out_shape.remove_dimension(axis_local);
+ auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_out, output);
+}
+
+template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _reduction_kernel.run();
+ _reshape.run();
+}
+
+// Supported Specializations
+template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>;
+template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>;
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
new file mode 100644
index 000000000..7c15fc453
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
+#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+
+template <BinaryLogicalOperation COP>
+void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
+ ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(COP, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+template <BinaryLogicalOperation COP>
+Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(op, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, BinaryLogicalOperation op)
+{
+ return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output);
+}
+
+// Supported Specializations
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
new file mode 100644
index 000000000..f2490e4e8
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NECast.h"
+
+#include "arm_compute/core/NEON/kernels/NECastKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NECastKernel>();
+ k->configure(input, output, input_subtype);
+ _kernel = std::move(k);
+}
+
+Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype)
+{
+ return NECastKernel::validate(input, output, input_subtype);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
new file mode 100644
index 000000000..db419e3a8
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>();
+ k->configure(input, output, block_shape);
+ _kernel = std::move(k);
+}
+
+Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp
new file mode 100644
index 000000000..a95018a28
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NENegLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>();
+ k->configure(ElementWiseUnaryEx::NEG, input, output);
+ _kernel = std::move(k);
+}
+Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
new file mode 100644
index 000000000..00c3ed94f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
new file mode 100644
index 000000000..d604fedbf
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+
+ return Status{};
+}
+} // namespace
+
+void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output)
+{
+ return NETransposeKernel::validate(input, output);
+}
+
+NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
+ _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
+ _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
+ _accumulate_biases(false), _is_prepared(false)
+{
+}
+
+void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
+}
+
+void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *biases, ITensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _accumulate_biases = false;
+ _original_weights = weights;
+
+ // Configure the accumulate-biases kernel for non-quantized asymmetric types
+ if (biases != nullptr)
+ {
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensor *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ bool is_fc_after_conv;
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(is_fc_after_conv,
+ "NEFullyConnectedHybridLayer does not support a fully connected "
+ "layer after a convolution layer");
+ (void)is_fc_after_conv; // Suppress unused-variable warnings when the assert is compiled out
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights_output.allocator()->init(
+ weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights->info())));
+ _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+
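+ // Hybrid scheme: the F32 input is quantized symmetrically to S8 with one scale factor per
+ // batch row, multiplied with the S8 weights using an integer GEMM, and the S32 result is
+ // scaled back to F32 with scale_factor[row] * weights_scale.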
+ // Quantize input
+ _quantized_input.allocator()->init(
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ _scale_factor.allocator()->init(
+ TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
+ _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
+
+ // GEMM
+ _gemmlowp_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);
+
+ // Multiply scale
+ _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
+ weights->info()->quantization_info().scale);
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+
+ _quantized_input.allocator()->allocate();
+ _scale_factor.allocator()->allocate();
+ _gemmlowp_output.allocator()->allocate();
+}
+
+Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+
+ // Validate accumulate biases kernel if biases are provided
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *weights_to_use = weights;
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+
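+ // Validate the quantize -> integer GEMM -> rescale chain with the same intermediate tensor
+ // shapes that configure() creates.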
+ // Validate quantization kernel
+ const ITensorInfo &quantized_input = TensorInfo(
+ input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
+
+ const ITensorInfo &gemmlowp_output = TensorInfo(
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
+ &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale));
+
+ return Status{};
+}
+
+void NEFullyConnectedHybridLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Quantize input
+ NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY);
+
+ // Run matrix multiply
+ _mm_gemmlowp.run();
+
+ // Multiply scale factor
+ NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY);
+
+ // Accumulate biases if provided
+ if (_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+}
+
+void NEFullyConnectedHybridLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ auto release_unused = [](Tensor *w) {
+ if (!w->is_used())
+ {
+ w->allocator()->free();
+ }
+ };
+
+ // Reshape of the weights (happens only once)
+ if (!_are_weights_reshaped)
+ {
+ // Run reshape weights function
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+
+ _are_weights_reshaped = true;
+ // We cannot release _original_weights because it may still be used by other nodes
+ }
+
+ // Prepare GEMM and release unused weights
+ _mm_gemmlowp.prepare();
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+ _is_prepared = true;
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..a944f699a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+ if (is_data_type_quantized_asymmetric(input.data_type()))
+ {
+ // Since we need negative offsets for computing the matrix multiplication, we need to change
+ // the QuantizationInfo()
+ // Extract and negate input and weights offsets
+ const QuantizationInfo input_quantization_info(input.quantization_info().scale,
+ -input.quantization_info().offset);
+ const QuantizationInfo weights_quantization_info(weights.quantization_info().scale,
+ -weights.quantization_info().offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
+ &input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
+ &input, &weights, nullptr, &output, 1.f, 0.0f,
+ GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
+ _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
+ _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
+ _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
+ _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
+ _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+{
+}
+
+void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ if (_is_quantized)
+ {
+ // Since we need negative offsets for computing the matrix multiplication, we need to change
+ // the QuantizationInfo()
+ // Extract and negate input and weights offsets
+ const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+ input->info()->set_quantization_info(
+ QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+ weights->info()->set_quantization_info(
+ QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
+
+ // Restore the original QuantizationInfo, as input and weights may be used by other fully
+ // connected layers
+ input->info()->set_quantization_info(input_quantization_info);
+ weights->info()->set_quantization_info(weights_quantization_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f,
+ GEMMInfo(false, false, false /* Reshape weights only for the first run */));
+ }
+}
+
+void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(
+ (weights->info()->dimension(1) !=
+ (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be
+ // linearized
+
+ // Initialize output tensor for flatten
+ TensorShape shape_flatten = compute_flatten_shape(input->info());
+ _flatten_output.allocator()->init(
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ shape_flatten));
+
+ // Configure flatten kernel
+ _memory_group.manage(&_flatten_output);
+ _flatten_kernel.configure(input, &_flatten_output);
+
+ // Configure matrix multiply kernel
+ configure_mm(&_flatten_output, weights, output);
+
+ // Allocate the output tensor for flatten once all the configure methods have been called
+ _flatten_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(input, weights, output);
+}
+
+void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *biases, ITensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_converted = true;
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _is_fc_after_conv = true;
+ _accumulate_biases = false;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _original_weights = weights;
+
+ // Configure gemmlowp output
+ if (_is_quantized)
+ {
+ _gemmlowp_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::S32));
+ }
+
+ // Configure accumulate biases kernel for float (non-quantized) types
+ if (biases != nullptr && !_is_quantized)
+ {
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensor *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ if (is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
+ }
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+
+ // Convert weights if needed
+ if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights.configure(weights_to_use, &_converted_weights_output,
+ input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+ weights_to_use = &_converted_weights_output;
+ _are_weights_converted = false;
+ }
+
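+ // For quantized types the matrix multiplication writes into an intermediate S32 buffer that
+ // the output stage below requantizes; for float types it writes directly to the output.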
+ ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+ if (_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(input, weights_to_use, tmp_output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(input, weights_to_use, tmp_output);
+ }
+
+ // Configure output stage for asymmetric quantized types
+ if (_is_quantized)
+ {
+ float multiplier = input->info()->quantization_info().scale *
+ weights->info()->quantization_info().scale /
+ output->info()->quantization_info().scale;
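+ // The requantization multiplier (input_scale * weights_scale / output_scale) is decomposed
+ // into a fixed-point integer multiplier and a right shift for the output stage below.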
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
+ &output_shift);
+ _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
+ output_shift, output->info()->quantization_info().offset);
+ _gemmlowp_output.allocator()->allocate();
+ }
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+ bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+ const ITensorInfo &flatten_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_flatten_shape(input)));
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights =
+ weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
+ const ITensorInfo &gemmlowp_output = TensorInfo(
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+ // Validate accumulate biases kernel for float (non-quantized) types
+ if (biases != nullptr && !is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1;
+ }
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if (is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) !=
+ (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+ input_to_use = &flatten_input;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+ // Validate output stage for asymmetric quantized types
+ if (is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+ &gemmlowp_output, biases, output));
+ }
+
+ return Status{};
+}
+
+void NEFullyConnectedLayerEx::run()
+{
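+ // Preparation happens lazily on the first run: prepare() is currently disabled (see the
+ // block at the bottom of this file), so the reshaped/converted weight buffers are allocated
+ // here instead.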
+ if (!_is_prepared)
+ {
+ if (!_are_weights_reshaped)
+ _reshape_weights_output.allocator()->allocate();
+ if (!_are_weights_converted)
+ _converted_weights_output.allocator()->allocate();
+ _is_prepared = true;
+ }
+
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Reshape of the weights
+ if (!_are_weights_reshaped)
+ {
+ _reshape_weights_function.run();
+ }
+
+ // Convert weights if needed
+ if (!_are_weights_converted)
+ {
+ _convert_weights.run();
+ }
+
+ // Prepare GEMM
+ if (!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+ }
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Linearize input if it comes from a convolutional layer
+ if (_is_fc_after_conv)
+ {
+ NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+ }
+
+ // Run matrix multiply
+ if (_is_quantized)
+ {
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ _mm_gemm.run();
+ }
+
+ // Apply the output stage (quantized path) or accumulate biases (float path)
+ if (_is_quantized)
+ {
+ _gemmlowp_output_stage.run();
+ }
+ else
+ {
+ if (_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+ }
+}
+
+void NEFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ auto release_unused = [](Tensor *w) {
+ if (!w->is_used())
+ {
+ w->allocator()->free();
+ }
+ };
+
+ // Pointer to current weights
+ const ITensor *cur_weights = _original_weights;
+
+ // Reshape of the weights (happens only once)
+ if (!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+
+ cur_weights->mark_as_unused();
+ cur_weights = &_reshape_weights_output;
+ _are_weights_reshaped = true;
+ }
+
+ // Convert weights if needed (happens only once)
+ if (!_are_weights_converted)
+ {
+ _converted_weights_output.allocator()->allocate();
+ _convert_weights.run();
+
+ cur_weights->mark_as_unused();
+ _are_weights_converted = true;
+ }
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+ // Prepare GEMM and release unused weights
+ if (!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+
+ // Release converted weights if unused
+ release_unused(&_reshape_weights_output);
+ release_unused(&_converted_weights_output);
+
+ _is_prepared = true;
+ }
+#endif
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..fcac3c7ae
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
+ const arm_compute::ITensor *weights,
+ const arm_compute::ITensor *biases,
+ arm_compute::ITensor *output, bool needs_reshape,
+ const arm_compute::TensorShape &reshape,
+ KernelType kernel_type)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
+ const ITensor *input_to_use = input;
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+ _neon_reshape.configure(_input, &_neon_buffer);
+ input_to_use = &_neon_buffer;
+ }
+
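+ // Select the backing fully connected implementation: GENERAL always uses
+ // NEFullyConnectedLayerEx, while PREPROCESSED_WEIGHTS uses the hybrid layer for F32 input
+ // with S8 weights and falls back to the stock NEFullyConnectedLayer otherwise.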
+ _neon_fc = [&]() {
+ if (kernel_type == KernelType::GENERAL)
+ {
+ auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+ {
+ assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+ bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+ weights->info()->data_type() == DataType::S8;
+
+ if (is_hybrid)
+ {
+ auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+ {
+ auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ }
+ }();
+
+ // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ if (_needs_reshape)
+ {
+ _neon_buffer.allocator()->allocate();
+ }
+}
+
+void NEFullyConnectedReshapingLayer::run(void)
+{
+ if (_needs_reshape)
+ _neon_reshape.run();
+
+ _neon_fc->run();
+}
+
+void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
new file mode 100644
index 000000000..11794a1ea
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
+ _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+ _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+ _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
+ _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+ _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
+ _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
+ ITensor *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_UNUSED(c);
+ ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
+ a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+
+ const ITensor *matrix_a = a;
+ const ITensor *matrix_b = b;
+ GEMMInfo info = gemm_info;
+
+ // Clear state
+ _mtx_a_reshape_kernel = nullptr;
+ _mtx_b_reshape_kernel = nullptr;
+
+ // Set internal variables
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+ _is_prepared = false;
+ _fused_assembly_path = false;
+ _original_b = b;
+
+ const ITensor *a_to_use = a;
+
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+ _memory_group.manage(&_mm_result_s32);
+ TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+ _mm_result_s32.allocator()->init(info_mm_result_s32);
+ }
+
+#ifdef __aarch64__
+#if 0 // Can use after arm compute library v19.11
+ switch (a->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::U8:
+ case DataType::S8:
+ {
+ if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+ _fused_assembly_path = _asm_glue.is_configured();
+ }
+ else
+ {
+ _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
+ gemm_info);
+ }
+ _assembly_path = _asm_glue.is_configured();
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
+ }
+#endif // 0
+ ARM_COMPUTE_ERROR("aarch64 not supported");
+#endif /* __aarch64__ */
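+ // Reshape A (interleave 4x4) and B (transpose 1xW) only when neither the assembly path nor
+ // the vector-matrix path is used; a single-row A can be multiplied directly.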
+ if (!(_assembly_path || _run_vector_matrix_multiplication))
+ {
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+ // 4.0f) ]
+ TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
+ a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
+ // 16.0f) ]
+ TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
+ b->info()->quantization_info());
+ _tmp_a.allocator()->init(a_info);
+ _tmp_b.allocator()->init(b_info);
+ _memory_group.manage(&_tmp_a);
+ if (!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+
+ // Configure interleave kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(a_to_use, &_tmp_a);
+ _mtx_a_reshape_kernel = std::move(k);
+ }
+
+ // Configure transpose kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = std::move(k);
+ }
+ }
+
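+ // With non-zero offsets the integer GEMM result must be corrected using the row sums of A
+ // and the column sums of B; the reduction kernels below compute those sums and the offset
+ // contribution kernels apply the correction (optionally fused with the output stage).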
+ if (!_fused_assembly_path)
+ {
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0)
+ {
+ TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
+
+ _vector_sum_col.allocator()->init(info_vector_sum_col);
+ if (!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_vector_sum_col);
+ }
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if (_b_offset != 0)
+ {
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
+
+ _vector_sum_row.allocator()->init(info_vector_sum_row);
+ _memory_group.manage(&_vector_sum_row);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
+ false);
+ }
+
+ if (_fuse_output_stage)
+ {
+ // Configure matrix multiply kernel
+ if (!_assembly_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, &_mm_result_s32);
+ _mm_kernel = std::move(k);
+ }
+
+ _offset_contribution_output_stage_kernel.configure(
+ &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
+ _b_offset, info.gemmlowp_output_stage());
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ if (!_assembly_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, output);
+ _mm_kernel = std::move(k);
+ }
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row,
+ a_to_use->info()->dimension(0), _a_offset, _b_offset);
+ }
+ }
+
+ // Allocate tensors
+ if (!_assembly_path && !_run_vector_matrix_multiplication)
+ {
+ _tmp_a.allocator()->allocate();
+ if (!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
+ }
+
+ if (!_fused_assembly_path)
+ {
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ }
+
+ if (_b_offset != 0)
+ {
+ _vector_sum_row.allocator()->allocate();
+ }
+ }
+
+ if (_fuse_output_stage)
+ {
+ _mm_result_s32.allocator()->allocate();
+ }
+}
+
+Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+ const ITensorInfo *c, const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+ "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is "
+ "equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+ "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+ "Matrix B already reshaped is not supported");
+
+ GEMMInfo info = gemm_info;
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ const ITensorInfo *a_to_use = a;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo mm_result_s32_info{};
+
+ int32_t a_offset = a->quantization_info().offset;
+ int32_t b_offset = b->quantization_info().offset;
+
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ if (fuse_output_stage)
+ {
+ auto_init_if_empty(
+ mm_result_s32_info,
+ a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ }
+
+ // Check if we need to run the optimized assembly kernel
+ bool run_optimised = false;
+ bool run_optimised_requantized = false;
+ const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+ if (a_to_use->data_type() == DataType::QASYMM8 &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f,
+ reshape_b_only_on_first_run));
+ run_optimised_requantized = run_optimised;
+ }
+ else
+ {
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(
+ a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f,
+ reshape_b_only_on_first_run));
+ }
+
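+ // When the assembly dispatch can handle the GEMM, only output shape consistency is checked;
+ // otherwise the reference path (interleave/transpose reshapes plus the NEON multiply kernel)
+ // is validated.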
+ if (run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if (info.depth_output_gemm3d() != 0)
+ {
+ if (info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+ "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+ "NEGEMM cannot reinterpret the output tensor as 3D");
+
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ if (!run_vector_matrix_multiplication)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+ // 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
+ // / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+ }
+ }
+
+ if (!run_optimised_requantized)
+ {
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
+
+ // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+ if (a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
+ b, &info_vector_sum_col, a->dimension(0), false));
+ }
+
+ // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+ if (b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
+ a_to_use, &info_vector_sum_row, a->dimension(0), false));
+ }
+
+ if (fuse_output_stage)
+ {
+ if (!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
+ info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ if (!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
+ output, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
+ }
+ }
+ return Status{};
+}
+
+void NEGEMMLowpMatrixMultiplyCoreEx::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Reshape inputs
+ if (_mtx_a_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+ }
+ if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+ {
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ }
+
+ // Run GEMM
+ if (_asm_glue.is_configured())
+ {
+ _asm_glue.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
+
+ if (!_fused_assembly_path)
+ {
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if (_b_offset != 0)
+ {
+ NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ }
+
+ if (_fuse_output_stage)
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+ }
+ else
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+ }
+ }
+}
+
+void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
+{
+ if (!_is_prepared)
+ {
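+ // For a constant B (reshape_b_only_on_first_run), the B reshape and the column-sum reduction
+ // run once here, and the original B tensor is marked as unused afterwards so its memory can
+ // be reclaimed.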
+ // Run assembly reshape
+ if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ _asm_glue.prepare();
+ _original_b->mark_as_unused();
+ }
+ // Run non-assembly reshape
+ else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ // Run reshape kernel and mark original weights tensor as unused
+ _tmp_b.allocator()->allocate();
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ _original_b->mark_as_unused();
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ }
+
+ _is_prepared = true;
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 000000000..90dabb35a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ return NEGatherKernelEx::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
new file mode 100644
index 000000000..624185d2c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
+ ITensor *output, ITensor *hits)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
+
+Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..1c2c8f027
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+ _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma,
+ ITensor *beta, float epsilon)
+{
+ const DataLayout data_layout = input->info()->data_layout();
+
+ // Configure Kernels
+ _is_nchw = data_layout == DataLayout::NCHW;
+
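+ // The normalization kernel expects NCHW data, so NHWC inputs are permuted to NCHW,
+ // normalized, and the result is permuted back to NHWC.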
+ if (!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+ _permute_output.configure(&_permuted_output, output != nullptr ? output : input,
+ PermutationVector(2U, 0U, 1U));
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+ }
+}
+
+Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta,
+ float epsilon)
+{
+ return NEInstanceNormalizationLayerKernelEx::validate(
+ &input->clone()->set_data_layout(DataLayout::NCHW),
+ &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+void NEInstanceNormalizationLayerEx::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if (!_is_nchw)
+ {
+ _permute_input.run();
+ }
+
+ NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+ // Permute output
+ if (!_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
new file mode 100644
index 000000000..1150cef76
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
+
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
new file mode 100644
index 000000000..84411c266
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+ _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+ _gemm_output(), _add_output(), _is_prepared(false)
+{
+}
+
+Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+ const ITensorInfo *hidden_state, const ITensorInfo *output,
+ const ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+ output);
+
+ const int idx_width = 0;
+ const int idx_height = 1;
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+ recurrent_weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+ recurrent_weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ hidden_state->tensor_shape());
+
+ auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(
+ recurrent_weights, hidden_state->dimension(idx_height)),
+ 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
+ &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+ return Status{};
+}
+
+void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *recurrent_weights, const ITensor *bias,
+ ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
+ recurrent_weights->info(), bias->info(),
+ hidden_state->info(), output->info(), info));
+
+ const int idx_height = 1;
+ TensorShape shape = misc::shape_calculator::compute_rnn_shape(
+ recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+ _is_prepared = false;
+
+  // Initialize intermediate buffers
+ _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+ // Manage intermediate buffers and configure
+ _memory_group.manage(&_fully_connected_out);
+ _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+ _memory_group.manage(&_gemm_output);
+ _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+ _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_add_output);
+
+ _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
+ ConvertPolicy::SATURATE);
+
+ _fully_connected_out.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+
+ _activation_kernel.configure(&_add_output, hidden_state, info);
+ _add_output.allocator()->allocate();
+
+ _copy_kernel.configure(hidden_state, output);
+}
+
+void NERNNLayerEx::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _fully_connected_kernel.run();
+
+ _gemm_state_f.run();
+
+ NEScheduler::get().schedule(&_add_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
+
+ // copy hidden out to output
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+}
+
+void NERNNLayerEx::prepare()
+{
+ if (!_is_prepared)
+ {
+ _fully_connected_kernel.prepare();
+ _gemm_state_f.prepare();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
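For reference, a minimal numeric sketch of the dataflow that NERNNLayerEx::configure() wires up above: the new hidden state is activation(FullyConnected(input, weights, bias) + hidden_state * recurrent_weights), and the result is then copied to the output tensor. This is plain C++ for illustration only (TANH chosen arbitrarily as the activation), not the ACL API:

#include <cmath>
#include <iostream>
#include <vector>

int main()
{
  // Toy sizes: 2 inputs, 2 hidden units, batch of 1.
  const std::vector<float> x = {1.0f, 2.0f};                 // input
  const std::vector<std::vector<float>> W = {{0.5f, -0.5f},  // weights (hidden x input)
                                             {0.25f, 0.75f}};
  const std::vector<std::vector<float>> R = {{0.1f, 0.0f},   // recurrent weights (hidden x hidden)
                                             {0.0f, 0.1f}};
  const std::vector<float> b = {0.0f, 0.1f};                 // bias
  const std::vector<float> h = {0.2f, -0.2f};                // previous hidden state

  std::vector<float> h_new(2);
  for (int i = 0; i < 2; ++i)
  {
    float acc = b[i];
    for (int j = 0; j < 2; ++j)
      acc += W[i][j] * x[j] + R[i][j] * h[j]; // FullyConnected(x) + h * recurrent weights
    h_new[i] = std::tanh(acc);                // activation kernel
  }

  // h_new is the updated hidden state; the copy kernel duplicates it into the output.
  for (float v : h_new)
    std::cout << v << ' ';
  std::cout << std::endl;
  return 0;
}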
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
new file mode 100644
index 000000000..c65e93570
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels =
+ arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
+ _reduced_outs =
+ arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape()
+ : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info())
+ .set_data_layout(output->info()->data_layout()));
+ _memory_group.manage(_reduced_outs.get() + i);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i],
+ ReductionOperation::MEAN_SUM);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ }
+}
+
+void NEReduceMeanEx::run()
+{
+ _memory_group.acquire();
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+ _memory_group.release();
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
new file mode 100644
index 000000000..b36f8287a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output, ReduceOperation op)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape =
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], op);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info()));
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+ }
+}
+
+void NEReduceOperation::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
new file mode 100644
index 000000000..3c18217ef
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape =
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info())
+ .set_data_layout(input->info()->data_layout()));
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
+ ReductionOperation::SUM);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+ }
+}
+
+void NEReduceSum::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
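The configure() logic above decomposes a multi-axis reduction into a chain of single-axis reductions over managed intermediate tensors, followed by an optional reshape when keep_dims is false. A standalone sketch of that strategy (plain C++, not the ACL API) for a 2x3 reduce-sum over axes {0, 1}:

#include <iostream>
#include <vector>

int main()
{
  // 2x3 input, reduce over axes {0, 1}.
  const std::vector<std::vector<float>> in = {{1, 2, 3}, {4, 5, 6}};

  // Step 1: reduce axis 0 -> 1x3 intermediate (a managed tensor in the function above).
  std::vector<float> step1(3, 0.0f);
  for (size_t c = 0; c < 3; ++c)
    for (size_t r = 0; r < 2; ++r)
      step1[c] += in[r][c]; // {5, 7, 9}

  // Step 2: reduce axis 1 of the intermediate -> 1x1.
  float step2 = 0.0f;
  for (float v : step1)
    step2 += v; // 21

  // With keep_dims == false, the trailing reshape drops the size-1 dimensions.
  std::cout << step2 << std::endl; // prints 21
  return 0;
}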
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
new file mode 100644
index 000000000..c3431c418
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Define dimension to split the window
+ *
+ * @param[in] axis Reduction axis
+ *
+ * @return The dimension to split the window
+ */
+size_t reduction_window_split_dimension(unsigned int axis)
+{
+ switch (axis)
+ {
+ case 0:
+ return Window::DimY;
+ case 1:
+ case 2:
+ case 3:
+ return Window::DimX;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+} // namespace
+
+NEReductionOperationEx::NEReductionOperationEx()
+ : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
+{
+}
+
+Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op));
+
+ return Status{};
+}
+
+void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ NEReductionOperationEx::validate(input->info(), output->info(), axis, op));
+
+ // Configure reduction kernel
+ _reduction_kernel.configure(input, output, axis, op);
+ _window_split = reduction_window_split_dimension(axis);
+ _reduction_axis = axis;
+
+ if (axis == 0)
+ {
+ // Configure fill border kernel
+ const BorderSize fill_border_size = _reduction_kernel.border_size();
+ PixelValue pixelValue;
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ switch (input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ pixelValue = PixelValue(std::numeric_limits<float>::max());
+ break;
+ }
+ case DataType::F16:
+ {
+ pixelValue = PixelValue(static_cast<half>(65504.0f));
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ pixelValue =
+ PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported DataType");
+ }
+ }
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ switch (input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ pixelValue = PixelValue(-std::numeric_limits<float>::max());
+ break;
+ }
+ case DataType::F16:
+ {
+ pixelValue = PixelValue(static_cast<half>(-65504.0f));
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ pixelValue =
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported DataType");
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Reduction Operation unsupported");
+ }
+ _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
+ }
+}
+
+void NEReductionOperationEx::run()
+{
+ if (_reduction_axis == 0)
+ {
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+}
+} // namespace arm_compute
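The border fill above uses what is effectively the identity element of the reduction for the tensor's data type (largest representable value for MIN, smallest for MAX, 255/0 for QASYMM8), so border pixels can never affect the result. A minimal sketch of that property in plain C++ (not the ACL API):

#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

int main()
{
  std::vector<float> row = {3.0f, 1.0f, 2.0f};
  const float before = *std::min_element(row.begin(), row.end()); // 1

  // Simulate the constant border used for ReduceOperation::MIN on F32.
  row.push_back(std::numeric_limits<float>::max());
  const float after = *std::min_element(row.begin(), row.end()); // still 1

  std::cout << before << " " << after << std::endl;
  return 0;
}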
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
new file mode 100644
index 000000000..c9f914fb0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NESpaceToBatchLayerEx::NESpaceToBatchLayerEx()
+ : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+{
+}
+
+void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape,
+ const ITensor *paddings, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(
+ output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+ }
+ _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x,
+ const int block_shape_y, const Size2D &padding_left,
+ const Size2D &padding_right, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(
+ output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+ }
+ _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right,
+ output);
+}
+
+Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape,
+ const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+ return Status{};
+}
+
+Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x,
+ const int block_shape_y, const Size2D &padding_left,
+ const Size2D &padding_right, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(
+ input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+ return Status{};
+}
+
+void NESpaceToBatchLayerEx::run()
+{
+ // Zero out output only if we have paddings
+ if (_has_padding)
+ {
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
new file mode 100644
index 000000000..b6ae21cc0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>();
+ k->configure(input, output, block_shape);
+ _kernel = std::move(k);
+}
+
+Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape));
+ return Status{};
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
new file mode 100644
index 000000000..fd15ef05f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _conv_f(),
+ _upsample_f(),
+ _flip_weights(),
+ _permute_input(),
+ _permute_weights(),
+ _permute_output(),
+ _scaled_output(),
+ _weights_flipped(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_output(),
+ _is_nchw(false),
+ _original_weights(nullptr),
+ _input(nullptr),
+ _info(),
+ _is_prepared(false)
+{
+}
+
+Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, const ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
+ const unsigned int width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
+ weights->dimension(height_idx), info, invalid_right, invalid_bottom);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ if (is_data_type_quantized_asymmetric(input->data_type()) && bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else if (bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+
+ if (output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(),
+ "Output's dim 0 is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(),
+ "Output's dim 1 is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(),
+ "Output's dim 2 is invalid.");
+ }
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(
+ input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+ scale_out_info.set_data_layout(input->data_layout());
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ const unsigned int batches_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) !=
+ scale_out_info.dimension(batches_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) !=
+ scale_out_info.dimension(channel_idx));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, WeightsInfo()));
+
+ return Status{};
+}
+
+void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias,
+ ITensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ _input = input;
+ _original_weights = weights;
+ _info = info;
+ _is_prepared = false;
+ _is_nchw = data_layout == DataLayout::NCHW;
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const unsigned int width_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
+ invalid_right, invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
+ _memory_group.manage(&_scaled_output);
+
+ if (!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_weights);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permuted_input.info()->set_quantization_info(input->info()->quantization_info());
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from NHWC -> NCHW
+ _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info());
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
+ // order to match output shape
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right,
+ invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(),
+ _permuted_input.info()->quantization_info());
+ scale_out_info.set_data_layout(DataLayout::NCHW);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::CEIL);
+ _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
+
+ _weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
+ _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info());
+ _flip_weights.configure(&_permuted_weights, &_weights_flipped);
+
+ // setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ const auto out_shape = output->info()->tensor_shape();
+ TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]};
+ TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(),
+ output->info()->quantization_info());
+ _permuted_output.allocator()->init(permuted_out_info);
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info);
+
+ // Configure the function to transform the convoluted output to NHWC
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
+ // order to match output shape
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ _scaled_output.allocator()->init(scale_out_info);
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+ _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped);
+
+ // setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+ }
+ _scaled_output.allocator()->allocate();
+}
+
+void NETransposeConvLayer::run()
+{
+ prepare();
+
+ // MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if (!_is_nchw)
+ {
+ _permute_input.run();
+ }
+
+ _upsample_f.run();
+ _conv_f.run();
+
+ // Permute output
+ if (!_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+
+void NETransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ // Permute weights
+ if (!_is_nchw)
+ {
+ _permute_weights.run();
+ }
+ NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ if (!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp
new file mode 100644
index 000000000..67e1bfb02
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/misc/functions/GenericGather.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output)
+{
+ return (input->num_dimensions() != 4 && output->num_dimensions() == 4 &&
+ input->data_layout() == DataLayout::NCHW);
+}
+
+void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices,
+ arm_compute::ITensor *output, int axis)
+{
+ _input = input;
+ _indices = indices;
+ _output = output;
+ _axis = axis;
+
+ arm_compute::PermutationVector pv;
+ if (shouldPermute(input->info(), output->info()))
+ {
+ // NOTE This vector comes from CLPermuteKernel implementation
+ //
+ // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H /
+ // C
+ //
+ // Original | Permuted
+ // 0 | C | W (from 1)
+ // 1 | W | H (from 2)
+ // 2 | H | C (from 0)
+ //
+ pv = arm_compute::PermutationVector{1, 2, 0};
+ }
+
+ if (utils::isGpuMode())
+ {
+ if (shouldPermute(input->info(), output->info()))
+ {
+ _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis);
+ _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv);
+
+ // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis);
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Not supported, yet");
+ }
+}
+
+void GenericGather::run(void)
+{
+ if (utils::isGpuMode())
+ {
+ _cl_gather.run();
+ if (shouldPermute(_input->info(), _output->info()))
+ {
+ _cl_permute.run();
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Not supported, yet");
+ }
+}
+
+} // namespace misc
+} // namespace arm_compute
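The PermutationVector{1, 2, 0} chosen above follows the table in the comment: output dimension d takes its extent from input dimension pv[d], turning a C / W / H shape into W / H / C before the result is handed to CLPermute. An illustrative standalone sketch of that index mapping (plain C++, not the ACL classes):

#include <array>
#include <cassert>

int main()
{
  const std::array<int, 3> chw = {8, 32, 16}; // C, W, H extents of the gather output
  const std::array<int, 3> pv = {1, 2, 0};    // PermutationVector{1, 2, 0}

  std::array<int, 3> permuted{};
  for (int d = 0; d < 3; ++d)
    permuted[d] = chw[pv[d]]; // output dimension d comes from input dimension pv[d]

  assert(permuted[0] == 32); // W
  assert(permuted[1] == 16); // H
  assert(permuted[2] == 8);  // C
  return 0;
}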
diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp
new file mode 100644
index 000000000..8025ae28e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+namespace
+{
+
+bool shouldPermute(const arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output)
+{
+ return (input->num_dimensions() == 4 || output->num_dimensions() == 4) &&
+ (input->num_dimensions() != output->num_dimensions() &&
+ input->data_layout() == DataLayout::NCHW);
+}
+
+} // namespace
+
+void GenericReshapeLayer::configure(const arm_compute::ITensor *input, arm_compute::ITensor *output)
+{
+ _input = input;
+ _output = output;
+
+ arm_compute::PermutationVector pv;
+ if (input->info()->data_layout() == DataLayout::NCHW && input->info()->num_dimensions() == 4 &&
+ output->info()->num_dimensions() != 4)
+ {
+ // NOTE This vector comes from CLPermuteKernel implementation
+ //
+ // This implementation permutes a tensor of shape W / H / C into another tensor of shape
+ // C / W / H
+ //
+ // Original | Permuted
+ // 0 | W | C (from 2)
+ // 1 | H | W (from 0)
+ // 2 | C | H (from 1)
+ //
+ pv = arm_compute::PermutationVector{2, 0, 1};
+ }
+ else if (input->info()->data_layout() == DataLayout::NCHW &&
+ input->info()->num_dimensions() != 4 && output->info()->num_dimensions() == 4)
+ {
+ // NOTE This vector comes from CLPermuteKernel implementation
+ //
+ // This implementation permutes a tensor of shape C / W / H into another tensor of shape
+ // W / H / C
+ //
+ // Original | Permuted
+ // 0 | C | W (from 1)
+ // 1 | W | H (from 2)
+ // 2 | H | C (from 0)
+ //
+ pv = arm_compute::PermutationVector{1, 2, 0};
+ }
+
+ if (utils::isGpuMode())
+ {
+ const auto const_input = CAST_CL(const_cast<arm_compute::ITensor *>(input));
+ if (shouldPermute(input->info(), output->info()))
+ {
+ _cl_permute.configure(const_input, &_cl_permuted, pv);
+ _cl_reshape.configure(&_cl_permuted, CAST_CL(output));
+
+ // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _cl_reshape.configure(const_input, CAST_CL(output));
+ }
+ }
+ else
+ {
+ if (shouldPermute(input->info(), output->info()))
+ {
+ _neon_permute.configure(input, &_neon_permuted, pv);
+ _neon_reshape.configure(&_neon_permuted, output);
+
+ // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _neon_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _neon_reshape.configure(input, output);
+ }
+ }
+}
+
+void GenericReshapeLayer::run(void)
+{
+ if (utils::isGpuMode())
+ {
+ if (shouldPermute(_input->info(), _output->info()))
+ {
+ _cl_permute.run();
+ }
+ _cl_reshape.run();
+ }
+ else
+ {
+ if (shouldPermute(_input->info(), _output->info()))
+ {
+ _neon_permute.run();
+ }
+ _neon_reshape.run();
+ }
+}
+
+} // namespace misc
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp
new file mode 100644
index 000000000..44a4bb9ed
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/misc/functions/Utils.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace utils
+{
+
+bool isGpuMode()
+{
+ char *neon = std::getenv("NEON");
+ if (neon == nullptr)
+ return true;
+ else if (neon[0] == '1')
+ return false;
+ return true;
+}
+
+} // namespace utils
+} // namespace misc
+} // namespace arm_compute
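As a usage note, isGpuMode() above keys off the NEON environment variable: GPU (CL) mode is the default, and NEON=1 switches to the CPU path. A hedged standalone sketch of the same check, re-stated here for illustration rather than taken from the exported API (setenv/unsetenv are POSIX):

#include <cstdlib>
#include <iostream>

// Simplified re-statement of isGpuMode() above, for illustration only.
static bool is_gpu_mode()
{
  const char *neon = std::getenv("NEON");
  return !(neon != nullptr && neon[0] == '1');
}

int main()
{
  setenv("NEON", "1", /*overwrite=*/1);
  std::cout << std::boolalpha << is_gpu_mode() << std::endl; // false: CPU (NEON) path

  unsetenv("NEON");
  std::cout << std::boolalpha << is_gpu_mode() << std::endl; // true: GPU (CL) path
  return 0;
}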
diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h
new file mode 100644
index 000000000..f94effea1
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/topk_v2.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file topk_v2.h
+ * @brief This file contains TopK method and TopContainer class for TopK operation
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+typedef int32_t int32;
+
+namespace nnfw
+{
+namespace rt
+{
+namespace optimized_ops
+{
+/**
+ * @brief class to define TopK operation
+ * @note The following code is implemented and modified with reference to the TFLite topk_v2.cc file.
+ * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, and TENSOR_INT32, unlike
+ * TFLite.
+ * (TFLite additionally supports kTfLiteInt64.)
+ *
+ * The class that collects top indexes of k values. Based on template
+ * tensorflow::gtl::TopN<> but, for optimization,
+ * it re-uses the same container.
+ */
+template <typename T> class TopContainer
+{
+public:
+ /**
+   * @brief Prevent use of the default constructor of this class
+ */
+ TopContainer() = delete;
+ /**
+ * @brief Constructor with params
+   * @param [in] k        The number of top predictions to keep
+   * @param [in] row_size Size of a row in the data
+ */
+ TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
+ {
+ container_.reserve(std::min(k, row_size) + 1);
+ }
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ */
+ TopContainer(const TopContainer &) = delete;
+  /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ * @return Reference of TopContainer
+ */
+ TopContainer &operator=(const TopContainer &) = delete;
+
+ /**
+ * @brief Start collecting
+ * @param [in] values To set as values
+ * @return N/A
+ */
+ void start_collecting(const T *values)
+ {
+ values_ = values;
+ container_.clear();
+ }
+
+ /**
+ * @brief Push a value to be compared for topk
+ * @param [in] a A value to compare
+ * @return N/A
+ */
+ void push(int32 a)
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)k_)
+ {
+ container_.push_back(a);
+ if (container_.size() == (size_t)(k_ + 1))
+ {
+ std::make_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+ else if (comparator(a, container_.front()))
+ {
+ container_.back() = a;
+ std::push_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+
+ /**
+ * @brief Get sorted result from pushed values
+ * @return Reference of vector with sorted values
+ */
+ const std::vector<int32> &sorted_result()
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)(k_))
+ {
+ std::sort(container_.begin(), container_.end(), comparator);
+ }
+ else
+ {
+ std::sort_heap(container_.begin(), container_.end() - 1, comparator);
+ container_.resize(k_);
+ }
+ return container_;
+ }
+
+private:
+ int32 k_;
+ std::vector<int32> container_;
+ const T *values_ = nullptr;
+
+ bool compare_fun(int32 a, int32 b) const
+ {
+ if (values_[b] < values_[a])
+ {
+ return true;
+ }
+ else if (values_[b] > values_[a])
+ {
+ return false;
+ }
+ else
+ {
+ return a < b;
+ }
+ }
+};
+
+/**
+ * @brief Operates TopK operation with params
+ * @param [in] row_size Size of row in data
+ * @param [in] num_rows The number of rows in data
+ * @param [in] data To be operated in
+ * @param [in] k The top k predictions
+ * @param [out] output_indexes Indexes of targets in the top k predictions
+ * @param [out] output_values Values of targets in the top k predictions
+ * @return N/A
+ */
+template <typename T>
+void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
+ T *output_values)
+{
+ TopContainer<T> topc(k, row_size);
+ for (int row = 0; row < num_rows; ++row)
+ {
+ const T *values_row = data + row * row_size;
+ topc.start_collecting(values_row);
+ for (int32 c = 0; c < row_size; ++c)
+ {
+ topc.push(c);
+ }
+
+ // Prepare output buffers.
+ int32 *indexes_row = output_indexes + row * k;
+ T *output_row = output_values + row * k;
+ // We always assume that the output is sorted.
+ const auto &top_k = topc.sorted_result();
+ std::copy(top_k.begin(), top_k.end(), indexes_row);
+ std::transform(top_k.begin(), top_k.end(), output_row,
+ [values_row](const int32 loc) { return values_row[loc]; });
+ }
+}
+
+} // namespace optimized_ops
+} // namespace rt
+} // namespace nnfw
+
+#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
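For reference, a minimal usage sketch of the TopK helper defined above (editorial example, not part of the patch). It assumes the runtime's int32 typedef resolves to int32_t and that the header is reachable as topk_v2.h; with one row of five scores and k = 2 it should report indexes 1 and 3.

#include <cstdint>
#include <iostream>

#include "topk_v2.h" // assumed include path for the header added above

int main()
{
  // One row of five scores; ask for the top two.
  const float data[5] = {0.1f, 0.9f, 0.3f, 0.7f, 0.5f};
  int32_t indexes[2] = {0, 0};
  float values[2] = {0.f, 0.f};
  nnfw::rt::optimized_ops::TopK<float>(/*row_size=*/5, /*num_rows=*/1, data, /*k=*/2, indexes,
                                       values);
  // Expected: 1 -> 0.9, 3 -> 0.7 (sorted by value, ties broken by lower index).
  for (int i = 0; i < 2; ++i)
  {
    std::cout << indexes[i] << " -> " << values[i] << std::endl;
  }
  return 0;
}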
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
new file mode 100644
index 000000000..5ea6cdadd
--- /dev/null
+++ b/compute/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectories()
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
new file mode 100644
index 000000000..9ddec350b
--- /dev/null
+++ b/compute/cker/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_library(nnfw_lib_cker INTERFACE)
+
+nnfw_find_package(Eigen QUIET)
+option(BUILD_CKER_OPTIMIZE "Build optimized cker library" ON)
+
+if(Eigen_FOUND AND BUILD_CKER_OPTIMIZE)
+ target_link_libraries(nnfw_lib_cker INTERFACE eigen)
+ target_compile_definitions(nnfw_lib_cker INTERFACE CKER_OPTIMIZED_EIGEN)
+endif(Eigen_FOUND AND BUILD_CKER_OPTIMIZE)
+
+target_include_directories(nnfw_lib_cker INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
diff --git a/compute/cker/README.md b/compute/cker/README.md
new file mode 100644
index 000000000..149320ffc
--- /dev/null
+++ b/compute/cker/README.md
@@ -0,0 +1,7 @@
+# cker
+
+cker - Portable CPU kernel library
+
+__cker__ means `CPU kernel`
+
+Currently, __cker__ is a port of TensorFlow Lite's reference_op kernels (TensorFlow 1.12) and gemmlowp.
diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h
new file mode 100644
index 000000000..39449c68f
--- /dev/null
+++ b/compute/cker/include/cker/Shape.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SHAPE_H__
+#define __NNFW_CKER_SHAPE_H__
+
+#include <algorithm>
+#include <cstring>
+#include <cassert>
+#include <vector>
+
+#define UNUSED_RELEASE(a) (void)(a)
+
+namespace nnfw
+{
+namespace cker
+{
+
+class Shape
+{
+public:
+ // Shapes with dimensions up to 4 are stored directly in the structure, while
+ // larger shapes are separately allocated.
+ static constexpr int kMaxSmallSize = 4;
+
+ Shape &operator=(Shape const &) = delete;
+
+ Shape() : _size(0) {}
+
+ explicit Shape(int dimensions_count) : _size(dimensions_count)
+ {
+ if (dimensions_count > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[dimensions_count];
+ }
+ }
+
+ Shape(int shape_size, int32_t value) : _size(0)
+ {
+ Resize(shape_size);
+ for (int i = 0; i < shape_size; ++i)
+ {
+ SetDim(i, value);
+ }
+ }
+
+ Shape(int dimensions_count, const int32_t *dims_data) : _size(0)
+ {
+ ReplaceWith(dimensions_count, dims_data);
+ }
+
+ Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); }
+
+ // Avoid using this constructor. We should be able to delete it when C++17
+ // rolls out.
+ Shape(Shape const &other) : _size(other.DimensionsCount())
+ {
+ if (_size > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[_size];
+ }
+ std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size);
+ }
+
+ bool operator==(const Shape &comp) const
+ {
+ return this->_size == comp._size &&
+ std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0;
+ }
+
+ ~Shape()
+ {
+ if (_size > kMaxSmallSize)
+ {
+ delete[] _dims_pointer;
+ }
+ }
+
+ inline int32_t DimensionsCount() const { return _size; }
+ inline int32_t Dims(int i) const
+ {
+ assert(i >= 0);
+ assert(i < _size);
+ return _size > kMaxSmallSize ? _dims_pointer[i] : _dims[i];
+ }
+ inline void SetDim(int i, int32_t val)
+ {
+ assert(i >= 0);
+ assert(i < _size);
+ if (_size > kMaxSmallSize)
+ {
+ _dims_pointer[i] = val;
+ }
+ else
+ {
+ _dims[i] = val;
+ }
+ }
+
+ inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; }
+ inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; }
+ // The caller must ensure that the shape is no bigger than 4-D.
+ inline const int32_t *DimsDataUpTo4D() const { return _dims; }
+
+ inline void Resize(int dimensions_count)
+ {
+ if (_size > kMaxSmallSize)
+ {
+ delete[] _dims_pointer;
+ }
+ _size = dimensions_count;
+ if (dimensions_count > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[dimensions_count];
+ }
+ }
+
+ inline void ReplaceWith(int dimensions_count, const int32_t *dims_data)
+ {
+ Resize(dimensions_count);
+ int32_t *dst_dims = DimsData();
+ std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
+ }
+
+ template <typename T> inline void BuildFrom(const T &src_iterable)
+ {
+ const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end());
+ Resize(dimensions_count);
+ int32_t *data = DimsData();
+ for (auto it : src_iterable)
+ {
+ *data = it;
+ ++data;
+ }
+ }
+
+ // This will probably be factored out. Old code made substantial use of 4-D
+ // shapes, and so this function is used to extend smaller shapes. Note that
+ // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be
+ // reduced, and (b) some kernels are strictly 4-D, but then the shapes of their
+ // inputs should already be 4-D, so this function should not be needed.
+ inline static Shape ExtendedShape(int new_shape_size, const Shape &shape)
+ {
+ return Shape(new_shape_size, shape, 1);
+ }
+
+ inline void BuildFrom(const std::initializer_list<int> init_list)
+ {
+ BuildFrom<const std::initializer_list<int>>(init_list);
+ }
+
+ // Returns the total count of elements, that is the size when flattened into a
+ // vector.
+ inline int FlatSize() const
+ {
+ int buffer_size = 1;
+ const int *dims_data = DimsData();
+ for (int i = 0; i < _size; i++)
+ {
+ const int dim = dims_data[i];
+ assert(dim >= 1);
+ buffer_size *= dim;
+ }
+ return buffer_size;
+ }
+
+ bool operator!=(const Shape &comp) const { return !((*this) == comp); }
+
+private:
+ // For use only by ExtendedShape(), written to guarantee (return-value) copy
+ // elision in C++17.
+ // This creates a shape padded to the desired size with the specified value.
+ Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0)
+ {
+ assert(new_shape_size >= shape.DimensionsCount());
+ assert(new_shape_size <= kMaxSmallSize);
+ Resize(new_shape_size);
+ const int size_increase = new_shape_size - shape.DimensionsCount();
+ for (int i = 0; i < size_increase; ++i)
+ {
+ SetDim(i, pad_value);
+ }
+ std::memcpy(DimsData() + size_increase, shape.DimsData(),
+ sizeof(int32_t) * shape.DimensionsCount());
+ }
+
+ int32_t _size;
+ union {
+ int32_t _dims[kMaxSmallSize];
+ int32_t *_dims_pointer;
+ };
+};
+
+inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
+{
+ UNUSED_RELEASE(shape2);
+ UNUSED_RELEASE(index2);
+ assert(shape1.Dims(index1) == shape2.Dims(index2));
+ return shape1.Dims(index1);
+}
+
+inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); }
+
+inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3)
+{
+ assert(shape.DimensionsCount() == 4);
+ const int *dims_data = shape.DimsDataUpTo4D();
+ assert(i0 >= 0 && i0 < dims_data[0]);
+ assert(i1 >= 0 && i1 < dims_data[1]);
+ assert(i2 >= 0 && i2 < dims_data[2]);
+ assert(i3 >= 0 && i3 < dims_data[3]);
+ return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
+
+inline int FlatSizeSkipDim(const Shape &shape, int skip_dim)
+{
+ const int dims_count = shape.DimensionsCount();
+ assert(skip_dim >= 0 && skip_dim < dims_count);
+ const auto *dims_data = shape.DimsData();
+ int flat_size = 1;
+ for (int i = 0; i < dims_count; ++i)
+ {
+ flat_size *= (i == skip_dim) ? 1 : dims_data[i];
+ }
+ return flat_size;
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0)
+{
+ UNUSED_RELEASE(check_shape_0);
+ assert(shape.DimensionsCount() == check_shape_0.DimensionsCount());
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ return shape.FlatSize();
+}
+
+inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0,
+ const Shape &check_shape_1)
+{
+ UNUSED_RELEASE(check_shape_0);
+ assert(shape.DimensionsCount() == check_shape_0.DimensionsCount());
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ return MatchingFlatSize(shape, check_shape_1);
+}
+
+inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0)
+{
+ UNUSED_RELEASE(check_shape_0);
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ if (i != skip_dim)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ }
+ return FlatSizeSkipDim(shape, skip_dim);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SHAPE_H__
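A small sketch (editorial, assuming the cker include directory is on the include path) of how Shape, Offset and ExtendedShape above fit together for an NHWC tensor:

#include <cassert>

#include "cker/Shape.h"

int main()
{
  using nnfw::cker::Shape;
  // A 1x2x2x3 NHWC tensor; more than kMaxSmallSize (4) dimensions would be heap-allocated.
  const Shape shape{1, 2, 2, 3};
  assert(shape.FlatSize() == 12);
  // Linear offset of element (batch=0, y=1, x=0, channel=2): ((0*2+1)*2+0)*3+2 == 8.
  assert(nnfw::cker::Offset(shape, 0, 1, 0, 2) == 8);
  // Padding a 2-D shape to 4-D prepends ones.
  Shape padded = Shape::ExtendedShape(4, Shape{2, 3});
  assert(padded.Dims(0) == 1 && padded.Dims(1) == 1 && padded.Dims(2) == 2 && padded.Dims(3) == 3);
  return 0;
}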
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
new file mode 100644
index 000000000..85654b040
--- /dev/null
+++ b/compute/cker/include/cker/Types.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TYPES_H__
+#define __NNFW_CKER_TYPES_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace cker
+{
+
+enum class FusedActivationFunctionType
+{
+ kNone = 0,
+ kRelu6 = 1,
+ kRelu1 = 2,
+ kRelu = 3,
+};
+enum class PaddingType
+{
+ kNone = 0,
+ kSame = 1,
+ kValid = 2,
+};
+
+struct PaddingValues
+{
+ int16_t width;
+ int16_t height;
+};
+
+struct PoolParams
+{
+ FusedActivationFunctionType activation;
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ int stride_height;
+ int stride_width;
+ int filter_height;
+ int filter_width;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+struct SoftmaxParams
+{
+ // beta is not really used (not a Tensorflow parameter) and not implemented
+ // for LogSoftmax.
+ double beta;
+ // uint8 inference params. Used even when beta defaults to 1.0.
+ int32_t input_multiplier;
+ int32_t input_left_shift;
+ // Reverse scaling is only used by LogSoftmax.
+ int32_t reverse_scaling_divisor;
+ int32_t reverse_scaling_right_shift;
+ int diff_min;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TYPES_H__
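The clamp fields in PoolParams are what the float kernels actually apply. As a sketch only, a hypothetical helper (SetFloatActivation is not part of the patch) that maps a fused activation onto them; the [-1, 1] range for kRelu1 follows the usual TFLite convention and is an assumption here:

#include <limits>

#include "cker/Types.h"

void SetFloatActivation(nnfw::cker::PoolParams &params,
                        nnfw::cker::FusedActivationFunctionType activation)
{
  using nnfw::cker::FusedActivationFunctionType;
  params.activation = activation;
  switch (activation)
  {
    case FusedActivationFunctionType::kRelu:
      params.float_activation_min = 0.0f;
      params.float_activation_max = std::numeric_limits<float>::max();
      break;
    case FusedActivationFunctionType::kRelu6:
      params.float_activation_min = 0.0f;
      params.float_activation_max = 6.0f;
      break;
    case FusedActivationFunctionType::kRelu1:
      params.float_activation_min = -1.0f;
      params.float_activation_max = 1.0f;
      break;
    case FusedActivationFunctionType::kNone:
    default:
      params.float_activation_min = std::numeric_limits<float>::lowest();
      params.float_activation_max = std::numeric_limits<float>::max();
      break;
  }
}

int main()
{
  nnfw::cker::PoolParams params{};
  SetFloatActivation(params, nnfw::cker::FusedActivationFunctionType::kRelu6);
  return 0;
}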
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
new file mode 100644
index 000000000..d1f1723c4
--- /dev/null
+++ b/compute/cker/include/cker/Utils.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_UTILS_H__
+#define __NNFW_CKER_UTILS_H__
+
+#include <algorithm>
+#include <cstdint>
+
+#include "cker/gemmlowp/FixedPoint.h"
+#include "Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
+{
+ return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
+}
+
+inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
+{
+ int left_shift = shift > 0 ? shift : 0;
+ int right_shift = shift > 0 ? 0 : -shift;
+ return gemmlowp::RoundingDivideByPOT(
+ gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
+ right_shift);
+}
+
+inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
+ int left_shift)
+{
+ return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+ return (b * height + h) * width + w;
+}
+
+inline int CountLeadingZeros(uint32_t integer_input)
+{
+ const uint32_t one_in_leading_positive = 1U << 31;
+ int leading_zeros = 0;
+ while (integer_input < one_in_leading_positive)
+ {
+ integer_input <<= 1;
+ ++leading_zeros;
+ }
+ return leading_zeros;
+}
+
+// Comment from tensorflow lite:
+//
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N> struct NdArrayDesc
+{
+ // The "extent" of each dimension. Indices along dimension d must be in the
+ // half-open interval [0, extents[d]).
+ int extents[N];
+
+ // The number of *elements* (not bytes) between consecutive indices of each
+ // dimension.
+ int strides[N];
+};
+
+// Comment from tensorflow lite:
+//
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3)
+{
+ assert(i0 >= 0 && i0 < desc.extents[0]);
+ assert(i1 >= 0 && i1 < desc.extents[1]);
+ assert(i2 >= 0 && i2 < desc.extents[2]);
+ assert(i3 >= 0 && i3 < desc.extents[3]);
+ return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3];
+}
+
+template <int N>
+inline void
+NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
+ NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out)
+{
+ assert(desc0_out != nullptr);
+ assert(desc1_out != nullptr);
+
+ auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
+ auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
+
+ // Copy dims to desc, calculating strides.
+ int desc0_stride = 1;
+ int desc1_stride = 1;
+ for (int i = N - 1; i >= 0; --i)
+ {
+ desc0_out->extents[i] = extended_input0_shape.Dims(i);
+ desc0_out->strides[i] = desc0_stride;
+ desc0_stride *= extended_input0_shape.Dims(i);
+ desc1_out->extents[i] = extended_input1_shape.Dims(i);
+ desc1_out->strides[i] = desc1_stride;
+ desc1_stride *= extended_input1_shape.Dims(i);
+ }
+
+ // Walk over each dimension. If the extents are equal do nothing.
+ // Otherwise, set the desc with extent 1 to have extent equal to the other and
+ // stride 0.
+ for (int i = 0; i < N; ++i)
+ {
+ const int extent0 = extended_input0_shape.Dims(i);
+ const int extent1 = extended_input1_shape.Dims(i);
+ if (extent0 != extent1)
+ {
+ if (extent0 == 1)
+ {
+ desc0_out->strides[i] = 0;
+ desc0_out->extents[i] = extent1;
+ }
+ else
+ {
+ assert(extent1 == 1);
+ desc1_out->strides[i] = 0;
+ desc1_out->extents[i] = extent0;
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_UTILS_H__
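A sketch (editorial, assuming the cker include directory is on the include path) of the broadcast helpers above: a {1,1,1,3} descriptor broadcast against {1,2,2,3} gets stride 0 in the broadcast dimensions, so SubscriptToIndex keeps returning the same three channel values for every (y, x).

#include <cassert>

#include "cker/Shape.h"
#include "cker/Utils.h"

int main()
{
  using namespace nnfw::cker;
  const Shape a_shape{1, 1, 1, 3};
  const Shape b_shape{1, 2, 2, 3};
  NdArrayDesc<4> a_desc;
  NdArrayDesc<4> b_desc;
  NdArrayDescsForElementwiseBroadcast(a_shape, b_shape, &a_desc, &b_desc);
  // The broadcast dimensions of `a` get stride 0 and the other operand's extent.
  assert(a_desc.strides[1] == 0 && a_desc.strides[2] == 0);
  assert(a_desc.extents[1] == 2 && a_desc.extents[2] == 2);
  // Element (b=0, y=1, x=1, c=2) in the broadcast view of `a` still maps to index 2.
  assert(SubscriptToIndex(a_desc, 0, 1, 1, 2) == 2);
  return 0;
}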
diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h
new file mode 100644
index 000000000..645a61485
--- /dev/null
+++ b/compute/cker/include/cker/eigen/Utils.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EIGEN_UTILS_H__
+#define __NNFW_CKER_EIGEN_UTILS_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include <Eigen/Core>
+#include <type_traits>
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+// Make a local MatrixMap typedef allowing to map a scalar array (const or
+// mutable, depending on Scalar) as an Eigen matrix expression.
+template <typename Scalar>
+using MatrixMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic,
+ Eigen::Dynamic>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape)
+{
+ const int dims_count = shape.DimensionsCount();
+ const int rows = shape.Dims(dims_count - 1);
+ const int cols = FlatSizeSkipDim(shape, dims_count - 1);
+ return MatrixMap<Scalar>(data, rows, cols);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_EIGEN_UTILS_H__
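A sketch of MapAsMatrixWithLastDimAsRows above (editorial; it assumes CKER_OPTIMIZED_EIGEN is defined and Eigen headers are available, matching the cker/CMakeLists.txt hunk above):

// Requires CKER_OPTIMIZED_EIGEN and Eigen on the include path.
#include <vector>

#include "cker/Shape.h"
#include "cker/eigen/Utils.h"

int main()
{
  using namespace nnfw::cker;
  // A 1x2x2x3 NHWC buffer viewed as a 3x4 matrix: rows = channels, columns = flattened b*h*w.
  std::vector<float> buffer(12, 0.0f);
  const Shape shape{1, 2, 2, 3};
  auto matrix = MapAsMatrixWithLastDimAsRows(buffer.data(), shape);
  matrix(2, 3) = 1.0f; // channel 2 of the last spatial position, i.e. buffer[11]
  return 0;
}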
diff --git a/compute/cker/include/cker/gemmlowp/FixedPoint.h b/compute/cker/include/cker/gemmlowp/FixedPoint.h
new file mode 100644
index 000000000..159e01a22
--- /dev/null
+++ b/compute/cker/include/cker/gemmlowp/FixedPoint.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__
+#define __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace gemmlowp
+{
+
+inline int32_t RoundingHalfSum(int32_t a, int32_t b)
+{
+ int64_t a64 = a;
+ int64_t b64 = b;
+ int64_t sum = a64 + b64;
+ int64_t sign = sum >= 0 ? 1 : -1;
+ return static_cast<int32_t>((sum + sign) / 2);
+}
+
+inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
+{
+ bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
+ int64_t a_64(a);
+ int64_t b_64(b);
+ int64_t ab_64 = a_64 * b_64;
+ int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
+ int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
+ return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
+}
+
+// Correctly-rounded-to-nearest division by a power-of-two.
+// Also known as a rounding arithmetic right shift.
+inline int32_t RoundingDivideByPOT(int32_t x, int exponent)
+{
+ assert(exponent >= 0);
+ assert(exponent <= 31);
+ const int32_t mask = ((1ll << exponent) - 1);
+ const int32_t zero = 0;
+ const int32_t one = 1;
+ const int32_t remainder = x & mask;
+ const int32_t threshold = (mask >> 1) + ((x < zero) ? one : zero);
+ return ((x >> exponent) + ((remainder > threshold) ? one : zero));
+}
+
+// Returns the product of a run-time integer value by a compile-time power
+// of two, with either a positive exponent (equivalent to an arithmetic
+// left shift, saturating) or a negative exponent (equivalent to an arithmetic
+// right shift, rounding to nearest).
+template <int Exponent, int ExponentSign = (Exponent > 0 ? 1 : Exponent < 0 ? -1 : 0)>
+struct ImplSaturatingRoundingMultiplyByPOT
+{
+};
+
+template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 0>
+{
+ static int32_t eval(int32_t x) { return x; }
+};
+
+template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 1>
+{
+ static int32_t eval(int32_t x)
+ {
+ const int32_t min = (std::numeric_limits<int32_t>::min());
+ const int32_t max = (std::numeric_limits<int32_t>::max());
+ const int32_t threshold = ((1 << (31 - Exponent)) - 1);
+ const int32_t zero = 0;
+ const int32_t one = 1;
+
+ const int32_t positive_mask = ((x > threshold) ? ~zero : zero);
+ const int32_t negative_mask = ((x < -threshold) ? ~zero : zero);
+
+ int32_t result = (x * (one << Exponent));
+ result = (positive_mask ? max : result);
+ result = (negative_mask ? min : result);
+ return result;
+ }
+};
+
+template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, -1>
+{
+ static int32_t eval(int32_t x) { return RoundingDivideByPOT(x, -Exponent); }
+};
+
+template <int Exponent> int32_t SaturatingRoundingMultiplyByPOT(int32_t x)
+{
+ return ImplSaturatingRoundingMultiplyByPOT<Exponent>::eval(x);
+}
+
+template <int tIntegerBits> class FixedPoint
+{
+public:
+ static constexpr int kTotalBits = 8 * sizeof(int32_t);
+ static constexpr int kIntegerBits = tIntegerBits;
+ static constexpr int kFractionalBits = kTotalBits - 1 - kIntegerBits;
+ static_assert(kIntegerBits >= 0 && kIntegerBits < kTotalBits, "bad IntegerBits");
+
+ static int32_t ScalarRawMax() { return std::numeric_limits<int32_t>::max(); }
+
+ static FixedPoint FromRaw(int32_t x)
+ {
+ FixedPoint retval;
+ retval.raw() = x;
+ return retval;
+ }
+
+ static FixedPoint FromScalarRaw(int32_t x) { return FromRaw(x); }
+
+ template <int Exponent> static FixedPoint ConstantPOT()
+ {
+ static constexpr int kOffset = kFractionalBits + Exponent;
+ static_assert(kOffset < 31, "Constant not exactly representable in this fixed-point format");
+ return FromScalarRaw((int32_t)1 << kOffset);
+ }
+
+ static FixedPoint Zero() { return FromScalarRaw(0); }
+
+ static FixedPoint One()
+ {
+ return FromScalarRaw(kIntegerBits == 0 ? ScalarRawMax() : ((int32_t)1 << kFractionalBits));
+ }
+
+ int32_t raw() const { return i_; }
+ int32_t &raw() { return i_; }
+
+private:
+ int32_t i_;
+};
+
+// A FixedPoint multiplication is just a
+// SaturatingRoundingDoublingHighMul operation on the underlying
+// raw integer values. The IntegerBits simply add up, as is obvious
+// from the fact that the range is [-2^IntegerBits, 2^IntegerBits).
+template <int tIntegerBits_a, int tIntegerBits_b>
+FixedPoint<tIntegerBits_a + tIntegerBits_b> operator*(FixedPoint<tIntegerBits_a> a,
+ FixedPoint<tIntegerBits_b> b)
+{
+ FixedPoint<tIntegerBits_a + tIntegerBits_b> c;
+ c.raw() = SaturatingRoundingDoublingHighMul(a.raw(), b.raw());
+ return c;
+}
+
+// Tweaking IntegerBits gives exact multiplication by a power of two.
+template <int tExponent, int tIntegerBits>
+FixedPoint<tExponent + tIntegerBits> ExactMulByPot(FixedPoint<tIntegerBits> a)
+{
+ FixedPoint<tExponent + tIntegerBits> c;
+ c.raw() = a.raw();
+ return c;
+}
+
+template <int tIntegerBits>
+FixedPoint<tIntegerBits> operator+(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b)
+{
+ return FixedPoint<tIntegerBits>::FromRaw((a.raw() + b.raw()));
+}
+template <int tIntegerBits>
+FixedPoint<tIntegerBits> operator-(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b)
+{
+ return FixedPoint<tIntegerBits>::FromRaw((a.raw() - b.raw()));
+}
+template <int tIntegerBits>
+FixedPoint<tIntegerBits> operator&(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b)
+{
+ return FixedPoint<tIntegerBits>::FromRaw((a.raw() & b.raw()));
+}
+
+// Rescale changes the number of IntegerBits and updates the underlying
+// raw integer value accordingly.
+template <int tIntegerBitsDst, int tIntegerBitsSrc>
+FixedPoint<tIntegerBitsDst> Rescale(FixedPoint<tIntegerBitsSrc> x)
+{
+ static constexpr int kExponent = tIntegerBitsSrc - tIntegerBitsDst;
+ FixedPoint<tIntegerBitsDst> result;
+ result.raw() = SaturatingRoundingMultiplyByPOT<kExponent>(x.raw());
+ return result;
+}
+
+// Implementation of exponential function.
+
+// Returns exp(x) for x in [-1/4, 0).
+inline FixedPoint<0> exp_on_interval_between_negative_one_quarter_and_0_excl(FixedPoint<0> a)
+{
+ typedef FixedPoint<0> F;
+ const F constant_term = F::FromScalarRaw(RoundingDivideByPOT(1895147668, 0));
+ const F constant_1_over_3 = F::FromScalarRaw(RoundingDivideByPOT(715827883, 0));
+ // We're evaluating a Taylor expansion around -1/8, so we do the change of
+ // variable: x = a + 1/8.
+ // In fixed-point with 0 integer bits, 1/8 is represented by 1 << 28.
+ F x = a + F::template ConstantPOT<-3>();
+ F x2 = x * x;
+ F x3 = x2 * x;
+ F x4 = x2 * x2;
+ F x4_over_4 = F::FromScalarRaw(SaturatingRoundingMultiplyByPOT<-2>(x4.raw()));
+ F x4_over_24_plus_x3_over_6_plus_x2_over_2 = F::FromScalarRaw(
+ SaturatingRoundingMultiplyByPOT<-1>((((x4_over_4 + x3) * constant_1_over_3) + x2).raw()));
+ return (constant_term + constant_term * (x + x4_over_24_plus_x3_over_6_plus_x2_over_2));
+}
+
+// Returns exp(x) for x < 0.
+template <int tIntegerBits> FixedPoint<0> exp_on_negative_values(FixedPoint<tIntegerBits> a)
+{
+ typedef FixedPoint<tIntegerBits> InputF;
+ typedef FixedPoint<0> ResultF;
+ static constexpr int kFractionalBits = InputF::kFractionalBits;
+ static constexpr int kIntegerBits = InputF::kIntegerBits;
+ const InputF kOneQuarter = InputF::template ConstantPOT<-2>();
+ InputF mask = kOneQuarter - InputF::FromScalarRaw(1);
+ InputF a_mod_quarter_minus_one_quarter = (a & mask) - kOneQuarter;
+ ResultF result = exp_on_interval_between_negative_one_quarter_and_0_excl(
+ Rescale<0>(a_mod_quarter_minus_one_quarter));
+ int32_t remainder = (a_mod_quarter_minus_one_quarter - a).raw();
+
+#define GEMMLOWP_EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier) \
+ if (kIntegerBits > Exponent) \
+ { \
+ const ResultF kMultiplier = \
+ ResultF::FromScalarRaw(RoundingDivideByPOT(FixedPointMultiplier, 0)); \
+ static constexpr int kShiftAmount = \
+ ((kIntegerBits > Exponent) ? (kFractionalBits + Exponent) : 0); \
+ result = ((remainder & (1 << kShiftAmount)) ? (result * kMultiplier) : result); \
+ }
+
+ GEMMLOWP_EXP_BARREL_SHIFTER(-2, 1672461947);
+ GEMMLOWP_EXP_BARREL_SHIFTER(-1, 1302514674);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+0, 790015084);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+1, 290630308);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+2, 39332535);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+3, 720401);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+4, 242);
+
+#undef GEMMLOWP_EXP_BARREL_SHIFTER
+
+ static constexpr int clampB = ((kIntegerBits > 5) ? (36 - kIntegerBits) : 0);
+ if (kIntegerBits > 5)
+ {
+ const InputF clamp = InputF::FromScalarRaw(RoundingDivideByPOT(-(1 << clampB), 0));
+ result.raw() = ((a.raw() < clamp.raw()) ? ResultF::Zero().raw() : result.raw());
+ }
+
+ result.raw() = (a.raw() ? result.raw() : ResultF::One().raw());
+ return result;
+}
+
+// Returns 1 / (1 + x) for x in (0, 1).
+inline FixedPoint<0> one_over_one_plus_x_for_x_in_0_1(FixedPoint<0> a)
+{
+ typedef FixedPoint<0> F0;
+ typedef FixedPoint<2> F2;
+ F0 half_denominator = F0::FromScalarRaw(RoundingHalfSum(a.raw(), F0::One().raw()));
+ // Newton-Raphson division
+ // https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division
+ // Refer to that page for the logic behind the 48/17 and 32/17 constants.
+ const F2 constant_48_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(1515870810, 0));
+ const F2 constant_neg_32_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(-1010580540, 0));
+ F2 x = constant_48_over_17 + half_denominator * constant_neg_32_over_17;
+ for (int i = 0; i < 3; i++)
+ {
+ F2 half_denominator_times_x = half_denominator * x;
+ F2 one_minus_half_denominator_times_x = F2::One() - half_denominator_times_x;
+ x = x + Rescale<2>(x * one_minus_half_denominator_times_x);
+ }
+ return Rescale<0>(ExactMulByPot<-1>(x));
+}
+
+} // namespace gemmlowp
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__
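A few editorial sanity checks for the rounding and fixed-point primitives above (assuming the header is reachable as cker/gemmlowp/FixedPoint.h):

#include <cassert>

#include "cker/gemmlowp/FixedPoint.h"

int main()
{
  using namespace nnfw::cker::gemmlowp;

  // Rounding arithmetic right shift: 5/4 -> 1, 7/4 -> 2, -5/4 -> -1.
  assert(RoundingDivideByPOT(5, 2) == 1);
  assert(RoundingDivideByPOT(7, 2) == 2);
  assert(RoundingDivideByPOT(-5, 2) == -1);

  // Doubling high multiply: round(a * b / 2^31), saturating only for INT32_MIN * INT32_MIN.
  assert(SaturatingRoundingDoublingHighMul(1 << 30, 1 << 30) == (1 << 29));

  // exp(0) in the Q0.31 result format is the largest representable raw value (~1.0).
  typedef FixedPoint<5> InputF;
  assert(exp_on_negative_values(InputF::Zero()).raw() == FixedPoint<0>::One().raw());
  return 0;
}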
diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h
new file mode 100644
index 000000000..b20919429
--- /dev/null
+++ b/compute/cker/include/cker/operation/AveragePool.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_AVERAGE_POOL_H__
+#define __NNFW_CKER_AVERAGE_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+#include "cker/operation/optimized/AveragePool.h"
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/operation/reference/AveragePool.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+ optimized::AveragePool(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+ reference::AveragePool(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void AveragePool(const PoolParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start);
+ if (filter_count <= 0)
+ {
+ continue;
+ }
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ int32_t acc = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ acc += input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+ }
+ }
+ acc = (acc + filter_count / 2) / filter_count;
+ acc = std::max(acc, params.quantized_activation_min);
+ acc = std::min(acc, params.quantized_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_AVERAGE_POOL_H__
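A quantized usage sketch for the uint8_t overload above (editorial; include paths are assumptions): averaging a 2x2 window over a 1x2x2x1 input yields a single rounded output element.

#include <cassert>
#include <cstdint>

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/operation/AveragePool.h"

int main()
{
  using namespace nnfw::cker;
  PoolParams params{};
  params.padding_type = PaddingType::kValid;
  params.padding_values = {0, 0};
  params.stride_height = 2;
  params.stride_width = 2;
  params.filter_height = 2;
  params.filter_width = 2;
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;

  const Shape input_shape{1, 2, 2, 1};
  const Shape output_shape{1, 1, 1, 1};
  const uint8_t input[4] = {10, 20, 30, 41};
  uint8_t output[1] = {0};
  AveragePool(params, input_shape, input, output_shape, output);
  assert(output[0] == 25); // (10 + 20 + 30 + 41 + 2) / 4, rounded to nearest
  return 0;
}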
diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
new file mode 100644
index 000000000..60dd02651
--- /dev/null
+++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
+#define __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
+
+#include <functional>
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct BinaryArithmeticOpParam
+{
+ // Shape dependent / common to data / op types.
+ // BroadcastableOpCategory broadcast_category;
+ // uint8 inference params.
+ int32_t input1_offset;
+ int32_t input2_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int32_t output_shift;
+ // Add / Sub, not Mul, uint8 inference params.
+ int32_t left_shift;
+ int32_t input1_multiplier;
+ int32_t input1_shift;
+ int32_t input2_multiplier;
+ int32_t input2_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+
+ // Processed output dimensions.
+ // Let input "a" be the one that broadcasts in the faster-changing dimension.
+ // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and
+ // {b0, b1, b2, b3, b4},
+ // broadcast_shape[4] = b0 = a0.
+ // broadcast_shape[3] = b1; a1 = 1.
+ // broadcast_shape[2] = b2 = a2.
+ // broadcast_shape[1] = a3; b3 = 1.
+ // broadcast_shape[0] = b4 = a4.
+ // int broadcast_shape[5];
+};
+
+template <typename T>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, T *output_data,
+ const std::function<T(const T &, const T &)> &fn)
+{
+ const int32_t flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+ params.quantized_activation_min,
+ params.quantized_activation_max);
+ }
+}
+
+template <>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data,
+ const std::function<float(const float &, const float &)> &fn)
+{
+ const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] =
+ ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+ params.float_activation_min, params.float_activation_max);
+ }
+}
+
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam &params,
+ const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data,
+ const std::function<T(const T &, const T &)> &fn)
+{
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ // Comment from tensorflow lite:
+ //
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
+ fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.quantized_activation_min, params.quantized_activation_max);
+ }
+ }
+ }
+ }
+}
+
+template <>
+inline void BroadcastBinaryArithmeticOpSlow(
+ const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data,
+ const Shape &input2_shape, const float *input2_data, const Shape &output_shape,
+ float *output_data, const std::function<float(const float &, const float &)> &fn)
+{
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
+ fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.float_activation_min, params.float_activation_max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
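A sketch of how the float path above is parameterized with a lambda (editorial; same shapes on both inputs, so no broadcasting and MatchingFlatSize applies):

#include <cassert>
#include <functional>
#include <limits>

#include "cker/Shape.h"
#include "cker/operation/BinaryArithmeticOps.h"

int main()
{
  using namespace nnfw::cker;
  BinaryArithmeticOpParam params{};
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  const Shape shape{1, 1, 1, 4};
  const float a[4] = {1.f, 2.f, 3.f, 4.f};
  const float b[4] = {10.f, 20.f, 30.f, 40.f};
  float out[4];
  // Element-wise addition; any binary functor with the same signature works.
  BinaryArithmeticOp<float>(params, shape, a, shape, b, shape, out,
                            [](const float &x, const float &y) { return x + y; });
  assert(out[0] == 11.f && out[3] == 44.f);
  return 0;
}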
diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h
new file mode 100644
index 000000000..69a179c8c
--- /dev/null
+++ b/compute/cker/include/cker/operation/Concatenation.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CONCATENATION_H__
+#define __NNFW_CKER_CONCATENATION_H__
+
+#include <cstdint>
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct ConcatenationParams
+{
+ int8_t axis;
+ const int32_t *input_zeropoint;
+ const float *input_scale;
+ uint16_t inputs_count;
+ int32_t output_zeropoint;
+ float output_scale;
+};
+
+template <typename Scalar>
+inline void Concatenation(const ConcatenationParams &params, const Shape *const *input_shapes,
+ const Scalar *const *input_data, const Shape &output_shape,
+ Scalar *output_data)
+{
+ int axis = params.axis;
+ int inputs_count = params.inputs_count;
+ const int concat_dimensions = output_shape.DimensionsCount();
+ assert(axis < concat_dimensions);
+
+ int64_t concat_size = 0;
+ for (int i = 0; i < inputs_count; i++)
+ {
+ assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
+ for (int j = 0; j < concat_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j);
+ UNUSED_RELEASE(dim_checked);
+ }
+ }
+ concat_size += input_shapes[i]->Dims(axis);
+ }
+ assert(concat_size == output_shape.Dims(axis));
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ // For all input arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < concat_dimensions; ++i)
+ {
+ base_inner_size *= output_shape.Dims(i);
+ }
+
+ Scalar *output_ptr = output_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+ memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+ output_ptr += copy_size;
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONCATENATION_H__
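A sketch of Concatenation above joining two tensors along the channel axis (editorial; include paths are assumptions):

#include <cassert>

#include "cker/Shape.h"
#include "cker/operation/Concatenation.h"

int main()
{
  using namespace nnfw::cker;
  // Concatenate two 1x1x1x2 float tensors along axis 3 into a 1x1x1x4 output.
  const Shape in0_shape{1, 1, 1, 2};
  const Shape in1_shape{1, 1, 1, 2};
  const Shape out_shape{1, 1, 1, 4};
  const float in0[2] = {1.f, 2.f};
  const float in1[2] = {3.f, 4.f};
  float out[4];

  ConcatenationParams params{};
  params.axis = 3;
  params.inputs_count = 2;

  const Shape *shapes[] = {&in0_shape, &in1_shape};
  const float *inputs[] = {in0, in1};
  Concatenation<float>(params, shapes, inputs, out_shape, out);
  assert(out[0] == 1.f && out[2] == 3.f);
  return 0;
}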
diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h
new file mode 100644
index 000000000..35b0336fa
--- /dev/null
+++ b/compute/cker/include/cker/operation/Conv.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CONV_H__
+#define __NNFW_CKER_CONV_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct ConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ // TODO(starka): This was just "stride", so check that width+height is OK.
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ // uint8_t inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8_t, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ UNUSED_RELEASE(bias_shape);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ const int in_offset = Offset(input_shape, batch, in_y, in_x, 0);
+ const int filter_offset = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ float input_value = input_data[in_offset + in_channel];
+ float filter_value = filter_data[filter_offset + in_channel];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[out_channel];
+ }
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ ActivationFunctionWithMinMax(total + bias_value, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(output_activation_min <= output_activation_max);
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ UNUSED_RELEASE(bias_shape);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ const int in_base = Offset(input_shape, batch, in_y, in_x, 0);
+ const int filter_base = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+ for (int in_channel = 0; in_channel < input_depth; in_channel++)
+ {
+ int32_t input_val = input_data[in_channel + in_base];
+ int32_t filter_val = filter_data[in_channel + filter_base];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ }
+ if (bias_data)
+ {
+ acc += bias_data[out_channel];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONV_H__
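A float usage sketch for Conv above (editorial; include paths are assumptions): a 2x2 single-channel filter over a 2x2 single-channel image reduces to a dot product plus bias.

#include <cassert>
#include <limits>

#include "cker/Shape.h"
#include "cker/operation/Conv.h"

int main()
{
  using namespace nnfw::cker;
  ConvParams params{};
  params.padding_type = PaddingType::kValid;
  params.padding_values = {0, 0};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  const Shape input_shape{1, 2, 2, 1};
  const Shape filter_shape{1, 2, 2, 1}; // [out_channels, h, w, in_channels]
  const Shape bias_shape{1};
  const Shape output_shape{1, 1, 1, 1};
  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  const float filter[4] = {1.f, 1.f, 1.f, 1.f};
  const float bias[1] = {0.5f};
  float output[1];
  Conv(params, input_shape, input, filter_shape, filter, bias_shape, bias, output_shape, output);
  assert(output[0] == 10.5f); // 1 + 2 + 3 + 4 + 0.5
  return 0;
}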
diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h
new file mode 100644
index 000000000..7d022477d
--- /dev/null
+++ b/compute/cker/include/cker/operation/DepthwiseConv.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
+#define __NNFW_CKER_DEPTHWISE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct DepthwiseConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ int16_t depth_multiplier;
+ // uint8 inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ assert(output_activation_min <= output_activation_max);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(bias_shape);
+
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ int32_t input_val = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+ int32_t filter_val = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ if (bias_data)
+ {
+ acc += bias_data[oc];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, b, out_y, out_x, oc)] = static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+ }
+}
+
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const float *filter_data, const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(bias_shape);
+
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+ float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[oc];
+ }
+ output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax(
+ total + bias_value, output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+ }
+}
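+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): it runs the
+// float DepthwiseConv reference path on a tiny NHWC tensor. The Shape initializer-list
+// constructor, aggregate value-initialization of DepthwiseConvParams, and the Example* function
+// name are assumptions made only for this example.
+inline void ExampleDepthwiseConvFloat()
+{
+  DepthwiseConvParams params{};
+  params.stride_width = 1;
+  params.stride_height = 1;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+  params.padding_values.width = 0;
+  params.padding_values.height = 0;
+  params.depth_multiplier = 1;
+  params.float_activation_min = -1000.0f;
+  params.float_activation_max = 1000.0f;
+
+  const Shape input_shape{1, 3, 3, 1};  // NHWC
+  const Shape filter_shape{1, 2, 2, 1}; // 1 x filter_height x filter_width x output_depth
+  const Shape bias_shape{1};
+  const Shape output_shape{1, 2, 2, 1};
+
+  const float input_data[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  const float filter_data[4] = {1, 0, 0, 1};
+  const float bias_data[1] = {0.5f};
+  float output_data[4] = {};
+
+  DepthwiseConv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+                output_shape, output_data);
+  // Each output is in[y][x] + in[y+1][x+1] + bias, so output_data is {6.5, 8.5, 12.5, 14.5}.
+}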
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEPTHWISE_CONV_H__
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
new file mode 100644
index 000000000..428fb1b53
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
+#define __NNFW_CKER_FULLY_CONNECTED_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct FullyConnectedParams
+{
+ // uint8 inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+ // FullyConnectedWeightsFormat weights_format;
+};
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+{
+ UNUSED_RELEASE(input_shape);
+ UNUSED_RELEASE(bias_shape);
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_c = 0; out_c < output_depth; ++out_c)
+ {
+ float total = 0.f;
+ for (int d = 0; d < accum_depth; ++d)
+ {
+ total += input_data[b * accum_depth + d] * weights_data[out_c * accum_depth + d];
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[out_c];
+ }
+ output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+ total + bias_value, output_activation_min, output_activation_max);
+ }
+ }
+}
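+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): a 2-batch,
+// 4-input, 3-output float FullyConnected call. The Shape initializer-list constructor and
+// aggregate value-initialization of FullyConnectedParams are assumed.
+inline void ExampleFullyConnectedFloat()
+{
+  FullyConnectedParams params{};
+  params.float_activation_min = -1000.0f;
+  params.float_activation_max = 1000.0f;
+
+  const Shape input_shape{2, 4};   // [batch, accum_depth]
+  const Shape weights_shape{3, 4}; // [output_depth, accum_depth]
+  const Shape bias_shape{3};
+  const Shape output_shape{2, 3};  // [batch, output_depth]
+
+  const float input_data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+  const float weights_data[12] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0};
+  const float bias_data[3] = {0, 0, 0};
+  float output_data[6] = {};
+
+  FullyConnected(params, input_shape, input_data, weights_shape, weights_data, bias_shape,
+                 bias_data, output_shape, output_data);
+  // Each weight row selects one input element, so output_data is {1, 2, 3, 5, 6, 7}.
+}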
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ UNUSED_RELEASE(input_shape);
+ UNUSED_RELEASE(bias_shape);
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(filter_shape.DimensionsCount() >= 2);
+ assert(output_shape.DimensionsCount() >= 1);
+
+ assert(output_activation_min <= output_activation_max);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dim_count = output_shape.DimensionsCount();
+ const int filter_dim_count = filter_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+ const int output_depth =
+ MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
+ const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_c = 0; out_c < output_depth; ++out_c)
+ {
+ int32_t acc = 0;
+ for (int d = 0; d < accum_depth; ++d)
+ {
+ int32_t input_val = input_data[b * accum_depth + d];
+ int32_t filter_val = filter_data[out_c * accum_depth + d];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ if (bias_data)
+ {
+ acc += bias_data[out_c];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FULLY_CONNECTED_H__
diff --git a/compute/cker/include/cker/operation/Gather.h b/compute/cker/include/cker/operation/Gather.h
new file mode 100644
index 000000000..9cd96eeb7
--- /dev/null
+++ b/compute/cker/include/cker/operation/Gather.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GATHER_H__
+#define __NNFW_CKER_GATHER_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct GatherParams
+{
+ int32_t axis;
+};
+
+template <typename T, typename CoordsT = int32_t>
+inline void Gather(const GatherParams &op_params, const Shape &input_shape, const T *input_data,
+ const Shape &coords_shape, const CoordsT *coords_data, const Shape &,
+ T *output_data)
+{
+ int axis = op_params.axis;
+ if (axis < 0)
+ {
+ axis += input_shape.DimensionsCount();
+ }
+ assert(axis >= 0);
+ assert(axis < input_shape.DimensionsCount());
+ const int axis_size = input_shape.Dims(axis);
+ const int coords_count = coords_shape.FlatSize();
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int inner_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ inner_size *= input_shape.Dims(i);
+ }
+
+ for (int outer = 0; outer < outer_size; ++outer)
+ {
+ for (int i = 0; i < coords_count; ++i)
+ {
+ assert(coords_data[i] >= 0);
+ assert(coords_data[i] < axis_size);
+ std::memcpy(output_data + (outer * coords_count + i) * inner_size,
+ input_data + (outer * axis_size + coords_data[i]) * inner_size,
+ sizeof(T) * inner_size);
+ }
+ }
+}
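+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): gathering
+// two rows of a 3x2 tensor along axis 0. The Shape initializer-list constructor is assumed; the
+// last Shape argument is unused by the kernel and is passed only to satisfy the call signature.
+inline void ExampleGather()
+{
+  GatherParams op_params;
+  op_params.axis = 0;
+
+  const Shape input_shape{3, 2};
+  const Shape coords_shape{2};
+  const Shape output_shape{2, 2};
+
+  const float input_data[6] = {10, 11, 20, 21, 30, 31};
+  const int32_t coords_data[2] = {2, 0};
+  float output_data[4] = {};
+
+  Gather(op_params, input_shape, input_data, coords_shape, coords_data, output_shape, output_data);
+  // Rows 2 and 0 are copied in that order: output_data is {30, 31, 10, 11}.
+}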
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GATHER_H__
diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h
new file mode 100644
index 000000000..794dcebc8
--- /dev/null
+++ b/compute/cker/include/cker/operation/InstanceNorm.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_INSTANCE_NORM_H__
+#define __NNFW_CKER_INSTANCE_NORM_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct InstanceNormParams
+{
+ float epsilon;
+ float float_activation_min;
+ float float_activation_max;
+};
+
+inline void InstanceNorm(const InstanceNormParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &gamma_shape, const float *gamma_data,
+ const Shape &beta_shape, const float *beta_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
+ const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
+ const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+
+ UNUSED_RELEASE(gamma_shape);
+ UNUSED_RELEASE(beta_shape);
+ assert(output_activation_min <= output_activation_max);
+
+ for (int32_t batch = 0; batch < batches; batch++)
+ {
+ for (int32_t channel = 0; channel < channels; channel++)
+ {
+      double sum = 0.0;
+      double square_sum = 0.0;
+ int32_t size = heights * widths;
+
+ for (int32_t height = 0; height < heights; height++)
+ {
+ for (int32_t width = 0; width < widths; width++)
+ {
+ double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
+ sum += input_val;
+ square_sum += (input_val * input_val);
+ }
+ }
+
+ double mean = sum / size;
+ double var = square_sum / size - mean * mean;
+
+ double gamma = gamma_data[channel];
+ double beta = beta_data[channel];
+
+ double a = gamma / (std::sqrt(var + params.epsilon));
+ double b = -mean * a + beta;
+
+ for (int32_t height = 0; height < heights; height++)
+ {
+ for (int32_t width = 0; width < widths; width++)
+ {
+          double input_value = input_data[Offset(input_shape, batch, height, width, channel)];
+ double output_value = input_value * a + b;
+ output_data[Offset(output_shape, batch, height, width, channel)] =
+ ActivationFunctionWithMinMax((float)output_value, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
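+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): normalizing
+// a single 2x2x1 feature map with gamma=1 and beta=0. The Shape initializer-list constructor and
+// aggregate value-initialization of InstanceNormParams are assumed.
+inline void ExampleInstanceNorm()
+{
+  InstanceNormParams params{};
+  params.epsilon = 1e-5f;
+  params.float_activation_min = -1000.0f;
+  params.float_activation_max = 1000.0f;
+
+  const Shape input_shape{1, 2, 2, 1};
+  const Shape gamma_shape{1};
+  const Shape beta_shape{1};
+  const Shape output_shape{1, 2, 2, 1};
+
+  const float input_data[4] = {1, 2, 3, 4};
+  const float gamma_data[1] = {1.0f};
+  const float beta_data[1] = {0.0f};
+  float output_data[4] = {};
+
+  InstanceNorm(params, input_shape, input_data, gamma_shape, gamma_data, beta_shape, beta_data,
+               output_shape, output_data);
+  // The channel is normalized to zero mean and unit variance: roughly {-1.34, -0.45, 0.45, 1.34}.
+}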
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_INSTANCE_NORM_H__
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
new file mode 100644
index 000000000..872095531
--- /dev/null
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGISTIC_H__
+#define __NNFW_CKER_LOGISTIC_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2)
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = 1.f / (1.f + std::exp(-input_data[i]));
+ }
+}
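+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel); the Shape
+// initializer-list constructor is assumed.
+inline void ExampleLogistic()
+{
+  const Shape shape{1, 3};
+  const float input_data[3] = {-1.0f, 0.0f, 1.0f};
+  float output_data[3] = {};
+
+  Logistic(shape, input_data, shape, output_data);
+  // output_data is approximately {0.269, 0.5, 0.731}.
+}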
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGISTIC_H__
diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h
new file mode 100644
index 000000000..326168b99
--- /dev/null
+++ b/compute/cker/include/cker/operation/MaxPool.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MAX_POOL_H__
+#define __NNFW_CKER_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/optimized/MaxPool.h"
+#include "cker/operation/reference/MaxPool.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+ optimized::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+ reference::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(params.quantized_activation_min >= 0);
+ assert(params.quantized_activation_max <= 255);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ uint8_t max = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+ }
+ }
+ max = std::max<uint8_t>(max, params.quantized_activation_min);
+ max = std::min<uint8_t>(max, params.quantized_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ static_cast<uint8_t>(max);
+ }
+ }
+ }
+ }
+}
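+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): a 2x2 uint8
+// max-pooling window over a 1x2x2x1 input. The Shape initializer-list constructor and aggregate
+// value-initialization of PoolParams are assumed.
+inline void ExampleMaxPoolUint8()
+{
+  PoolParams params{};
+  params.stride_width = 1;
+  params.stride_height = 1;
+  params.filter_width = 2;
+  params.filter_height = 2;
+  params.padding_values.width = 0;
+  params.padding_values.height = 0;
+  params.quantized_activation_min = 0;
+  params.quantized_activation_max = 255;
+
+  const Shape input_shape{1, 2, 2, 1};
+  const Shape output_shape{1, 1, 1, 1};
+  const uint8_t input_data[4] = {10, 42, 7, 99};
+  uint8_t output_data[1] = {};
+
+  MaxPool(params, input_shape, input_data, output_shape, output_data);
+  // output_data[0] == 99, the maximum over the single 2x2 window.
+}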
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
new file mode 100644
index 000000000..af432f3a8
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PAD_H__
+#define __NNFW_CKER_PAD_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <stdexcept>
+#include <iostream>
+#include <algorithm> // std::fill_n
+#include <cstring>   // memcpy
+#include <utility>   // std::pair
+#include <vector>    // std::vector
+
+namespace nnfw
+{
+namespace cker
+{
+inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
+ const float *input_data, const Shape &output_shape, float *output_data,
+ const float *constant_value_data)
+{
+ // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
+  // TODO: come up with a more subtle solution that uses subtensors, as ARM Compute does
+ // TODO: Check if it works for all layouts
+
+ using PaddingInfo = std::pair<int32_t, int32_t>;
+ /** List of padding information */
+ using PaddingList = std::vector<PaddingInfo>;
+
+ auto constant_value = constant_value_data ? *constant_value_data : 0;
+ assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
+
+ PaddingList padding_list(pad_rank);
+ for (int32_t n = 0; n < pad_rank; ++n)
+ {
+ const int32_t *from = padding_data + (n * 2);
+ padding_list[n] = {from[0], from[1]};
+ }
+ for (int32_t i = 0; i < pad_rank; ++i)
+ {
+ assert(output_shape.Dims(i) ==
+ input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
+ }
+  /* Switch on pad_rank rather than on the shape ranks, because the given input/output shapes are
+     expanded to 4D before any cker function is called:
+     1. it prevents out-of-bounds access into padding_list, which only holds pad_rank entries;
+     2. handling everything as 4D is slower than handling 2D/3D directly.
+  */
+ switch (pad_rank)
+ {
+ case 0:
+ case 1:
+ {
+ const int32_t in_row_len = input_shape.Dims(0);
+ std::fill_n(output_data, padding_list[0].first, constant_value);
+ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float));
+ std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
+ constant_value);
+ break;
+ }
+ case 2: // HW
+ {
+ const int32_t in_row_len = input_shape.Dims(1);
+ const int32_t out_row_size = output_shape.Dims(1);
+
+ // prepend padding rows
+ std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j)
+ {
+ auto out_offset = i * out_row_size;
+ const auto in_offset = j * in_row_len;
+
+ // prepend padding values
+ std::fill_n(output_data + out_offset, padding_list[1].first, constant_value);
+
+ out_offset += padding_list[1].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+ out_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_offset, padding_list[1].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size,
+ constant_value);
+ break;
+ }
+ case 3: // HWC
+ {
+ const int32_t in_row_len = input_shape.Dims(2);
+ const int32_t out_row_size = output_shape.Dims(2);
+ const auto plain_size = out_row_size * output_shape.Dims(1);
+
+ // prepend padding plains
+ std::fill_n(output_data, padding_list[0].first * plain_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp)
+ {
+ const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2);
+
+ // prepend padding rows
+ std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size,
+ constant_value);
+
+ const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+ for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp)
+ {
+ auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2);
+ const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2);
+
+ // prepend padding values
+ std::fill_n(output_data + out_offset, padding_list[2].first, constant_value);
+
+ out_offset += padding_list[2].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+ out_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_offset, padding_list[2].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+ padding_list[1].second * out_row_size, constant_value);
+ }
+
+ // append padding plains
+ std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size,
+ constant_value);
+ break;
+ }
+ case 4:
+ {
+ auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t {
+ return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3);
+ };
+ const int32_t in_row_len = input_shape.Dims(3);
+ const int32_t out_row_size = output_shape.Dims(3);
+ const auto plain_size = out_row_size * output_shape.Dims(2);
+ const auto parallelepiped_size = plain_size * output_shape.Dims(1);
+
+ // prepend padding parallelepipeds
+ std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value);
+
+ const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp)
+ {
+ const auto out_h_offset = get_offset(output_shape, i, 0, 0);
+ // prepend padding plains
+ std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+ for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp)
+ {
+ const auto out_w_offset = get_offset(output_shape, i, j, 0);
+
+ // prepend padding rows
+ std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size,
+ constant_value);
+
+ const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first;
+ for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp)
+ {
+ auto out_c_offset = get_offset(output_shape, i, j, k);
+ const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp);
+
+ // prepend padding values
+ std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value);
+
+ out_c_offset += padding_list[3].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+ out_c_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+ padding_list[2].second * out_row_size, constant_value);
+ }
+
+ // append padding plains
+ std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size,
+ padding_list[1].second * plain_size, constant_value);
+ }
+ // append padding parallelepipeds
+ std::fill_n(output_data + r_b_inp_lim * parallelepiped_size,
+ padding_list[0].second * parallelepiped_size, constant_value);
+ break;
+ }
+ default:
+ throw std::runtime_error("Padding for rank > 4 NYI");
+ break;
+ }
+}
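+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): padding a
+// 2x2 tensor by one element on every side of both dimensions (pad_rank == 2, the HW case above).
+// The Shape initializer-list constructor is assumed.
+inline void ExamplePad2D()
+{
+  const int32_t padding_data[4] = {1, 1, 1, 1}; // {before, after} per dimension
+  const Shape input_shape{2, 2};
+  const Shape output_shape{4, 4};
+
+  const float input_data[4] = {1, 2, 3, 4};
+  float output_data[16] = {};
+  const float constant_value = 0.0f;
+
+  Pad(padding_data, 2 /* pad_rank */, input_shape, input_data, output_shape, output_data,
+      &constant_value);
+  // The 2x2 block {1, 2; 3, 4} now sits in the middle of a 4x4 tensor of zeros.
+}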
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_PAD_H__
diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h
new file mode 100644
index 000000000..ea404a002
--- /dev/null
+++ b/compute/cker/include/cker/operation/SoftMax.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SOFTMAX_H__
+#define __NNFW_CKER_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+#include "cker/gemmlowp/FixedPoint.h"
+#include "cker/operation/optimized/SoftMax.h"
+#include "cker/operation/reference/SoftMax.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+ optimized::Softmax(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+ reference::Softmax(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
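+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): the float
+// path dispatches to the optimized or the reference Softmax depending on CKER_OPTIMIZED_EIGEN.
+// The Shape initializer-list constructor and aggregate value-initialization of SoftmaxParams
+// are assumed.
+inline void ExampleSoftmaxFloat()
+{
+  SoftmaxParams params{};
+  params.beta = 1.0f;
+
+  const Shape shape{1, 3};
+  const float input_data[3] = {1.0f, 2.0f, 3.0f};
+  float output_data[3] = {};
+
+  Softmax(params, shape, input_data, shape, output_data);
+  // output_data is approximately {0.090, 0.245, 0.665} and sums to 1.
+}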
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int32_t input_beta_multiplier = params.input_multiplier;
+ const int32_t input_beta_left_shift = params.input_left_shift;
+ const int diff_min = params.diff_min;
+ // The representation chosen for the input to the exp() function is Q5.26.
+ // We need to leave extra space since values that we skip might be as large as
+ // -32 before multiplying by input_beta_multiplier, and therefore as large as
+ // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+ // accumulation, but exp(-16) definitely is.
+ static const int kScaledDiffIntegerBits = 5;
+ static const int kAccumulationIntegerBits = 12;
+ using FixedPointScaledDiff = gemmlowp::FixedPoint<kScaledDiffIntegerBits>;
+ using FixedPointAccum = gemmlowp::FixedPoint<kAccumulationIntegerBits>;
+ using FixedPoint0 = gemmlowp::FixedPoint<0>;
+
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ uint8_t max_in_row = 0;
+ for (int c = 0; c < depth; ++c)
+ {
+ max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+ }
+
+ FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+ for (int c = 0; c < depth; ++c)
+ {
+ int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min)
+ {
+ const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+ sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+ exp_on_negative_values(scaled_diff_f8));
+ }
+ }
+
+ int32_t fixed_sum_of_exps = sum_of_exps.raw();
+ int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
+ // This is the number of bits to the left of the binary point above 1.0.
+ // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+ // no later adjustment will be needed.
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+ int32_t shifted_sum_minus_one =
+ static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) -
+ (static_cast<uint32_t>(1) << 31));
+
+ FixedPoint0 shifted_scale =
+ one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+ for (int c = 0; c < depth; ++c)
+ {
+ int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min)
+ {
+ const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+ FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+ int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(),
+ num_bits_over_unit + 31 - 8);
+
+ output_data[i * depth + c] = static_cast<uint8_t>(
+ std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0)));
+ }
+ else
+ {
+ output_data[i * depth + c] = 0;
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h
new file mode 100644
index 000000000..535fe86cf
--- /dev/null
+++ b/compute/cker/include/cker/operation/TransposeConv.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRANSPOSE_CONV_H__
+#define __NNFW_CKER_TRANSPOSE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct TransposeConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ // TODO(starka): This was just "stride", so check that width+height is OK.
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ // uint8_t inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8_t, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+inline void TransposeConv(const TransposeConvParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const float *filter_data, const Shape &output_shape, float *output_data)
+{
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ // Although transpose convolution simplifies to convolution with transposed
+ // weights for strides of 1, non-unitary striding complicates matters. To
+ // keep this reference implementation as clear as possible, we use a
+ // "scatter" access pattern, where we loop through all the input elements,
+ // computing their influence on the output, rather than looping through the
+ // output elements in the typical "gather" access pattern of a conv. We
+ // therefore must initialize the output array to zero.
+ const int num_elements = output_shape.FlatSize();
+ for (int i = 0; i < num_elements; i++)
+ {
+ output_data[i] = 0.0f;
+ }
+
+ // Loop through input elements one at a time.
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int in_y = 0; in_y < input_height; ++in_y)
+ {
+ for (int in_x = 0; in_x < input_width; ++in_x)
+ {
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ // Loop through the output elements it will influence
+ const int out_x_origin = (in_x * stride_width) - pad_width;
+ const int out_y_origin = (in_y * stride_height) - pad_height;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ // Compute output element location
+ const int out_x = out_x_origin + filter_x;
+ const int out_y = out_y_origin + filter_y;
+ // We cannot accumulate out of bounds
+ if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+ (out_y < output_height))
+ {
+ float input_value =
+ input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y,
+ filter_x, in_channel)];
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
+ input_value * filter_value;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
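+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): a stride-2
+// transpose convolution that scatters each input element into its own 2x2 output block. The
+// Shape initializer-list constructor and aggregate value-initialization of TransposeConvParams
+// are assumed.
+inline void ExampleTransposeConvFloat()
+{
+  TransposeConvParams params{};
+  params.stride_width = 2;
+  params.stride_height = 2;
+  params.padding_values.width = 0;
+  params.padding_values.height = 0;
+
+  const Shape input_shape{1, 2, 2, 1};
+  const Shape filter_shape{1, 2, 2, 1}; // [output_depth, filter_height, filter_width, input_depth]
+  const Shape output_shape{1, 4, 4, 1};
+
+  const float input_data[4] = {1, 2, 3, 4};
+  const float filter_data[4] = {1, 1, 1, 1};
+  float output_data[16] = {};
+
+  TransposeConv(params, input_shape, input_data, filter_shape, filter_data, output_shape,
+                output_data);
+  // With an all-ones filter the top-left 2x2 block of the output is all 1s, the top-right all 2s,
+  // the bottom-left all 3s, and the bottom-right all 4s.
+}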
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRANSPOSE_CONV_H__
diff --git a/compute/cker/include/cker/operation/optimized/AveragePool.h b/compute/cker/include/cker/operation/optimized/AveragePool.h
new file mode 100644
index 000000000..d94a5811a
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/AveragePool.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Change to apply neon for this function if it is faster
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ // TODO(benoitjacob) make this a proper reference impl without Eigen!
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+ Eigen::VectorXf out_count(out_mat.cols());
+ out_count.setZero();
+ // Prefill the output to 0.
+ out_mat.setZero();
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < input_height; ++h)
+ {
+ for (int w = 0; w < input_width; ++w)
+ {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int hpad = h + params.padding_values.height;
+ int wpad = w + params.padding_values.width;
+ int h_start =
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start =
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+ // compute elementwise sum
+ for (int ph = h_start; ph < h_end; ++ph)
+ {
+ for (int pw = w_start; pw < w_end; ++pw)
+ {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
+ // Divide the output by the actual number of elements being averaged over
+ assert(out_count.minCoeff() > 0);
+ out_mat.array().rowwise() /= out_count.transpose().array();
+
+ const int flat_size = output_shape.FlatSize();
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/MaxPool.h b/compute/cker/include/cker/operation/optimized/MaxPool.h
new file mode 100644
index 000000000..07a14aee4
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/MaxPool.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Change to apply neon for this function if it is faster
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // Prefill the output to minimum representable float value
+ out_mat.setConstant(std::numeric_limits<float>::lowest());
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < input_height; ++h)
+ {
+ for (int w = 0; w < input_width; ++w)
+ {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int hpad = h + params.padding_values.height;
+ int wpad = w + params.padding_values.width;
+ int h_start =
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start =
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+ for (int ph = h_start; ph < h_end; ++ph)
+ {
+ for (int pw = w_start; pw < w_end; ++pw)
+ {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) =
+ out_mat.col(out_offset)
+ .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
+ }
+ }
+ }
+ }
+ }
+ const int flat_size = output_shape.FlatSize();
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/SoftMax.h b/compute/cker/include/cker/operation/optimized/SoftMax.h
new file mode 100644
index 000000000..e44f251d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/SoftMax.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+#define __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+  // Validate that the input and output shapes are the same
+ MatchingFlatSize(input_shape, output_shape);
+
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // Compute the exponential first, removing the max coefficient for numerical
+ // stability.
+ out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
+ // We are separating out the exp function so that exp can be vectorized.
+ out_mat = out_mat.array().exp();
+ // Normalize to get the activations.
+ Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse();
+ out_mat.array().rowwise() *= scale;
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/reference/AveragePool.h b/compute/cker/include/cker/operation/reference/AveragePool.h
new file mode 100644
index 000000000..3ddab4b24
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/AveragePool.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
+#define __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start);
+ if (filter_count <= 0)
+ {
+ continue;
+ }
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ float total = 0.f;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ total += input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+ }
+ }
+ const float average = total / (float)filter_count;
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ ActivationFunctionWithMinMax(average, params.float_activation_min,
+ params.float_activation_max);
+ }
+ }
+ }
+ }
+}
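+
+// Illustrative usage sketch (an editor's addition, not part of the upstream kernel): averaging a
+// single 2x2 window. The Shape initializer-list constructor and aggregate value-initialization
+// of PoolParams are assumed.
+inline void ExampleAveragePoolFloat()
+{
+  PoolParams params{};
+  params.stride_width = 1;
+  params.stride_height = 1;
+  params.filter_width = 2;
+  params.filter_height = 2;
+  params.padding_values.width = 0;
+  params.padding_values.height = 0;
+  params.float_activation_min = -1000.0f;
+  params.float_activation_max = 1000.0f;
+
+  const Shape input_shape{1, 2, 2, 1};
+  const Shape output_shape{1, 1, 1, 1};
+  const float input_data[4] = {1, 2, 3, 4};
+  float output_data[1] = {};
+
+  AveragePool(params, input_shape, input_data, output_shape, output_data);
+  // output_data[0] == 2.5f, the mean of the 2x2 window.
+}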
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/reference/MaxPool.h b/compute/cker/include/cker/operation/reference/MaxPool.h
new file mode 100644
index 000000000..a0f0263c7
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/MaxPool.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_MAX_POOL_H__
+#define __NNFW_CKER_REFERENCE_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ float max = std::numeric_limits<float>::lowest();
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+ }
+ }
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ ActivationFunctionWithMinMax(max, params.float_activation_min,
+ params.float_activation_max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/reference/SoftMax.h b/compute/cker/include/cker/operation/reference/SoftMax.h
new file mode 100644
index 000000000..420cb319b
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/SoftMax.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_SOFTMAX_H__
+#define __NNFW_CKER_REFERENCE_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ // Find max element value which we'll use to ensure numerical stability
+ // taking advantage of the following equality:
+ // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+ float max = std::numeric_limits<float>::lowest();
+ for (int c = 0; c < depth; ++c)
+ {
+ max = std::max(max, input_data[i * depth + c]);
+ }
+
+ // Compute sum.
+ float sum = 0.f;
+ for (int c = 0; c < depth; ++c)
+ {
+ sum += std::exp((input_data[i * depth + c] - max) * params.beta);
+ }
+
+ // Compute result.
+ for (int c = 0; c < depth; ++c)
+ {
+ output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) * params.beta) / sum;
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_SOFTMAX_H__
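The comment in reference::Softmax relies on the identity exp(x[i]) / sum(exp(x)) == exp(x[i] + C) / sum(exp(x + C)) with C = -max. A minimal standalone sketch of that trick, using hypothetical logits and taking beta as 1 for simplicity (no cker types involved):

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    int main()
    {
      // Hypothetical logits large enough that a naive exp(x[i]) would overflow.
      const float x[3] = {1000.0f, 1001.0f, 1002.0f};
      const float max = *std::max_element(x, x + 3);

      float sum = 0.f;
      for (int i = 0; i < 3; ++i)
        sum += std::exp(x[i] - max); // shifted exponentials stay finite

      float y[3];
      for (int i = 0; i < 3; ++i)
        y[i] = std::exp(x[i] - max) / sum;

      // The shift cancels out, so the probabilities still sum to 1.
      assert(std::fabs(y[0] + y[1] + y[2] - 1.0f) < 1e-5f);
      return 0;
    }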
diff --git a/compute/ncnn/CMakeLists.txt b/compute/ncnn/CMakeLists.txt
new file mode 100644
index 000000000..a8f50120f
--- /dev/null
+++ b/compute/ncnn/CMakeLists.txt
@@ -0,0 +1,34 @@
+if(NOT BUILD_SRCN_KERNEL)
+ message(STATUS "SRCN kernel library build: disabled")
+ return()
+else(NOT BUILD_SRCN_KERNEL)
+ message(STATUS "SRCN kernel library build: OK")
+endif()
+
+# Find and use pre-installed OpenMP
+find_package(OpenMP QUIET)
+if(NOT OpenMP_FOUND)
+ return()
+endif(NOT OpenMP_FOUND)
+
+file(GLOB_RECURSE SOURCES src/*.cc)
+file(GLOB_RECURSE TESTS src/*_test.cc)
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_library(nnfw_lib_srcn STATIC ${SOURCES})
+target_include_directories(nnfw_lib_srcn PUBLIC include)
+if(NOT TARGET OpenMP::OpenMP_CXX)
+ find_package(Threads REQUIRED)
+ add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE)
+ set_property(TARGET OpenMP::OpenMP_CXX
+ PROPERTY INTERFACE_COMPILE_OPTIONS ${OpenMP_CXX_FLAGS})
+ # Only works if the same flag is passed to the linker; use CMake 3.9+ otherwise (Intel, AppleClang)
+ set_property(TARGET OpenMP::OpenMP_CXX
+ PROPERTY INTERFACE_LINK_LIBRARIES ${OpenMP_CXX_FLAGS} Threads::Threads)
+
+endif()
+target_link_libraries(nnfw_lib_srcn PRIVATE OpenMP::OpenMP_CXX)
+target_link_libraries(nnfw_lib_srcn PRIVATE nnfw_common)
+target_compile_definitions(nnfw_lib_srcn PRIVATE TIZEN) # ANDROID or TIZEN
+#target_compile_definitions(nnfw_lib_srcn PRIVATE NCNN) # Enable if ready
+set_target_properties(nnfw_lib_srcn PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/compute/ncnn/README.md b/compute/ncnn/README.md
new file mode 100644
index 000000000..5c39d249a
--- /dev/null
+++ b/compute/ncnn/README.md
@@ -0,0 +1,9 @@
+### NCNN compute library
+
+This compute library is based on the NCNN project (https://github.com/Tencent/ncnn) with custom optimizations.
+
+Current base commit: https://github.com/Tencent/ncnn/commit/0219f507b71bdb945d776c8586c162f2c22bba54
+
+Files added for custom optimization are placed in:
+- Headers: include/ncnn/srcn
+- Sources: src/srcn
diff --git a/compute/ncnn/include/ncnn/layer/binaryop.h b/compute/ncnn/include/ncnn/layer/binaryop.h
new file mode 100644
index 000000000..4ccfd94b4
--- /dev/null
+++ b/compute/ncnn/include/ncnn/layer/binaryop.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef __NCNN_LAYER_BINARYOP_H__
+#define __NCNN_LAYER_BINARYOP_H__
+
+#include "ncnn/mat.h"
+
+namespace nnfw
+{
+namespace ncnn
+{
+
+enum class BinaryOp
+{
+ Operation_ADD = 0,
+ Operation_SUB = 1,
+ Operation_MUL = 2,
+ Operation_DIV = 3,
+ Operation_MAX = 4,
+ Operation_MIN = 5,
+ Operation_POW = 6,
+ Operation_SQUAREDDIFFERENCE = 7
+};
+
+struct BinaryOpParam
+{
+ BinaryOp op_type;
+ float b;
+
+ BinaryOpParam() : op_type{BinaryOp::Operation_ADD}, b{0.0f} {}
+};
+
+int ncnn_binary_op(const BinaryOpParam &param, const Mat &bottom_blob, const Mat &bottom_blob1,
+ Mat &top_blob);
+// TODO Inplace function porting
+// int ncnn_binary_op_inplace(const BinaryParam &param, Mat &bottom_top_blob) const;
+// int ncnn_binary_op_inplace(const BinaryOpParam &param, std::vector<Mat> &bottom_top_blobs) const;
+
+} // namespace ncnn
+} // namespace nnfw
+
+#endif // __NCNN_LAYER_BINARYOP_H__
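A usage sketch for ncnn_binary_op under the constraints enforced in binaryop.cc later in this patch: only Operation_ADD is accepted, broadcasting is rejected, and on the 3-D/3-D path the output blob is not allocated by the callee, so the caller creates it. Shapes and fill values below are hypothetical:

    #include "ncnn/layer/binaryop.h"

    using nnfw::ncnn::BinaryOpParam;
    using nnfw::ncnn::Mat;

    int main()
    {
      // Two 3-D blobs of identical shape (hypothetical 4x4x2, float32).
      Mat a(4, 4, 2);
      Mat b(4, 4, 2);
      a.fill(1.0f);
      b.fill(2.0f);

      // Pre-allocated output, since the ADD 3-D/3-D path skips allocation.
      Mat c(4, 4, 2);

      BinaryOpParam param; // defaults to Operation_ADD
      return nnfw::ncnn::ncnn_binary_op(param, a, b, c); // c = a + b element-wise
    }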
diff --git a/compute/ncnn/include/ncnn/layer/instance_norm.h b/compute/ncnn/include/ncnn/layer/instance_norm.h
new file mode 100644
index 000000000..b7d89281d
--- /dev/null
+++ b/compute/ncnn/include/ncnn/layer/instance_norm.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef __NNFW_LAYER_INSTANCE_NORM_H_
+#define __NNFW_LAYER_INSTANCE_NORM_H_
+
+#include "ncnn/mat.h"
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace nnfw
+{
+namespace ncnn
+{
+
+void ncnn_instance_norm_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+ int channels, float eps);
+
+void ncnn_instance_norm_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+ int channels, float eps);
+
+void ncnn_instance_norm_with_relu_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+ int channels, float eps, float slope);
+
+void ncnn_instance_norm_with_relu_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+ int channels, float eps, float slope);
+
+} // namespace ncnn
+
+} // namespace nnfw
+
+#endif // __NNFW_LAYER_INSTANCE_NORM_H_
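A usage sketch for the instance-normalization entry points declared above. Shapes, eps, and the row-major choice are hypothetical; gamma and beta are treated as per-channel vectors, which is what the channels argument suggests:

    #include "ncnn/layer/instance_norm.h"

    using nnfw::ncnn::Mat;

    int main()
    {
      const int channels = 2;

      Mat in(8, 8, channels);
      Mat out(8, 8, channels);
      Mat gamma(channels); // per-channel scale
      Mat beta(channels);  // per-channel shift

      in.fill(1.0f);
      gamma.fill(1.0f);
      beta.fill(0.0f);

      // Plain instance norm; the *_with_relu_* variants take an extra slope
      // argument, presumably for a leaky-ReLU applied after normalization.
      nnfw::ncnn::ncnn_instance_norm_rowmajor(in, out, gamma, beta, channels, 1e-5f);
      return 0;
    }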
diff --git a/compute/ncnn/include/ncnn/mat.h b/compute/ncnn/include/ncnn/mat.h
new file mode 100644
index 000000000..2a577939d
--- /dev/null
+++ b/compute/ncnn/include/ncnn/mat.h
@@ -0,0 +1,738 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_NCNN_MAT_H__
+#define __NNFW_NCNN_MAT_H__
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+namespace nnfw
+{
+namespace ncnn
+{
+
+// the three-dimensional matrix
+class Mat
+{
+public:
+ // empty
+ Mat();
+ // vec
+ Mat(int w, size_t elemsize = 4);
+ // image
+ Mat(int w, int h, size_t elemsize = 4);
+ // dim
+ Mat(int w, int h, int c, size_t elemsize = 4);
+ // copy
+ Mat(const Mat &m);
+ // external vec
+ Mat(int w, void *data, size_t elemsize = 4);
+ // external image
+ Mat(int w, int h, void *data, size_t elemsize = 4);
+ // external dim
+ Mat(int w, int h, int c, void *data, size_t elemsize = 4);
+ // release
+ ~Mat();
+ // assign
+ Mat &operator=(const Mat &m);
+ // set all
+ void fill(float v);
+ template <typename T> void fill(T v);
+ // deep copy
+ Mat clone() const;
+ // reshape vec
+ Mat reshape(int w) const;
+ // reshape image
+ Mat reshape(int w, int h) const;
+ // reshape dim
+ Mat reshape(int w, int h, int c) const;
+ // allocate vec
+ void create(int w, size_t elemsize = 4);
+ // allocate image
+ void create(int w, int h, size_t elemsize = 4);
+// allocate dim
+#ifdef _MEMORY_TO_TIME_
+ void create(int w, int h, int c, size_t elemsize = 4, bool isNew = false);
+#else
+ void create(int w, int h, int c, size_t elemsize = 4);
+#endif
+#ifdef USE_OPENCL_INSIDE
+ void create_empity_mat(int _w, int _h, int _c, size_t _elemsize);
+#endif
+
+ // refcount++
+ void addref();
+ // refcount--
+ void release();
+
+ bool empty() const;
+ size_t total() const;
+
+ // data reference
+ Mat channel(int c);
+ const Mat channel(int c) const;
+ float *row(int y);
+ const float *row(int y) const;
+ template <typename T> T *row(int y);
+ template <typename T> const T *row(int y) const;
+
+ // access raw data
+ template <typename T> operator T *();
+ template <typename T> operator const T *() const;
+
+ // convenient access float vec element
+ float &operator[](int i);
+ const float &operator[](int i) const;
+
+ enum
+ {
+ PIXEL_CONVERT_SHIFT = 16,
+ PIXEL_FORMAT_MASK = 0x0000ffff,
+ PIXEL_CONVERT_MASK = 0xffff0000,
+
+ PIXEL_RGB = 1,
+ PIXEL_BGR = (1 << 1),
+ PIXEL_GRAY = (1 << 2),
+ PIXEL_RGBA = (1 << 3),
+
+ PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+ PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+
+ PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+ PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+
+ PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+ PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+
+ PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+ PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+ PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+ };
+
+#ifdef _MEMORY_TO_TIME_
+ static void from_pixels(const unsigned char *pixels, Mat &m, int type, int w, int h);
+ static void from_pixels(const unsigned char *pixels, Mat &m, int type, int w, int h, int top,
+ int bottom, int left, int right);
+#endif // _MEMORY_TO_TIME_
+
+ // convenient construct from pixel data
+ static Mat from_pixels(const unsigned char *pixels, int type, int w, int h);
+  // convenient construct from pixel data with padding added; currently only the PIXEL_RGB2BGR
+  // and PIXEL_BGR2RGB conversions are supported
+ static Mat from_pixels(const unsigned char *pixels, int type, int w, int h, int top, int bottom,
+ int left, int right);
+ // convenient construct from pixel data and resize to specific size
+ static Mat from_pixels_resize(const unsigned char *pixels, int type, int w, int h,
+ int target_width, int target_height);
+
+ // convenient export to pixel data
+ void to_pixels(unsigned char *pixels, int type);
+  // convenient export to pixel data with the padding cut; currently only the PIXEL_RGB2BGR and
+  // PIXEL_BGR2RGB conversions are supported
+ void to_pixels(unsigned char *pixels, int type, int top, int bottom, int left, int right);
+ // convenient export to pixel data and resize to specific size
+ void to_pixels_resize(unsigned char *pixels, int type, int target_width, int target_height);
+
+  // subtract channel-wise mean values, then multiply by normalization values; pass 0 to skip
+ void substract_mean_normalize(const float *mean_vals, const float *norm_vals);
+
+  // convenient construct from half precision floating point data
+ static Mat from_float16(const unsigned short *data, int size);
+
+ // pointer to the data
+ void *data;
+
+ // pointer to the reference counter
+ // when points to user-allocated data, the pointer is NULL
+ int *refcount;
+
+ // element size in bytes
+ // 4 = float32/int32
+ // 2 = float16
+ // 1 = int8/uint8
+ // 0 = empty
+ size_t elemsize;
+
+ // the dimensionality
+ int dims;
+
+ int w;
+ int h;
+ int c;
+
+ size_t cstep;
+};
+
+// misc function
+// image pixel bilinear resize
+void resize_bilinear_c1(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w,
+ int h);
+void resize_bilinear_c3(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w,
+ int h);
+void resize_bilinear_c4(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w,
+ int h);
+
+// mat process
+enum
+{
+ BORDER_CONSTANT = 0,
+ BORDER_REPLICATE = 1,
+};
+void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type,
+ float v);
+void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right);
+void resize_bilinear(const Mat &src, Mat &dst, int w, int h);
+
+// the alignment of all the allocated buffers
+#define MALLOC_ALIGN 16
+
+// Aligns a pointer to the specified number of bytes
+// ptr Aligned pointer
+// n Alignment size that must be a power of two
+template <typename _Tp> static inline _Tp *alignPtr(_Tp *ptr, int n = (int)sizeof(_Tp))
+{
+ return (_Tp *)(((size_t)ptr + n - 1) & -n);
+}
+
+// Aligns a buffer size to the specified number of bytes
+// The function returns the minimum number that is greater or equal to sz and is divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static inline size_t alignSize(size_t sz, int n) { return (sz + n - 1) & -n; }
+
+static inline void *fastMalloc(size_t size)
+{
+ unsigned char *udata = (unsigned char *)malloc(size + sizeof(void *) + MALLOC_ALIGN);
+ if (!udata)
+ return 0;
+ unsigned char **adata = alignPtr((unsigned char **)udata + 1, MALLOC_ALIGN);
+ adata[-1] = udata;
+ return adata;
+}
+
+static inline void fastFree(void *ptr)
+{
+ if (ptr)
+ {
+ unsigned char *udata = ((unsigned char **)ptr)[-1];
+ free(udata);
+ }
+}
+
+// exchange-add operation for atomic operations on reference counters
+#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) \
+ (int)_InterlockedExchangeAdd(const_cast<void *>(reinterpret_cast<volatile void *>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && \
+ !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) \
+ __c11_atomic_fetch_add((_Atomic(int) *)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int) *)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) \
+ (int)__atomic_fetch_add((unsigned *)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned *)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#include <intrin.h>
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile *)addr, delta)
+#else
+static inline int NCNN_XADD(int *addr, int delta)
+{
+ int tmp = *addr;
+ *addr += delta;
+ return tmp;
+}
+#endif
+
+inline Mat::Mat() : data(0), refcount(0), elemsize(0), dims(0), w(0), h(0), c(0), cstep(0) {}
+
+inline Mat::Mat(int _w, size_t _elemsize) : data(0), refcount(0), dims(0) { create(_w, _elemsize); }
+
+inline Mat::Mat(int _w, int _h, size_t _elemsize) : data(0), refcount(0), dims(0)
+{
+ create(_w, _h, _elemsize);
+}
+
+inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize) : data(0), refcount(0), dims(0)
+{
+ create(_w, _h, _c, _elemsize);
+}
+
+inline Mat::Mat(const Mat &m)
+ : data(m.data), refcount(m.refcount), elemsize(m.elemsize), dims(m.dims)
+{
+ if (refcount)
+ NCNN_XADD(refcount, 1);
+
+ w = m.w;
+ h = m.h;
+ c = m.c;
+
+ cstep = m.cstep;
+}
+
+inline Mat::Mat(int _w, void *_data, size_t _elemsize)
+ : data(_data), refcount(0), elemsize(_elemsize), dims(1)
+{
+ w = _w;
+ h = 1;
+ c = 1;
+
+ cstep = w;
+}
+
+inline Mat::Mat(int _w, int _h, void *_data, size_t _elemsize)
+ : data(_data), refcount(0), elemsize(_elemsize), dims(2)
+{
+ w = _w;
+ h = _h;
+ c = 1;
+
+ cstep = w * h;
+}
+
+inline Mat::Mat(int _w, int _h, int _c, void *_data, size_t _elemsize)
+ : data(_data), refcount(0), elemsize(_elemsize), dims(3)
+{
+ w = _w;
+ h = _h;
+ c = _c;
+
+ cstep = alignSize(w * h * elemsize, 16) / elemsize;
+}
+
+inline Mat::~Mat() { release(); }
+
+inline Mat &Mat::operator=(const Mat &m)
+{
+ if (this == &m)
+ return *this;
+
+ if (m.refcount)
+ NCNN_XADD(m.refcount, 1);
+
+ release();
+
+ data = m.data;
+ refcount = m.refcount;
+ elemsize = m.elemsize;
+
+ dims = m.dims;
+ w = m.w;
+ h = m.h;
+ c = m.c;
+
+ cstep = m.cstep;
+
+ return *this;
+}
+
+inline void Mat::fill(float _v)
+{
+ int size = total();
+ float *ptr = (float *)data;
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _c = vdupq_n_f32(_v);
+#if __aarch64__
+ if (nn > 0)
+ {
+ asm volatile("0: \n"
+ "subs %w0, %w0, #1 \n"
+ "st1 {%4.4s}, [%1], #16 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "w"(_c) // %4
+ : "cc", "memory");
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile("0: \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {%e4-%f4}, [%1 :128]!\n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "w"(_c) // %4
+ : "cc", "memory");
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain > 0; remain--)
+ {
+ *ptr++ = _v;
+ }
+}
+
+template <typename T> inline void Mat::fill(T _v)
+{
+ int size = total();
+ T *ptr = (T *)data;
+ for (int i = 0; i < size; i++)
+ {
+ ptr[i] = _v;
+ }
+}
+
+inline Mat Mat::clone() const
+{
+ if (empty())
+ return Mat();
+
+ Mat m;
+ if (dims == 1)
+ m.create(w, elemsize);
+ else if (dims == 2)
+ m.create(w, h, elemsize);
+ else if (dims == 3)
+ m.create(w, h, c, elemsize);
+
+ if (total() > 0)
+ {
+ memcpy(m.data, data, total() * elemsize);
+ }
+
+ return m;
+}
+
+inline Mat Mat::reshape(int _w) const
+{
+ if (w * h * c != _w)
+ return Mat();
+
+ if (dims == 3 && cstep != (size_t)w * h)
+ {
+ Mat m;
+ m.create(_w, elemsize);
+
+ // flatten
+ for (int i = 0; i < c; i++)
+ {
+ const void *ptr = (unsigned char *)data + i * cstep * elemsize;
+ void *mptr = (unsigned char *)m.data + i * w * h * elemsize;
+ memcpy(mptr, ptr, w * h * elemsize);
+ }
+
+ return m;
+ }
+
+ Mat m = *this;
+
+ m.dims = 1;
+ m.w = _w;
+ m.h = 1;
+ m.c = 1;
+
+ m.cstep = _w;
+
+ return m;
+}
+
+inline Mat Mat::reshape(int _w, int _h) const
+{
+ if (w * h * c != _w * _h)
+ return Mat();
+
+ if (dims == 3 && cstep != (size_t)w * h)
+ {
+ Mat m;
+ m.create(_w, _h, elemsize);
+
+ // flatten
+ for (int i = 0; i < c; i++)
+ {
+ const void *ptr = (unsigned char *)data + i * cstep * elemsize;
+ void *mptr = (unsigned char *)m.data + i * w * h * elemsize;
+ memcpy(mptr, ptr, w * h * elemsize);
+ }
+
+ return m;
+ }
+
+ Mat m = *this;
+
+ m.dims = 2;
+ m.w = _w;
+ m.h = _h;
+ m.c = 1;
+
+ m.cstep = _w * _h;
+
+ return m;
+}
+
+inline Mat Mat::reshape(int _w, int _h, int _c) const
+{
+ if (w * h * c != _w * _h * _c)
+ return Mat();
+
+ if (dims < 3)
+ {
+ if ((size_t)_w * _h != alignSize(_w * _h * elemsize, 16) / elemsize)
+ {
+ Mat m;
+ m.create(_w, _h, _c, elemsize);
+
+ // align channel
+ for (int i = 0; i < _c; i++)
+ {
+ const void *ptr = (unsigned char *)data + i * _w * _h * elemsize;
+ void *mptr = (unsigned char *)m.data + i * m.cstep * m.elemsize;
+ memcpy(mptr, ptr, _w * _h * elemsize);
+ }
+
+ return m;
+ }
+ }
+ else if (c != _c)
+ {
+ // flatten and then align
+ Mat tmp = reshape(_w * _h * _c);
+ return tmp.reshape(_w, _h, _c);
+ }
+
+ Mat m = *this;
+
+ m.dims = 3;
+ m.w = _w;
+ m.h = _h;
+ m.c = _c;
+
+ m.cstep = alignSize(_w * _h * elemsize, 16) / elemsize;
+
+ return m;
+}
+
+inline void Mat::create(int _w, size_t _elemsize)
+{
+ if (dims == 1 && w == _w && elemsize == _elemsize)
+ return;
+
+ release();
+
+ elemsize = _elemsize;
+
+ dims = 1;
+ w = _w;
+ h = 1;
+ c = 1;
+
+ cstep = w;
+
+ if (total() > 0)
+ {
+ size_t totalsize = total() * elemsize;
+ data = fastMalloc(totalsize + (int)sizeof(*refcount));
+ refcount = (int *)(((unsigned char *)data) + totalsize);
+ *refcount = 1;
+ }
+}
+
+inline void Mat::create(int _w, int _h, size_t _elemsize)
+{
+ if (dims == 2 && w == _w && h == _h && elemsize == _elemsize)
+ return;
+
+ release();
+
+ elemsize = _elemsize;
+
+ dims = 2;
+ w = _w;
+ h = _h;
+ c = 1;
+
+ cstep = w * h;
+
+ if (total() > 0)
+ {
+ size_t totalsize = total() * elemsize;
+ data = fastMalloc(totalsize + (int)sizeof(*refcount));
+ refcount = (int *)(((unsigned char *)data) + totalsize);
+ *refcount = 1;
+ }
+}
+
+#ifdef _MEMORY_TO_TIME_
+inline void Mat::create(int _w, int _h, int _c, size_t _elemsize, bool isNew)
+{
+ if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize)
+ return;
+
+ if (!isNew && dims == 3)
+ {
+ elemsize = _elemsize;
+
+ w = _w;
+ h = _h;
+ c = _c;
+
+ cstep = alignSize(w * h * elemsize, 16) / elemsize;
+ return;
+ }
+
+ release();
+
+ elemsize = _elemsize;
+
+ dims = 3;
+ w = _w;
+ h = _h;
+ c = _c;
+
+ cstep = alignSize(w * h * elemsize, 16) / elemsize;
+
+ if (total() > 0)
+ {
+ size_t totalsize = total() * elemsize;
+ data = fastMalloc(totalsize + (int)sizeof(*refcount));
+ refcount = (int *)(((unsigned char *)data) + totalsize);
+ *refcount = 1;
+ }
+}
+
+#else
+inline void Mat::create(int _w, int _h, int _c, size_t _elemsize)
+{
+ if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize)
+ return;
+
+ release();
+
+ elemsize = _elemsize;
+
+ dims = 3;
+ w = _w;
+ h = _h;
+ c = _c;
+
+ cstep = alignSize(w * h * elemsize, 16) / elemsize;
+
+ if (total() > 0)
+ {
+ size_t totalsize = total() * elemsize;
+ data = fastMalloc(totalsize + (int)sizeof(*refcount));
+ refcount = (int *)(((unsigned char *)data) + totalsize);
+ *refcount = 1;
+ }
+}
+#endif //_MEMORY_TO_TIME_
+
+#ifdef USE_OPENCL_INSIDE
+inline void Mat::create_empity_mat(int _w, int _h, int _c, size_t _elemsize)
+{
+ if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize)
+ return;
+
+ release();
+
+ elemsize = _elemsize;
+
+ dims = 3;
+ w = _w;
+ h = _h;
+ c = _c;
+
+ cstep = alignSize(w * h * elemsize, 16) / elemsize;
+ data = NULL;
+}
+#endif // USE_OPENCL_INSIDE
+
+inline void Mat::addref()
+{
+ if (refcount)
+ NCNN_XADD(refcount, 1);
+}
+
+inline void Mat::release()
+{
+ if (refcount && NCNN_XADD(refcount, -1) == 1)
+ fastFree(data);
+
+ data = 0;
+
+ elemsize = 0;
+
+ dims = 0;
+ w = 0;
+ h = 0;
+ c = 0;
+
+ cstep = 0;
+
+ refcount = 0;
+}
+
+inline bool Mat::empty() const { return data == 0 || total() == 0; }
+
+inline size_t Mat::total() const { return cstep * c; }
+
+inline Mat Mat::channel(int c)
+{
+ return Mat(w, h, (unsigned char *)data + cstep * c * elemsize, elemsize);
+}
+
+inline const Mat Mat::channel(int c) const
+{
+ return Mat(w, h, (unsigned char *)data + cstep * c * elemsize, elemsize);
+}
+
+inline float *Mat::row(int y) { return (float *)data + w * y; }
+
+inline const float *Mat::row(int y) const { return (const float *)data + w * y; }
+
+template <typename T> inline T *Mat::row(int y) { return (T *)data + w * y; }
+
+template <typename T> inline const T *Mat::row(int y) const { return (const T *)data + w * y; }
+
+template <typename T> inline Mat::operator T *() { return (T *)data; }
+
+template <typename T> inline Mat::operator const T *() const { return (const T *)data; }
+
+inline float &Mat::operator[](int i) { return ((float *)data)[i]; }
+
+inline const float &Mat::operator[](int i) const { return ((const float *)data)[i]; }
+
+} // namespace ncnn
+} // namespace nnfw
+
+#endif // __NNFW_NCNN_MAT_H__
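A small sketch of the reference-counted Mat semantics declared above, with hypothetical sizes: copy construction shares storage and bumps the refcount, clone() deep-copies, and channel() returns a non-owning view into one plane:

    #include "ncnn/mat.h"

    using nnfw::ncnn::Mat;

    int main()
    {
      Mat m(8, 8, 3); // owning 3-D blob, elemsize defaults to 4 (float32)
      m.fill(0.5f);

      Mat shared = m;       // shallow copy: same data pointer, refcount becomes 2
      Mat deep = m.clone(); // deep copy: fresh allocation with its own refcount

      Mat plane = m.channel(1);      // non-owning view of the second channel
      float first = plane.row(0)[0]; // == 0.5f

      (void)shared;
      (void)deep;
      return first == 0.5f ? 0 : 1;
    }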
diff --git a/compute/ncnn/include/ncnn/srcn/conv_type.h b/compute/ncnn/include/ncnn/srcn/conv_type.h
new file mode 100644
index 000000000..59152a094
--- /dev/null
+++ b/compute/ncnn/include/ncnn/srcn/conv_type.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_CONV_TYPE_H__
+#define __NNFW_SRCN_CONV_TYPE_H__
+
+namespace nnfw
+{
+namespace srcn
+{
+
+enum convType_t
+{
+ row_major = 0,
+ col_major
+};
+
+struct convMat_t
+{
+ int w;
+ int h;
+ int c;
+ int n;
+ float *data;
+};
+
+struct convParams_t
+{
+ int kernel_w;
+ int kernel_h;
+ int stride_w;
+ int stride_h;
+ int dilation_w;
+ int dilation_h;
+ int padding;
+ int pad_w;
+ int pad_h;
+};
+
+struct winogradParams_t
+{
+ int kernel_w;
+ int kernel_h;
+ int stride_w;
+ int stride_h;
+ int dilation_w;
+ int dilation_h;
+ int batch;
+ int w;
+ int h;
+ int inch;
+ int outch;
+ int num_threads;
+ convType_t conv_type;
+ float *weight_data;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_CONV_TYPE_H__
diff --git a/compute/ncnn/include/ncnn/srcn/srcn_conv.h b/compute/ncnn/include/ncnn/srcn/srcn_conv.h
new file mode 100644
index 000000000..11130c0db
--- /dev/null
+++ b/compute/ncnn/include/ncnn/srcn/srcn_conv.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_CONV_H__
+#define __NNFW_SRCN_CONV_H__
+
+#include "conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+int check_winograd(winogradParams_t &params);
+
+float *trans_weight2winograd(winogradParams_t &params, unsigned int *size = NULL);
+
+void winograd_release(float *winograd_weight);
+
+void srcn_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, const float *winograd_weight, int num_threads,
+ convType_t conv_type);
+
+void srcn_deconvolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, int num_threads, convType_t conv_type);
+
+void *trans_weight2sparse(const convMat_t &weights_mat);
+
+void sparse_release(const int outch, void *ptr);
+
+void srcn_sparse_convolution2D(const convMat_t &in_mat, convMat_t &out_mat,
+ const convParams_t &in_param, const void *sparse_weight,
+                               int num_threads, convType_t conv_type);
+
+void srcn_batch_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat,
+ convMat_t &out_mat, const convParams_t &in_param,
+ const float *winograd_weight, int num_threads, convType_t conv_type);
+
+void srcn_convolution2D_gpu(const convMat_t &in_mat, const convMat_t &weights_mat,
+ convMat_t &out_mat, const convParams_t &in_param, convType_t conv_type);
+
+void srcn_convolution2D_dpu(const convMat_t &in_mat, const convMat_t &weights_mat,
+ convMat_t &out_mat, const convParams_t &in_param, convType_t conv_type);
+
+void srcn_depthwise_conv(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convMat_t &bias, const convParams_t &in_param, int num_threads,
+ convType_t conv_type);
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_CONV_H__
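A sketch of the Winograd flow implied by the declarations above: fill winogradParams_t, ask check_winograd whether the configuration qualifies, transform the weights once, convolve, then release the transformed weights. All dimensions are hypothetical, error handling is omitted, and the idea that a null winograd_weight selects the direct path is an assumption:

    #include "ncnn/srcn/srcn_conv.h"

    using namespace nnfw::srcn;

    void run_conv(const convMat_t &in, const convMat_t &weights, convMat_t &out)
    {
      // Hypothetical 3x3, stride-1, pad-1 convolution.
      convParams_t conv_param = {3, 3, 1, 1, 1, 1, /*padding*/ 1, /*pad_w*/ 1, /*pad_h*/ 1};

      winogradParams_t wino_param = {3, 3, 1, 1, 1, 1,
                                     /*batch*/ in.n, in.w, in.h, in.c, out.c,
                                     /*num_threads*/ 4, row_major, weights.data};

      float *winograd_weight = nullptr;
      if (check_winograd(wino_param))
        winograd_weight = trans_weight2winograd(wino_param);

      // Assumption: a null winograd_weight makes srcn_convolution2D take the direct path.
      srcn_convolution2D(in, weights, out, conv_param, winograd_weight, /*num_threads*/ 4,
                         row_major);

      if (winograd_weight)
        winograd_release(winograd_weight);
    }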
diff --git a/compute/ncnn/src/layer/arm/neon_mathfun.h b/compute/ncnn/src/layer/arm/neon_mathfun.h
new file mode 100644
index 000000000..6e3cb66c8
--- /dev/null
+++ b/compute/ncnn/src/layer/arm/neon_mathfun.h
@@ -0,0 +1,315 @@
+/* NEON implementation of sin, cos, exp and log
+ *
+ * Inspired by Intel Approximate Math library, and based on the
+ * corresponding algorithms of the cephes math library
+ */
+
+/* Copyright (C) 2011 Julien Pommier
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * (this is the zlib license)
+ */
+
+#include <arm_neon.h>
+
+#define c_inv_mant_mask ~0x7f800000u
+#define c_cephes_SQRTHF 0.707106781186547524
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 -1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 -1.2420140846E-1
+#define c_cephes_log_p4 +1.4249322787E-1
+#define c_cephes_log_p5 -1.6668057665E-1
+#define c_cephes_log_p6 +2.0000714765E-1
+#define c_cephes_log_p7 -2.4999993993E-1
+#define c_cephes_log_p8 +3.3333331174E-1
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375
+
+/* natural logarithm computed for 4 simultaneous float
+ * return NaN for x <= 0
+ */
+static inline float32x4_t log_ps(float32x4_t x)
+{
+ float32x4_t one = vdupq_n_f32(1);
+
+ x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+ uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+
+ int32x4_t ux = vreinterpretq_s32_f32(x);
+
+ int32x4_t emm0 = vshrq_n_s32(ux, 23);
+
+ /* keep only the fractional part */
+ ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
+ ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+ x = vreinterpretq_f32_s32(ux);
+
+ emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
+ float32x4_t e = vcvtq_f32_s32(emm0);
+
+ e = vaddq_f32(e, one);
+
+ /* part2:
+ * if( x < SQRTHF ) {
+ * e -= 1;
+ * x = x + x - 1.0;
+ * } else { x = x - 1.0; }
+ */
+ uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+ float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+ x = vsubq_f32(x, one);
+ e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
+ x = vaddq_f32(x, tmp);
+
+ float32x4_t z = vmulq_f32(x, x);
+
+ float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
+ y = vmulq_f32(y, x);
+
+ y = vmulq_f32(y, z);
+
+ tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
+ y = vaddq_f32(y, tmp);
+
+ tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
+ y = vsubq_f32(y, tmp);
+
+ tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+ x = vaddq_f32(x, y);
+ x = vaddq_f32(x, tmp);
+ x = vreinterpretq_f32_u32(
+ vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+ return x;
+}
+
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+
+#define c_cephes_LOG2EF 1.44269504088896341
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+
+/* exp() computed for 4 float at once */
+static inline float32x4_t exp_ps(float32x4_t x)
+{
+ float32x4_t tmp, fx;
+
+ float32x4_t one = vdupq_n_f32(1);
+ x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+ x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+
+ /* express exp(x) as exp(g + n*log(2)) */
+ fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+
+ /* perform a floorf */
+ tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, subtract 1 */
+ uint32x4_t mask = vcgtq_f32(tmp, fx);
+ mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
+
+ fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+ tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
+ float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
+ x = vsubq_f32(x, tmp);
+ x = vsubq_f32(x, z);
+
+ static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2,
+ c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5};
+ float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0);
+ float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1);
+ float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2);
+ float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3);
+ float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4);
+ float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5);
+
+ y = vmulq_f32(y, x);
+ z = vmulq_f32(x, x);
+
+ y = vaddq_f32(y, c1);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, c2);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, c3);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, c4);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, c5);
+
+ y = vmulq_f32(y, z);
+ y = vaddq_f32(y, x);
+ y = vaddq_f32(y, one);
+
+ /* build 2^n */
+ int32x4_t mm;
+ mm = vcvtq_s32_f32(fx);
+ mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
+ mm = vshlq_n_s32(mm, 23);
+ float32x4_t pow2n = vreinterpretq_f32_s32(mm);
+
+ y = vmulq_f32(y, pow2n);
+ return y;
+}
+
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1 8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+#define c_coscof_p0 2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2 4.166664568298827E-002
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+
+/* evaluation of 4 sines & cosines at once.
+ *
+ * The code is the exact rewriting of the cephes sinf function.
+ * Precision is excellent as long as x < 8192 (I did not bother to
+ * take into account the special handling they have for greater values
+ * -- it does not return garbage for arguments over 8192, though, but
+ * the extra precision is missing).
+ *
+ * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+ * surprising but correct result.
+ *
+ * Note also that when you compute sin(x), cos(x) is available at
+ * almost no extra cost, so both sin_ps and cos_ps make use of
+ * sincos_ps.
+ */
+static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos)
+{
+ // any x
+ float32x4_t xmm1, xmm2, xmm3, y;
+
+ uint32x4_t emm2;
+
+ uint32x4_t sign_mask_sin, sign_mask_cos;
+ sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+ x = vabsq_f32(x);
+
+ /* scale by 4/Pi */
+ y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+
+ /* store the integer part of y in mm0 */
+ emm2 = vcvtq_u32_f32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+ emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+ y = vcvtq_f32_u32(emm2);
+
+  /* get the polynomial selection mask
+   * there is one polynomial for 0 <= x <= Pi/4
+   * and another one for Pi/4 < x <= Pi/2
+   *
+   * Both branches will be computed.
+   */
+ uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ * x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
+ xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
+ xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
+ x = vaddq_f32(x, xmm1);
+ x = vaddq_f32(x, xmm2);
+ x = vaddq_f32(x, xmm3);
+
+ sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
+ sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
+
+  /* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
+   * and the second polynomial (Pi/4 <= x <= Pi/2) in y2 */
+ float32x4_t z = vmulq_f32(x, x);
+ float32x4_t y1, y2;
+
+ y1 = vmulq_n_f32(z, c_coscof_p0);
+ y2 = vmulq_n_f32(z, c_sincof_p0);
+ y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
+ y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
+ y1 = vmulq_f32(y1, z);
+ y2 = vmulq_f32(y2, z);
+ y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
+ y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
+ y1 = vmulq_f32(y1, z);
+ y2 = vmulq_f32(y2, z);
+ y1 = vmulq_f32(y1, z);
+ y2 = vmulq_f32(y2, x);
+ y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
+ y2 = vaddq_f32(y2, x);
+ y1 = vaddq_f32(y1, vdupq_n_f32(1));
+
+  /* select the correct result from the two polynomials */
+ float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
+ float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
+ *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+ *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+}
+
+static inline float32x4_t sin_ps(float32x4_t x)
+{
+ float32x4_t ysin, ycos;
+ sincos_ps(x, &ysin, &ycos);
+ return ysin;
+}
+
+static inline float32x4_t cos_ps(float32x4_t x)
+{
+ float32x4_t ysin, ycos;
+ sincos_ps(x, &ysin, &ycos);
+ return ycos;
+}
+
+static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
+{
+ float32x4_t reciprocal = vrecpeq_f32(b);
+ reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
+ // reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
+ return vmulq_f32(a, reciprocal);
+}
+
+static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
+{
+ // pow(x, m) = exp(m * log(x))
+ return exp_ps(vmulq_f32(b, log_ps(a)));
+}
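A quick ARM-NEON-only sanity-check sketch for the vectorized routines above, comparing exp_ps and log_ps lane-by-lane against libm; the include path and the tolerance are assumptions:

    #include <arm_neon.h>
    #include <cmath>
    #include <cstdio>

    #include "arm/neon_mathfun.h" // assumed include path, as used by binaryop.cc

    int main()
    {
      const float in[4] = {0.1f, 0.5f, 1.0f, 2.0f};
      float out_exp[4], out_log[4];

      float32x4_t v = vld1q_f32(in);
      vst1q_f32(out_exp, exp_ps(v));
      vst1q_f32(out_log, log_ps(v));

      for (int i = 0; i < 4; ++i)
      {
        // The polynomial approximations track libm closely for these inputs.
        if (std::fabs(out_exp[i] - std::exp(in[i])) > 1e-4f ||
            std::fabs(out_log[i] - std::log(in[i])) > 1e-4f)
        {
          std::printf("mismatch at lane %d\n", i);
          return 1;
        }
      }
      return 0;
    }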
diff --git a/compute/ncnn/src/layer/binaryop.cc b/compute/ncnn/src/layer/binaryop.cc
new file mode 100644
index 000000000..a09d55f78
--- /dev/null
+++ b/compute/ncnn/src/layer/binaryop.cc
@@ -0,0 +1,1641 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "ncnn/layer/binaryop.h"
+#include <math.h>
+#include <algorithm>
+#include <functional>
+#include <stdexcept>
+#include <sys/time.h>
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#include "arm/neon_mathfun.h"
+#endif // __ARM_NEON
+
+namespace nnfw
+{
+namespace ncnn
+{
+
+template <typename Op> static int binary_op(const Mat &a, const Mat &b, Mat &c)
+{
+ Op op;
+
+ int w = a.w;
+ int h = a.h;
+ int channels = a.c;
+ int size = w * h;
+
+ int w1 = b.w;
+ int h1 = b.h;
+ int channels1 = b.c;
+ int size1 = w1 * h1;
+
+ if (a.dims == 3)
+ {
+ c.create(w, h, channels);
+ if (c.empty())
+ return -100;
+
+ if (b.dims == 3)
+ {
+ if (b.w == 1 && b.h == 1)
+ {
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = a.channel(q);
+ const float *ptr1 = b.channel(q);
+ float *outptr = c.channel(q);
+
+ float tt = *ptr1;
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = op(ptr[i], tt);
+ }
+ }
+
+ return 0;
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = a.channel(q);
+ const float *ptr1 = b.channel(q);
+ float *outptr = c.channel(q);
+
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = op(ptr[i], ptr1[i]);
+ }
+ }
+
+ return 0;
+ }
+
+ if (b.dims == 2)
+ {
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = a.channel(q);
+ const float *ptr1 = (const float *)b + h * q;
+ float *outptr = c.channel(q);
+
+ for (int y = 0; y < h; y++)
+ {
+ const float b0 = ptr1[y];
+ for (int x = 0; x < w; x++)
+ {
+ outptr[x] = op(ptr[x], b0);
+ }
+
+ ptr += w;
+ outptr += w;
+ }
+ }
+
+ return 0;
+ }
+
+ if (b.dims == 1)
+ {
+ if (b.w == 1)
+ {
+ const float b0 = b[0];
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = a.channel(q);
+ float *outptr = c.channel(q);
+
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = op(ptr[i], b0);
+ }
+ }
+
+ return 0;
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = a.channel(q);
+ const float b0 = b[q];
+ float *outptr = c.channel(q);
+
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = op(ptr[i], b0);
+ }
+ }
+
+ return 0;
+ }
+ }
+ else if (a.dims == 2)
+ {
+ if (b.dims == 3)
+ {
+ c.create(w1, h1, channels1);
+ if (c.empty())
+ return -100;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float *ptr = (const float *)a + h1 * q;
+ const float *ptr1 = b.channel(q);
+ float *outptr = c.channel(q);
+
+ for (int y = 0; y < h1; y++)
+ {
+ const float a0 = ptr[y];
+ for (int x = 0; x < w1; x++)
+ {
+ outptr[x] = op(a0, ptr1[x]);
+ }
+
+ ptr1 += w1;
+ outptr += w1;
+ }
+ }
+
+ return 0;
+ }
+
+ c.create(w, h);
+ if (c.empty())
+ return -100;
+
+ if (b.dims == 2)
+ {
+ for (int i = 0; i < size; i++)
+ {
+ c[i] = op(a[i], b[i]);
+ }
+
+ return 0;
+ }
+
+ if (b.dims == 1)
+ {
+ c.create(w, h);
+ if (c.empty())
+ return -100;
+
+ if (b.w == 1)
+ {
+ const float b0 = b[0];
+ for (int i = 0; i < size; i++)
+ {
+ c[i] = op(a[i], b0);
+ }
+
+ return 0;
+ }
+
+ const float *ptr = a;
+ float *outptr = c;
+
+ for (int y = 0; y < h; y++)
+ {
+ const float b0 = b[y];
+ for (int x = 0; x < w; x++)
+ {
+ outptr[x] = op(ptr[x], b0);
+ }
+
+ ptr += w;
+ outptr += w;
+ }
+
+ return 0;
+ }
+ }
+ else if (a.dims == 1)
+ {
+ if (a.w == 1)
+ {
+ if (b.dims == 3)
+ {
+ c.create(w1, h1, channels1);
+ if (c.empty())
+ return -100;
+
+ const float a0 = a[0];
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float *ptr1 = b.channel(q);
+ float *outptr = c.channel(q);
+
+ for (int i = 0; i < size1; i++)
+ {
+ outptr[i] = op(a0, ptr1[i]);
+ }
+ }
+
+ return 0;
+ }
+
+ if (b.dims == 2)
+ {
+ c.create(w1, h1);
+ if (c.empty())
+ return -100;
+
+ const float a0 = a[0];
+ for (int i = 0; i < size1; i++)
+ {
+ c[i] = op(a0, b[i]);
+ }
+
+ return 0;
+ }
+
+ if (b.dims == 1)
+ {
+ c.create(w1);
+ if (c.empty())
+ return -100;
+
+ const float a0 = a[0];
+ for (int i = 0; i < size1; i++)
+ {
+ c[i] = op(a0, b[i]);
+ }
+
+ return 0;
+ }
+ }
+
+ if (b.dims == 3)
+ {
+ c.create(w1, h1, channels1);
+ if (c.empty())
+ return -100;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float a0 = a[q];
+ const float *ptr1 = b.channel(q);
+ float *outptr = c.channel(q);
+
+ for (int i = 0; i < size1; i++)
+ {
+ outptr[i] = op(a0, ptr1[i]);
+ }
+ }
+
+ return 0;
+ }
+
+ if (b.dims == 2)
+ {
+ c.create(w1, h1);
+ if (c.empty())
+ return -100;
+
+ const float *ptr1 = b;
+ float *outptr = c;
+
+ for (int y = 0; y < h1; y++)
+ {
+ const float a0 = a[y];
+ for (int x = 0; x < w1; x++)
+ {
+ outptr[x] = op(a0, ptr1[x]);
+ }
+
+ ptr1 += w1;
+ outptr += w1;
+ }
+
+ return 0;
+ }
+
+ if (b.dims == 1)
+ {
+ c.create(w);
+ if (c.empty())
+ return -100;
+
+ if (b.w == 1)
+ {
+ const float b0 = b[0];
+ for (int i = 0; i < size; i++)
+ {
+ c[i] = op(a[i], b0);
+ }
+
+ return 0;
+ }
+
+ for (int i = 0; i < size; i++)
+ {
+ c[i] = op(a[i], b[i]);
+ }
+ }
+ }
+
+ return 0;
+}
+
+template <typename Op> static int binary_op_scalar_inplace(Mat &a, float b)
+{
+ Op op;
+
+ int w = a.w;
+ int h = a.h;
+ int channels = a.c;
+ int size = w * h;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ float *ptr = a.channel(q);
+
+ for (int i = 0; i < size; i++)
+ {
+ ptr[i] = op(ptr[i], b);
+ }
+ }
+
+ return 0;
+}
+
+template <typename T> struct binary_op_max : std::binary_function<T, T, T>
+{
+ T operator()(const T &x, const T &y) const { return std::max(x, y); }
+};
+
+template <typename T> struct binary_op_min : std::binary_function<T, T, T>
+{
+ T operator()(const T &x, const T &y) const { return std::min(x, y); }
+};
+
+template <typename T> struct binary_op_pow : std::binary_function<T, T, T>
+{
+ T operator()(const T &x, const T &y) const { return pow(x, y); }
+};
+
+template <typename T> struct binary_op_SquaredDifference : std::binary_function<T, T, T>
+{
+ T operator()(const T &x, const T &y) const { return pow((x - y), 2); }
+};
+
+int ncnn_binary_op(const BinaryOpParam &param, const Mat &bottom_blob, const Mat &bottom_blob1,
+ Mat &top_blob)
+{
+ int ret = 0;
+ auto op_type = param.op_type;
+ // auto b = param.b;
+
+  // Only the ADD operation without broadcasting is supported.
+  // For other cases, the internal memory allocation must be removed and correctness re-checked.
+ if (op_type != BinaryOp::Operation_ADD)
+ {
+ throw std::runtime_error{"NYI: Only support ADD operation"};
+ }
+ if (bottom_blob.dims != bottom_blob1.dims)
+ {
+ throw std::runtime_error{"NYI: Cannot use broadcasting"};
+ }
+
+// printf("-------------------BinaryOp---------------\n");
+
+// printf("op_type = %d, ", op_type);
+// printf("in1: (%d, %d, %d), dims = %d, ", bottom_blob.w, bottom_blob.h, bottom_blob.c,
+// bottom_blob.dims);
+// printf("in2: (%d, %d, %d), dims = %d\n", bottom_blob1.w, bottom_blob1.h, bottom_blob1.c,
+// bottom_blob1.dims);
+
+#if __ARM_NEON
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ int w1 = bottom_blob1.w;
+ int h1 = bottom_blob1.h;
+ int channels1 = bottom_blob1.c;
+ int size1 = w1 * h1;
+
+ if (op_type == BinaryOp::Operation_ADD)
+ {
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+ // Fix for nnfw: disable allocation for output
+ // top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
+ {
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+ float tt = *ptr1;
+
+ float32x4_t _p2 = vdupq_n_f32(tt);
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+
+ _p1 = vaddq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 + tt);
+ in1++;
+ out++;
+ }
+
+#else
+ float tt = *ptr1;
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = (ptr[i] + tt);
+ }
+#endif
+ }
+
+ ret = 0;
+ }
+ else
+ {
+ if (size * bottom_blob.elemsize % 16 != 0)
+ {
+ throw std::runtime_error{"Unmatched alignment"};
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *in2 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vld1q_f32(in2);
+
+ _p1 = vaddq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ in2 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = *in1 + *in2;
+ in1++;
+ in2++;
+ out++;
+ }
+ }
+ }
+ }
+ else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
+ {
+ top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1)
+ {
+ ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
+ // return ret;
+ goto out;
+ }
+ float *pt = (float *)bottom_blob1.data;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float b0 = pt[q];
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vdupq_n_f32(b0);
+
+ _p1 = vaddq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 + b0);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w1, h1, channels1);
+ if (top_blob.empty())
+ return -100;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float a0 = bottom_blob[q];
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size1 >> 2;
+ int remain = size1 - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vdupq_n_f32(a0);
+ float32x4_t _p2 = vld1q_f32(in1);
+
+ _p1 = vaddq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (a0 + *in1);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else
+ ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
+ }
+
+#if 0 // Disable operation except Operation_ADD
+
+ if (op_type == BinaryOp::Operation_SUB)
+ {
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w, h, channels);
+
+ if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
+ {
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+ float tt = *ptr1;
+
+ float32x4_t _p2 = vdupq_n_f32(tt);
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 - tt);
+ in1++;
+ out++;
+ }
+
+#else
+ float tt = *ptr1;
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = (ptr[i] - tt);
+ }
+#endif
+ }
+
+ ret = 0;
+ }
+ else
+ {
+ top_blob.create(w, h, channels);
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *in2 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vld1q_f32(in2);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ in2 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = *in1 - *in2;
+ in1++;
+ in2++;
+ out++;
+ }
+ }
+ }
+ }
+ else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
+ {
+ top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1)
+ {
+ ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
+ // return ret;
+ goto out;
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float b0 = bottom_blob1[q];
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vdupq_n_f32(b0);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 - b0);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w1, h1, channels1);
+ if (top_blob.empty())
+ return -100;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float a0 = bottom_blob[q];
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size1 >> 2;
+ int remain = size1 - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vdupq_n_f32(a0);
+ float32x4_t _p2 = vld1q_f32(in1);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (a0 - *in1);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else
+ ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
+ }
+
+ if (op_type == BinaryOp::Operation_MUL)
+ {
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w, h, channels);
+
+ if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
+ {
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+ float tt = *ptr1;
+
+ float32x4_t _p2 = vdupq_n_f32(tt);
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+
+ _p1 = vmulq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 * tt);
+ in1++;
+ out++;
+ }
+
+#else
+ float tt = *ptr1;
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = (ptr[i] * tt);
+ }
+#endif
+ }
+
+ ret = 0;
+ }
+ else
+ {
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *in2 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vld1q_f32(in2);
+
+ _p1 = vmulq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ in2 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = *in1 * *in2;
+ in1++;
+ in2++;
+ out++;
+ }
+ }
+ }
+ }
+ else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
+ {
+ top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1)
+ {
+ ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
+ // return ret;
+ goto out;
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float b0 = bottom_blob1[q];
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vdupq_n_f32(b0);
+
+ _p1 = vmulq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 * b0);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w1, h1, channels1);
+ if (top_blob.empty())
+ return -100;
+
+ if (bottom_blob.w != bottom_blob1.c)
+ {
+ ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
+ goto out;
+ }
+
+ float *pt = (float *)bottom_blob.data;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float a0 = pt[q];
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size1 >> 2;
+ int remain = size1 - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vdupq_n_f32(a0);
+ float32x4_t _p2 = vld1q_f32(in1);
+
+ _p1 = vmulq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (a0 * *in1);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else
+ ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
+ }
+
+ if (op_type == BinaryOp::Operation_DIV)
+ {
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
+ {
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+ float tt = *ptr1;
+
+ float32x4_t _p2 = vdupq_n_f32(tt);
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+
+ float32x4_t _p3 = vrecpeq_f32(_p2);
+ _p3 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
+ _p1 = vmulq_f32(_p1, _p3);
+
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 / tt);
+ in1++;
+ out++;
+ }
+
+#else
+ float tt = *ptr1;
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = (ptr[i] / tt);
+ }
+#endif
+ }
+
+ // return 0;
+ goto out;
+ }
+ else
+ {
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *in2 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vld1q_f32(in2);
+
+ float32x4_t _p3 = vrecpeq_f32(_p2);
+ _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
+ _p1 = vmulq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ in2 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = *in1 / *in2;
+ in1++;
+ in2++;
+ out++;
+ }
+ }
+ }
+ }
+ else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
+ {
+ top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1)
+ {
+ ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
+ // return ret;
+ goto out;
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float b0 = bottom_blob1[q];
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vdupq_n_f32(b0);
+
+ //_p1 = vsubq_f32(_p1, _p2);
+ float32x4_t _p3 = vrecpeq_f32(_p2);
+ _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
+ _p1 = vmulq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 / b0);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w1, h1, channels1);
+ if (top_blob.empty())
+ return -100;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float a0 = bottom_blob[q];
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size1 >> 2;
+ int remain = size1 - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vdupq_n_f32(a0);
+ float32x4_t _p2 = vld1q_f32(in1);
+
+ //_p1 = vsubq_f32(_p1, _p2);
+ float32x4_t _p3 = vrecpeq_f32(_p2);
+ _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
+ _p1 = vmulq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (a0 / *in1);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else
+ ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
+ }
+
+ if (op_type == BinaryOp::Operation_MAX)
+ ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_MIN)
+ ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_POW)
+ {
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w, h, channels);
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *in2 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vld1q_f32(in2);
+
+ _p1 = pow_ps(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ in2 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = pow(*in1, *in2);
+ in1++;
+ in2++;
+ out++;
+ }
+ }
+ }
+ else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
+ {
+ top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1)
+ {
+ ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
+ // return ret;
+ goto out;
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float b0 = bottom_blob1[q];
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vdupq_n_f32(b0);
+
+ _p1 = pow_ps(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = pow(*in1, b0);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w1, h1, channels1);
+ if (top_blob.empty())
+ return -100;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float a0 = bottom_blob[q];
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size1 >> 2;
+ int remain = size1 - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vdupq_n_f32(a0);
+ float32x4_t _p2 = vld1q_f32(in1);
+
+ _p1 = pow_ps(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = pow(a0, *in1);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else
+ ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
+ }
+
+ if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
+ {
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w, h, channels);
+
+ if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
+ {
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+ float tt = *ptr1;
+
+ float32x4_t _p2 = vdupq_n_f32(tt);
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ _p1 = vmulq_f32(_p1, _p1);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ float t2 = *in1 - tt;
+ *out = t2 * t2;
+ in1++;
+ out++;
+ }
+
+#else
+ float tt = *ptr1;
+ for (int i = 0; i < size; i++)
+ {
+ float t2 = (ptr[i] - tt);
+ outptr[i] = t2 * t2;
+ }
+#endif
+ }
+
+ ret = 0;
+ }
+ else
+ {
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *in2 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vld1q_f32(in2);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ _p1 = vmulq_f32(_p1, _p1);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ in2 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 - *in2) * (*in1 - *in2);
+ in1++;
+ in2++;
+ out++;
+ }
+ }
+ }
+ }
+ else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
+ {
+ top_blob.create(w, h, channels);
+ if (bottom_blob1.w == 1)
+ {
+ ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
+ // return ret;
+ goto out;
+ }
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const float *ptr = bottom_blob.channel(q);
+ const float b0 = bottom_blob1[q];
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vdupq_n_f32(b0);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ _p1 = vmulq_f32(_p1, _p1);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (*in1 - b0) * (*in1 - b0);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
+ {
+ top_blob.create(w1, h1, channels1);
+ if (top_blob.empty())
+ return -100;
+
+#pragma omp parallel for
+ for (int q = 0; q < channels1; q++)
+ {
+ const float a0 = bottom_blob[q];
+ const float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size1 >> 2;
+ int remain = size1 - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vdupq_n_f32(a0);
+ float32x4_t _p2 = vld1q_f32(in1);
+
+ _p1 = vsubq_f32(_p1, _p2);
+ _p1 = vmulq_f32(_p1, _p1);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = (a0 - *in1) * (a0 - *in1);
+ in1++;
+ out++;
+ }
+ }
+ }
+ else
+ ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
+ }
+
+#endif // 0 (Disable operation except Operation_ADD)
+
+#else
+
+ if (op_type == BinaryOp::Operation_ADD)
+ ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_SUB)
+ ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_MUL)
+ ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_DIV)
+ ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_MAX)
+ ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_MIN)
+ ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob);
+
+ if (op_type == BinaryOp::Operation_POW)
+ ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
+ if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
+ ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
+#endif
+
+/*
+for (int p = 0; p < top_blob.c && p < 5; p++)
+{
+ float* outptr = top_blob.channel(p);
+ printf("channel: %d\n", p);
+ for (int i = 0; i < 1; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ printf("%f ", outptr[j]);
+ }
+ printf("\n");
+ outptr += top_blob.w;
+ }
+}
+printf("----------------------------\n");
+*/
+
+out:
+ return ret;
+}
+
+int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_top_blob)
+{
+ auto op_type = param.op_type;
+ auto b = param.b;
+
+ // printf("-------------------BinaryOp-----forward_inplace----------\n");
+ if (op_type == BinaryOp::Operation_ADD)
+ return binary_op_scalar_inplace<std::plus<float>>(bottom_top_blob, b);
+
+ if (op_type == BinaryOp::Operation_SUB)
+ return binary_op_scalar_inplace<std::minus<float>>(bottom_top_blob, b);
+
+ if (op_type == BinaryOp::Operation_MUL)
+ return binary_op_scalar_inplace<std::multiplies<float>>(bottom_top_blob, b);
+
+ if (op_type == BinaryOp::Operation_DIV)
+ return binary_op_scalar_inplace<std::divides<float>>(bottom_top_blob, b);
+
+ if (op_type == BinaryOp::Operation_MAX)
+ return binary_op_scalar_inplace<binary_op_max<float>>(bottom_top_blob, b);
+
+ if (op_type == BinaryOp::Operation_MIN)
+ return binary_op_scalar_inplace<binary_op_min<float>>(bottom_top_blob, b);
+
+ if (op_type == BinaryOp::Operation_POW)
+ return binary_op_scalar_inplace<binary_op_pow<float>>(bottom_top_blob, b);
+
+ if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
+ return binary_op_scalar_inplace<binary_op_SquaredDifference<float>>(bottom_top_blob, b);
+
+ return 0;
+}
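+
+// A minimal usage sketch for the scalar in-place entry point above; the blob
+// dimensions and the scalar operand are illustrative only.
+#if 0
+static void example_binary_op_scalar_inplace()
+{
+  BinaryOpParam param;
+  param.op_type = BinaryOp::Operation_ADD;
+  param.b = 0.5f; // scalar operand applied to every element
+
+  Mat blob;
+  blob.create(16, 16, 4); // w, h, channels
+  if (!blob.empty())
+  {
+    int ret = ncnn_binary_op_inplace(param, blob); // returns 0 on success
+    (void)ret;
+  }
+}
+#endif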
+
+int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_blob, Mat &bottom_top_blob)
+{
+ int ret = 0;
+
+ Mat &bottom_blob1 = bottom_top_blob;
+ Mat &top_blob = bottom_top_blob;
+ auto op_type = param.op_type;
+
+ if (op_type == BinaryOp::Operation_ADD)
+ {
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+// Unused variables
+// int w1 = bottom_blob1.w;
+// int h1 = bottom_blob1.h;
+// int channels1 = bottom_blob1.c;
+// int size1 = w1 * h1;
+
+#if __ARM_NEON
+
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ float *ptr = bottom_blob.channel(q);
+ float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+
+ float *in1 = const_cast<float *>(ptr);
+ float *in2 = const_cast<float *>(ptr1);
+ float *out = const_cast<float *>(outptr);
+
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _p1 = vld1q_f32(in1);
+ float32x4_t _p2 = vld1q_f32(in2);
+
+ _p1 = vaddq_f32(_p1, _p2);
+ vst1q_f32(out, _p1);
+ in1 += 4;
+ in2 += 4;
+ out += 4;
+ }
+ for (; remain > 0; remain--)
+ {
+ *out = *in1 + *in2;
+ in1++;
+ in2++;
+ out++;
+ }
+ }
+ }
+#else
+ if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+ {
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ float *ptr = bottom_blob.channel(q);
+ float *ptr1 = bottom_blob1.channel(q);
+ float *outptr = top_blob.channel(q);
+
+ for (int i = 0; i < size; i++)
+ {
+ outptr[i] = ptr[i] + ptr1[i];
+ }
+ }
+ return 0;
+ }
+#endif
+ }
+ else
+ {
+ return -1;
+ }
+ return ret;
+}
+
+} // namespace ncnn
+} // namespace nnfw
diff --git a/compute/ncnn/src/layer/instance_norm.cc b/compute/ncnn/src/layer/instance_norm.cc
new file mode 100644
index 000000000..08c3f2c23
--- /dev/null
+++ b/compute/ncnn/src/layer/instance_norm.cc
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "ncnn/layer/instance_norm.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <math.h>
+#include "ncnn/mat.h"
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace nnfw
+{
+namespace ncnn
+{
+
+void ncnn_instance_norm_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+ int channels, float eps)
+{
+  // x = (x - mean) / sqrt(var + eps) * gamma + beta
+
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int size = w * h;
+#ifdef __ARM_NEON
+ int nn = size >> 2;
+ int left4 = size & 3;
+#endif
+
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+#ifdef __ARM_NEON
+ float *in_ptr = in_mat.channel(q);
+ float *out_ptr = out_mat.channel(q);
+ float32x4_t _sum = vdupq_n_f32(0.f);
+ float32x4_t _sq_sum = vdupq_n_f32(0.f);
+ for (int n = nn; n > 0; n--)
+ {
+ float32x4_t _p = vld1q_f32(in_ptr);
+ _sum = vaddq_f32(_sum, _p);
+ _p = vmulq_f32(_p, _p);
+ _sq_sum = vaddq_f32(_sq_sum, _p);
+ in_ptr += 4;
+ }
+ float sum = vgetq_lane_f32(_sum, 0) + vgetq_lane_f32(_sum, 1);
+ sum += vgetq_lane_f32(_sum, 2);
+ sum += vgetq_lane_f32(_sum, 3);
+ float sqsum = vgetq_lane_f32(_sq_sum, 0) + vgetq_lane_f32(_sq_sum, 1);
+ sqsum += vgetq_lane_f32(_sq_sum, 2);
+ sqsum += vgetq_lane_f32(_sq_sum, 3);
+
+ for (int left = left4; left > 0; left--)
+ {
+ sum += *in_ptr;
+ sqsum += (*in_ptr) * (*in_ptr);
+ in_ptr++;
+ }
+
+ float mean = sum / size;
+ float var = sqsum / size - mean * mean;
+ float gamma = gamma_mat[q];
+ float beta = beta_mat[q];
+ float a = gamma / (sqrt(var + eps));
+ float b = -mean * a + beta;
+
+ in_ptr = in_mat.channel(q);
+ float32x4_t _a = vdupq_n_f32(a);
+ float32x4_t _b = vdupq_n_f32(b);
+ for (int n = nn; n > 0; n--)
+ {
+ float32x4_t _p = vld1q_f32(in_ptr);
+ _p = vmulq_f32(_p, _a);
+ _p = vaddq_f32(_p, _b);
+ vst1q_f32(out_ptr, _p);
+ in_ptr += 4;
+ out_ptr += 4;
+ }
+ for (int left = left4; left > 0; left--)
+ {
+ *out_ptr = (*in_ptr) * a + b;
+ in_ptr++;
+ out_ptr++;
+ }
+#else
+ float *in_ptr = in_mat.channel(q);
+ float *out_ptr = out_mat.channel(q);
+ // mean and var
+ float sum = 0.f;
+ float sqsum = 0.f;
+ for (int i = 0; i < size; i++)
+ {
+ sum += in_ptr[i];
+ sqsum += in_ptr[i] * in_ptr[i];
+ }
+ float mean = sum / size;
+ float var = sqsum / size - mean * mean;
+
+ float gamma = gamma_mat[q];
+ float beta = beta_mat[q];
+
+ float a = gamma / (sqrt(var + eps));
+ float b = -mean * a + beta;
+ for (int i = 0; i < size; i++)
+ {
+ out_ptr[i] = in_ptr[i] * a + b;
+ }
+#endif
+ }
+}
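+
+// Both paths above fold the normalization into a single per-channel affine
+// transform, which is why only one multiply-add per element remains after the
+// statistics pass:
+//
+//   y = (x - mean) / sqrt(var + eps) * gamma + beta
+//     = x * (gamma / sqrt(var + eps)) + (beta - mean * (gamma / sqrt(var + eps)))
+//     = x * a + b,  with  a = gamma / sqrt(var + eps)  and  b = -mean * a + beta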
+
+void ncnn_instance_norm_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+ int /*channels*/, float eps)
+{
+ // Treat CHW layout as HWC layout
+ int h = in_mat.c;
+ int w = in_mat.h;
+ int c = in_mat.w;
+
+ int size = w * h;
+ int total = size * c;
+
+ float sum[c] = {};
+ float sqsum[c] = {};
+
+ float mean[c] = {};
+ float var[c] = {};
+ float a[c] = {};
+ float b[c] = {};
+
+ float *in_ptr = in_mat.channel(0);
+ float *out_ptr = out_mat.channel(0);
+
+#pragma omp parallel for reduction(+ : sum, sqsum) schedule(guided)
+ for (int i = 0; i < total; i += c)
+ {
+ for (int j = 0; j < c; j++)
+ {
+ sum[j] += in_ptr[i + j];
+ sqsum[j] += in_ptr[i + j] * in_ptr[i + j];
+ }
+ }
+
+ for (int i = 0; i < c; i++)
+ {
+ mean[i] = sum[i] / size;
+ var[i] = sqsum[i] / size - mean[i] * mean[i];
+ a[i] = gamma_mat[i] / (sqrt(var[i] + eps));
+ b[i] = -mean[i] * a[i] + beta_mat[i];
+ }
+
+#pragma omp parallel for schedule(guided)
+ for (int i = 0; i < total; i += c)
+ {
+ for (int j = 0; j < c; j++)
+ {
+ out_ptr[i + j] = in_ptr[i + j] * a[j] + b[j];
+ }
+ }
+}
+
+void ncnn_instance_norm_with_relu_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+                                           int channels, float eps, float slope)
+{
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int size = w * h;
+#ifdef __ARM_NEON
+ int nn = size >> 2;
+ int left4 = size & 3;
+#endif
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+#ifdef __ARM_NEON
+ float *in_ptr = in_mat.channel(q);
+ float *out_ptr = out_mat.channel(q);
+ float32x4_t _sum = vdupq_n_f32(0.f);
+ float32x4_t _sq_sum = vdupq_n_f32(0.f);
+ for (int n = nn; n > 0; n--)
+ {
+ float32x4_t _p = vld1q_f32(in_ptr);
+ _sum = vaddq_f32(_sum, _p);
+ _p = vmulq_f32(_p, _p);
+ _sq_sum = vaddq_f32(_sq_sum, _p);
+ in_ptr += 4;
+ }
+ // float sum =
+ // vgetq_lane_f32(_sum,0)+vgetq_lane_f32(_sum,1)+vgetq_lane_f32(_sum,2)+vgetq_lane_f32(_sum,3);
+ // float sqsum = vgetq_lane_f32(_sq_sum,0)+vgetq_lane_f32(_sq_sum,1)+
+ // vgetq_lane_f32(_sq_sum,2)+vgetq_lane_f32(_sq_sum,3);
+ float sum = vgetq_lane_f32(_sum, 0) + vgetq_lane_f32(_sum, 1);
+ sum += vgetq_lane_f32(_sum, 2);
+ sum += vgetq_lane_f32(_sum, 3);
+ float sqsum = vgetq_lane_f32(_sq_sum, 0) + vgetq_lane_f32(_sq_sum, 1);
+ sqsum += vgetq_lane_f32(_sq_sum, 2);
+ sqsum += vgetq_lane_f32(_sq_sum, 3);
+ for (int left = left4; left > 0; left--)
+ {
+ sum += *in_ptr;
+ sqsum += (*in_ptr) * (*in_ptr);
+ in_ptr++;
+ }
+
+ float mean = sum / size;
+ float var = sqsum / size - mean * mean;
+ float gamma = gamma_mat[q];
+ float beta = beta_mat[q];
+ float a = gamma / (sqrt(var + eps));
+ float b = -mean * a + beta;
+    // TODO: slope is unused in this path; it only handles plain ReLU, where slope is always 0.
+ in_ptr = in_mat.channel(q);
+ float32x4_t _a = vdupq_n_f32(a);
+ float32x4_t _b = vdupq_n_f32(b);
+ float32x4_t _zero = vdupq_n_f32(0.f);
+ for (int n = nn; n > 0; n--)
+ {
+ float32x4_t _p = vld1q_f32(in_ptr);
+ _p = vmulq_f32(_p, _a);
+ _p = vaddq_f32(_p, _b);
+ _p = vmaxq_f32(_p, _zero);
+ vst1q_f32(out_ptr, _p);
+ in_ptr += 4;
+ out_ptr += 4;
+ }
+ for (int left = left4; left > 0; left--)
+ {
+      float temp = (*in_ptr) * a + b;
+ *out_ptr = temp > 0 ? temp : 0;
+ in_ptr++;
+ out_ptr++;
+ }
+#else
+ float *in_ptr = in_mat.channel(q);
+ float *out_ptr = out_mat.channel(q);
+
+ // mean and var
+ float sum = 0.f;
+ float sqsum = 0.f;
+ for (int i = 0; i < size; i++)
+ {
+ sum += in_ptr[i];
+ sqsum += in_ptr[i] * in_ptr[i];
+ }
+ float mean = sum / size;
+ float var = sqsum / size - mean * mean;
+
+ float gamma = gamma_mat[q];
+ float beta = beta_mat[q];
+
+ float a = gamma / (sqrt(var + eps));
+ float b = -mean * a + beta;
+
+ if (slope == 0.f)
+ {
+ for (int i = 0; i < size; i++)
+ {
+ float temp = in_ptr[i] * a + b;
+ out_ptr[i] = temp > 0 ? temp : 0;
+ }
+ }
+ else
+ {
+ for (int i = 0; i < size; i++)
+ {
+ float temp = in_ptr[i] * a + b;
+ out_ptr[i] = temp > 0 ? temp : temp * slope;
+ }
+ }
+#endif
+ }
+}
+
+void ncnn_instance_norm_with_relu_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+ int /*channels*/, float eps, float slope)
+{
+ // Treat CHW layout as HWC layout
+ int h = in_mat.c;
+ int w = in_mat.h;
+ int c = in_mat.w;
+
+ int size = w * h;
+ int total = size * c;
+
+ float sum[c] = {};
+ float sqsum[c] = {};
+
+ float mean[c] = {};
+ float var[c] = {};
+ float a[c] = {};
+ float b[c] = {};
+
+ float *in_ptr = in_mat.channel(0);
+ float *out_ptr = out_mat.channel(0);
+
+#pragma omp parallel for reduction(+ : sum, sqsum) schedule(guided)
+ for (int i = 0; i < total; i += c)
+ {
+ for (int j = 0; j < c; j++)
+ {
+ sum[j] += in_ptr[i + j];
+ sqsum[j] += in_ptr[i + j] * in_ptr[i + j];
+ }
+ }
+
+ for (int i = 0; i < c; i++)
+ {
+ mean[i] = sum[i] / size;
+ var[i] = sqsum[i] / size - mean[i] * mean[i];
+ a[i] = gamma_mat[i] / (sqrt(var[i] + eps));
+ b[i] = -mean[i] * a[i] + beta_mat[i];
+ }
+
+ if (slope == 0.f)
+ {
+#pragma omp parallel for schedule(guided)
+ for (int i = 0; i < total; i += c)
+ {
+ for (int j = 0; j < c; j++)
+ {
+ float temp = in_ptr[i + j] * a[j] + b[j];
+ out_ptr[i + j] = temp > 0 ? temp : 0;
+ }
+ }
+ }
+ else
+ {
+#pragma omp parallel for schedule(guided)
+ for (int i = 0; i < total; i += c)
+ {
+ for (int j = 0; j < c; j++)
+ {
+ float temp = in_ptr[i + j] * a[j] + b[j];
+ out_ptr[i + j] = temp > 0 ? temp : temp * slope;
+ }
+ }
+ }
+}
+
+} // namespace ncnn
+
+} // namespace nnfw
diff --git a/compute/ncnn/src/mat.cc b/compute/ncnn/src/mat.cc
new file mode 100644
index 000000000..568378ef7
--- /dev/null
+++ b/compute/ncnn/src/mat.cc
@@ -0,0 +1,940 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ncnn/mat.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// Fix for nnfw: comment out cpu.h
+//#include "cpu.h"
+
+namespace nnfw
+{
+namespace ncnn
+{
+
+void Mat::substract_mean_normalize(const float *mean_vals, const float *norm_vals)
+{
+ int size = w * h;
+
+ if (mean_vals && !norm_vals)
+ {
+// subtract mean only
+#pragma omp parallel for
+ for (int q = 0; q < c; q++)
+ {
+ float *ptr = channel(q); // data + cstep * q;
+ const float mean = mean_vals[q];
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ if (nn > 0)
+ {
+ asm volatile("dup v1.4s, %w4 \n"
+ "0: \n"
+ "prfm pldl1keep, [%1, #128] \n"
+ "ld1 {v0.4s}, [%1] \n"
+ "fsub v0.4s, v0.4s, v1.4s \n"
+ "subs %w0, %w0, #1 \n"
+ "st1 {v0.4s}, [%1], #16 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "r"(mean) // %4
+ : "cc", "memory", "v0", "v1");
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile("vdup.f32 q1, %4 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128] \n"
+ "vsub.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "r"(mean) // %4
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain > 0; remain--)
+ {
+ *ptr -= mean;
+ ptr++;
+ }
+ }
+ }
+ else if (!mean_vals && norm_vals)
+ {
+// normalize only
+#pragma omp parallel for
+ for (int q = 0; q < c; q++)
+ {
+ float *ptr = channel(q); // data + cstep * q;
+ const float norm = norm_vals[q];
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ if (nn > 0)
+ {
+ asm volatile("dup v1.4s, %w4 \n"
+ "0: \n"
+ "prfm pldl1keep, [%1, #128] \n"
+ "ld1 {v0.4s}, [%1] \n"
+ "fmul v0.4s, v0.4s, v1.4s \n"
+ "subs %w0, %w0, #1 \n"
+ "st1 {v0.4s}, [%1], #16 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "r"(norm) // %4
+ : "cc", "memory", "v0", "v1");
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile("vdup.f32 q1, %4 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128] \n"
+ "vmul.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "r"(norm) // %4
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain > 0; remain--)
+ {
+ *ptr *= norm;
+ ptr++;
+ }
+ }
+ }
+ else if (mean_vals && norm_vals)
+ {
+// subtract mean and normalize
+#pragma omp parallel for
+ for (int q = 0; q < c; q++)
+ {
+ float *ptr = channel(q); // data + cstep * q;
+ const float mean = mean_vals[q];
+ const float norm = norm_vals[q];
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ if (nn > 0)
+ {
+ asm volatile("dup v1.4s, %w4 \n"
+ "dup v2.4s, %w5 \n"
+ "0: \n"
+ "prfm pldl1keep, [%1, #128] \n"
+ "ld1 {v0.4s}, [%1] \n"
+ "fsub v0.4s, v0.4s, v1.4s \n"
+ "fmul v0.4s, v0.4s, v2.4s \n"
+ "subs %w0, %w0, #1 \n"
+ "st1 {v0.4s}, [%1], #16 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "r"(mean), // %4
+ "r"(norm) // %5
+ : "cc", "memory", "v0", "v1", "v2");
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile("vdup.f32 q1, %4 \n"
+ "vdup.f32 q2, %5 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128] \n"
+ "vsub.f32 q0, q0, q1 \n"
+ "vmul.f32 q0, q0, q2 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn), "1"(ptr),
+ "r"(mean), // %4
+ "r"(norm) // %5
+ : "cc", "memory", "q0", "q1", "q2");
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain > 0; remain--)
+ {
+ *ptr = (*ptr - mean) * norm;
+ ptr++;
+ }
+ }
+ }
+}
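+
+// A minimal usage sketch for the routine above; the mean/scale values are
+// illustrative only and must match the channel count of the Mat.
+#if 0
+static void example_substract_mean_normalize(Mat &img)
+{
+  // img is assumed to be a 3-channel float Mat (e.g. BGR).
+  static const float mean_vals[3] = {104.f, 117.f, 123.f};
+  static const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+
+  img.substract_mean_normalize(mean_vals, norm_vals); // (x - mean) * norm per channel
+  // Pass NULL for either argument to apply only the other step.
+}
+#endif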
+
+// convert half precision floating point to float
+static float half2float(unsigned short value)
+{
+ // 1 : 5 : 10
+ unsigned short sign = (value & 0x8000) >> 15;
+ unsigned short exponent = (value & 0x7c00) >> 10;
+ unsigned short significand = value & 0x03FF;
+
+ // fprintf(stderr, "%d %d %d\n", sign, exponent, significand);
+
+ // 1 : 8 : 23
+ union {
+ unsigned int u;
+ float f;
+ } tmp;
+ if (exponent == 0)
+ {
+ if (significand == 0)
+ {
+ // zero
+ tmp.u = (sign << 31);
+ }
+ else
+ {
+ // denormal
+ exponent = 0;
+ // find non-zero bit
+ while ((significand & 0x200) == 0)
+ {
+ significand <<= 1;
+ exponent++;
+ }
+ significand <<= 1;
+ significand &= 0x3FF;
+ tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13);
+ }
+ }
+ else if (exponent == 0x1F)
+ {
+ // infinity or NaN
+ tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13);
+ }
+ else
+ {
+ // normalized
+ tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13);
+ }
+
+ return tmp.f;
+}
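+
+// A few reference points for the conversion above (standard IEEE 754 binary16
+// encodings, listed as a sanity check):
+//   half2float(0x3C00) == 1.0f       (sign 0, exponent 15, significand 0)
+//   half2float(0xC000) == -2.0f      (sign 1, exponent 16, significand 0)
+//   half2float(0x7C00) == +infinity  (exponent 0x1F, significand 0)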
+
+Mat Mat::from_float16(const unsigned short *data, int size)
+{
+ Mat m(size);
+ if (m.empty())
+ return m;
+
+ float *ptr = m; //.data;
+
+#if __ARM_NEON && (__ARM_FP & 2)
+  // Fix for nnfw: Always support vfpv4
+ // int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0;
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON && (__ARM_FP & 2)
+#if __aarch64__
+ if (nn > 0)
+ {
+ asm volatile("0: \n"
+ "ld1 {v0.4h}, [%1], #8 \n"
+ "fcvtl v1.4s, v0.4h \n"
+ "subs %w0, %w0, #1 \n"
+ "st1 {v1.4s}, [%2], #16 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(data), // %1
+ "=r"(ptr) // %2
+ : "0"(nn), "1"(data), "2"(ptr)
+ : "cc", "memory", "v0", "v1");
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile("0: \n"
+ "pld [%1, #64] \n"
+ "vld1.s16 {d0}, [%1 :64]! \n"
+ "vcvt.f32.f16 q1, d0 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d2-d3}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(data), // %1
+ "=r"(ptr) // %2
+ : "0"(nn), "1"(data), "2"(ptr)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain > 0; remain--)
+ {
+ *ptr = half2float(*data);
+
+ data++;
+ ptr++;
+ }
+
+ return m;
+}
+
+static void copy_make_border_image(const Mat &src, Mat &dst, int top, int left, int type, float v)
+{
+ int w = dst.w;
+ int h = dst.h;
+
+ const float *ptr = src; //.data;
+ float *outptr = dst; //.data;
+
+ if (type == BORDER_CONSTANT)
+ {
+ int y = 0;
+ // fill top
+ for (; y < top; y++)
+ {
+ int x = 0;
+ for (; x < w; x++)
+ {
+ outptr[x] = v;
+ }
+ outptr += w;
+ }
+ // fill center
+ for (; y < (top + src.h); y++)
+ {
+ int x = 0;
+ for (; x < left; x++)
+ {
+ outptr[x] = v;
+ }
+ if (src.w < 12)
+ {
+ for (; x < (left + src.w); x++)
+ {
+ outptr[x] = ptr[x - left];
+ }
+ }
+ else
+ {
+ memcpy(outptr + left, ptr, src.w * sizeof(float));
+ x += src.w;
+ }
+ for (; x < w; x++)
+ {
+ outptr[x] = v;
+ }
+ ptr += src.w;
+ outptr += w;
+ }
+ // fill bottom
+ for (; y < h; y++)
+ {
+ int x = 0;
+ for (; x < w; x++)
+ {
+ outptr[x] = v;
+ }
+ outptr += w;
+ }
+ }
+ else if (type == BORDER_REPLICATE)
+ {
+ int y = 0;
+ // fill top
+ for (; y < top; y++)
+ {
+ int x = 0;
+ for (; x < left; x++)
+ {
+ outptr[x] = ptr[0];
+ }
+ if (src.w < 12)
+ {
+ for (; x < (left + src.w); x++)
+ {
+ outptr[x] = ptr[x - left];
+ }
+ }
+ else
+ {
+ memcpy(outptr + left, ptr, src.w * sizeof(float));
+ x += src.w;
+ }
+ for (; x < w; x++)
+ {
+ outptr[x] = ptr[src.w - 1];
+ }
+ outptr += w;
+ }
+ // fill center
+ for (; y < (top + src.h); y++)
+ {
+ int x = 0;
+ for (; x < left; x++)
+ {
+ outptr[x] = ptr[0];
+ }
+ if (src.w < 12)
+ {
+ for (; x < (left + src.w); x++)
+ {
+ outptr[x] = ptr[x - left];
+ }
+ }
+ else
+ {
+ memcpy(outptr + left, ptr, src.w * sizeof(float));
+ x += src.w;
+ }
+ for (; x < w; x++)
+ {
+ outptr[x] = ptr[src.w - 1];
+ }
+ ptr += src.w;
+ outptr += w;
+ }
+ // fill bottom
+ ptr -= src.w;
+ for (; y < h; y++)
+ {
+ int x = 0;
+ for (; x < left; x++)
+ {
+ outptr[x] = ptr[0];
+ }
+ if (src.w < 12)
+ {
+ for (; x < (left + src.w); x++)
+ {
+ outptr[x] = ptr[x - left];
+ }
+ }
+ else
+ {
+ memcpy(outptr + left, ptr, src.w * sizeof(float));
+ x += src.w;
+ }
+ for (; x < w; x++)
+ {
+ outptr[x] = ptr[src.w - 1];
+ }
+ outptr += w;
+ }
+ }
+}
+
+#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_)
+static void copy_make_border_image_inplace(const Mat &src, Mat &dst, int top, int left, int type,
+ float v)
+{
+ int w = dst.w;
+ int h = dst.h;
+
+ const float *ptr = src;
+ float *outptr = dst;
+
+ if (type == BORDER_CONSTANT)
+ {
+ // fill bottom
+ int y = src.h + top;
+ outptr += y * w;
+ for (; y < h; y++)
+ {
+ int x = 0;
+ for (; x < w; x++)
+ {
+ outptr[x] = v;
+ }
+ outptr += w;
+ }
+
+ // fill center
+ y = src.h + top - 1;
+ outptr = dst;
+ outptr += y * w;
+ ptr += (src.h - 1) * src.w;
+
+ for (; y >= top; y--)
+ {
+ int x = left + src.w;
+ for (; x < w; x++)
+ {
+ outptr[x] = v;
+ }
+
+ x = left + src.w - 1;
+
+ for (; x >= left; x--)
+ {
+ outptr[x] = ptr[x - left];
+ }
+
+ for (x = 0; x < left; x++)
+ {
+ outptr[x] = v;
+ }
+ ptr -= src.w;
+ outptr -= w;
+ }
+
+ // fill top
+ y = 0;
+ outptr = dst;
+ for (; y < top; y++)
+ {
+ int x = 0;
+ for (; x < w; x++)
+ {
+ outptr[x] = v;
+ }
+ outptr += w;
+ }
+ }
+}
+#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_
+
+void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type,
+ float v)
+{
+ int w = src.w + left + right;
+ int h = src.h + top + bottom;
+
+ if (w == src.w && h == src.h)
+ {
+ dst = src;
+ return;
+ }
+
+ if (src.dims == 2)
+ {
+ dst.create(w, h);
+ if (dst.empty())
+ return;
+ copy_make_border_image(src, dst, top, left, type, v);
+ }
+ else if (src.dims == 3)
+ {
+ int channels = src.c;
+ dst.create(w, h, channels);
+ if (dst.empty())
+ return;
+
+ if (src.data != dst.data)
+ {
+// unroll image channel
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const Mat m = src.channel(q);
+ Mat borderm = dst.channel(q);
+
+ copy_make_border_image(m, borderm, top, left, type, v);
+ }
+ }
+ else
+ {
+#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_)
+ for (int q = channels - 1; q >= 0; q--)
+ {
+ Mat m = src.channel(q);
+ Mat borderm = dst.channel(q);
+ copy_make_border_image_inplace(m, borderm, top, left, type, v);
+ }
+#else
+// unroll image channel
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const Mat m = src.channel(q);
+ Mat borderm = dst.channel(q);
+
+ copy_make_border_image(m, borderm, top, left, type, v);
+ }
+#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_
+ }
+ }
+}
+
+static void copy_cut_border_image(const Mat &src, Mat &dst, int top, int left)
+{
+ int w = dst.w;
+ int h = dst.h;
+
+ const float *ptr = src.row(top) + left; //.data + src.w * top + left;
+ float *outptr = dst; //.data;
+
+ for (int y = 0; y < h; y++)
+ {
+ if (w < 12)
+ {
+ for (int x = 0; x < w; x++)
+ {
+ outptr[x] = ptr[x];
+ }
+ }
+ else
+ {
+ memcpy(outptr, ptr, w * sizeof(float));
+ }
+ outptr += w;
+ ptr += src.w;
+ }
+}
+
+void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right)
+{
+ int w = src.w - left - right;
+ int h = src.h - top - bottom;
+
+#ifndef _MEMORY_TO_TIME_
+ if (w == src.w && h == src.h)
+ {
+ dst = src;
+ return;
+ }
+#endif
+
+ if (src.dims == 2)
+ {
+ dst.create(w, h);
+ if (dst.empty())
+ return;
+
+ copy_cut_border_image(src, dst, top, left);
+ }
+ else if (src.dims == 3)
+ {
+ int channels = src.c;
+
+ dst.create(w, h, channels);
+ if (dst.empty())
+ return;
+
+#if !defined(_MEMORY_TO_TIME_) || !defined(_TIME_TO_MEMORY_)
+// unroll image channel
+#pragma omp parallel for
+#endif
+ for (int q = 0; q < channels; q++)
+ {
+ const Mat m = src.channel(q);
+ Mat cutm = dst.channel(q);
+
+ copy_cut_border_image(m, cutm, top, left);
+ }
+ }
+}
+
+static void resize_bilinear_image(const Mat &src, Mat &dst, int w, int h)
+{
+ double scale_x = (double)src.w / w;
+ double scale_y = (double)src.h / h;
+
+ int *buf = new int[w + h + w * 2 + h * 2];
+
+ int *xofs = buf; // new int[w];
+ int *yofs = buf + w; // new int[h];
+
+ float *alpha = (float *)(buf + w + h); // new float[w * 2];
+ float *beta = (float *)(buf + w + h + w * 2); // new float[h * 2];
+
+ float fx;
+ float fy;
+ int sx;
+ int sy;
+
+ for (int dx = 0; dx < w; dx++)
+ {
+ fx = (float)((dx + 0.5) * scale_x - 0.5);
+ sx = fx; // cvFloor(fx);
+ fx -= sx;
+
+ if (sx >= src.w - 1)
+ {
+ sx = src.w - 2;
+ fx = 1.f;
+ }
+
+ xofs[dx] = sx;
+
+ alpha[dx * 2] = 1.f - fx;
+ alpha[dx * 2 + 1] = fx;
+ }
+
+ for (int dy = 0; dy < h; dy++)
+ {
+ fy = (float)((dy + 0.5) * scale_y - 0.5);
+ sy = fy; // cvFloor(fy);
+ fy -= sy;
+
+ if (sy >= src.h - 1)
+ {
+ sy = src.h - 2;
+ fy = 1.f;
+ }
+
+ yofs[dy] = sy;
+
+ beta[dy * 2] = 1.f - fy;
+ beta[dy * 2 + 1] = fy;
+ }
+
+ // loop body
+ Mat rowsbuf0(w + 1);
+ Mat rowsbuf1(w + 1);
+ float *rows0 = rowsbuf0;
+ float *rows1 = rowsbuf1;
+
+ int prev_sy1 = -1;
+
+ for (int dy = 0; dy < h; dy++)
+ {
+ int sy = yofs[dy];
+
+ if (sy == prev_sy1)
+ {
+ // hresize one row
+ float *rows0_old = rows0;
+ rows0 = rows1;
+ rows1 = rows0_old;
+ const float *S1 = src.row(sy + 1);
+
+ const float *alphap = alpha;
+ float *rows1p = rows1;
+ int dx = 0;
+#if __ARM_NEON
+ for (; dx + 1 < w; dx += 2)
+ {
+ int sx = xofs[dx];
+ int sxn = xofs[dx + 1];
+ const float *S1p = S1 + sx;
+ const float *S1np = S1 + sxn;
+
+ float32x4_t _a = vld1q_f32(alphap);
+ float32x2_t _S1 = vld1_f32(S1p);
+ float32x2_t _S1n = vld1_f32(S1np);
+
+ float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
+ float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
+ float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
+
+ vst1_f32(rows1p + dx, _rows1);
+
+ alphap += 4;
+ }
+#endif // __ARM_NEON
+ for (; dx < w; dx++)
+ {
+ int sx = xofs[dx];
+ const float *S1p = S1 + sx;
+
+ float a0 = alphap[0];
+ float a1 = alphap[1];
+ rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
+
+ alphap += 2;
+ }
+ }
+ else
+ {
+ // hresize two rows
+ const float *S0 = src.row(sy);
+ const float *S1 = src.row(sy + 1);
+
+ const float *alphap = alpha;
+ float *rows0p = rows0;
+ float *rows1p = rows1;
+ int dx = 0;
+#if __ARM_NEON
+ for (; dx + 1 < w; dx += 2)
+ {
+ int sx = xofs[dx];
+ int sxn = xofs[dx + 1];
+ const float *S0p = S0 + sx;
+ const float *S1p = S1 + sx;
+ const float *S0np = S0 + sxn;
+ const float *S1np = S1 + sxn;
+
+ float32x4_t _a = vld1q_f32(alphap);
+ float32x2_t _S0 = vld1_f32(S0p);
+ float32x2_t _S1 = vld1_f32(S1p);
+ float32x2_t _S0n = vld1_f32(S0np);
+ float32x2_t _S1n = vld1_f32(S1np);
+
+ float32x4_t _S0S0n = vcombine_f32(_S0, _S0n);
+ float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
+ float32x4_t _ms0 = vmulq_f32(_S0S0n, _a);
+ float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
+ float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0));
+ float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
+
+ vst1_f32(rows0p + dx, _rows0);
+ vst1_f32(rows1p + dx, _rows1);
+
+ alphap += 4;
+ }
+#endif // __ARM_NEON
+ for (; dx < w; dx++)
+ {
+ int sx = xofs[dx];
+ const float *S0p = S0 + sx;
+ const float *S1p = S1 + sx;
+
+ float a0 = alphap[0];
+ float a1 = alphap[1];
+ rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
+ rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
+
+ alphap += 2;
+ }
+ }
+
+ prev_sy1 = sy + 1;
+
+ // vresize
+ float b0 = beta[0];
+ float b1 = beta[1];
+
+ float *rows0p = rows0;
+ float *rows1p = rows1;
+ float *Dp = dst.row(dy);
+
+#if __ARM_NEON
+ int nn = w >> 3;
+#else
+ int nn = 0;
+#endif
+ int remain = w - (nn << 3);
+
+#if __ARM_NEON
+ float32x4_t _b0 = vdupq_n_f32(b0);
+ float32x4_t _b1 = vdupq_n_f32(b1);
+ for (; nn > 0; nn--)
+ {
+ float32x4_t _rows0 = vld1q_f32(rows0p);
+ float32x4_t _rows1 = vld1q_f32(rows1p);
+
+ float32x4_t _D = vmulq_f32(_rows0, _b0);
+ _D = vmlaq_f32(_D, _rows1, _b1);
+
+ vst1q_f32(Dp, _D);
+
+ float32x4_t _rows0n = vld1q_f32(rows0p + 4);
+ float32x4_t _rows1n = vld1q_f32(rows1p + 4);
+
+ float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
+ _Dn = vmlaq_f32(_Dn, _rows1n, _b1);
+
+ vst1q_f32(Dp + 4, _Dn);
+
+ Dp += 8;
+ rows0p += 8;
+ rows1p += 8;
+ }
+#endif // __ARM_NEON
+ for (; remain; --remain)
+ {
+ // D[x] = rows0[x]*b0 + rows1[x]*b1;
+ *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
+ }
+
+ beta += 2;
+ }
+
+ delete[] buf;
+}
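+
+// Coordinate mapping used above: for output column dx the source position is
+//   fx = (dx + 0.5) * scale_x - 0.5   with scale_x = src.w / (double)w,
+// i.e. pixel centres are aligned. sx = (int)fx selects the left neighbour and
+// the fractional remainder becomes the weight pair (1 - fx, fx) stored in
+// alpha[]; rows are handled the same way through yofs[] and beta[].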
+
+void resize_bilinear(const Mat &src, Mat &dst, int w, int h)
+{
+ if (w == src.w && h == src.h)
+ {
+ dst = src;
+ return;
+ }
+
+ if (src.dims == 2)
+ {
+ dst.create(w, h);
+ if (dst.empty())
+ return;
+
+ resize_bilinear_image(src, dst, w, h);
+ }
+ else if (src.dims == 3)
+ {
+ int channels = src.c;
+
+ dst.create(w, h, channels);
+ if (dst.empty())
+ return;
+
+// unroll image channel
+#pragma omp parallel for
+ for (int q = 0; q < channels; q++)
+ {
+ const Mat m = src.channel(q);
+ Mat resizem = dst.channel(q);
+
+ resize_bilinear_image(m, resizem, w, h);
+ }
+ }
+}
+
+} // namespace ncnn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/common.h b/compute/ncnn/src/srcn/common.h
new file mode 100644
index 000000000..778a17a80
--- /dev/null
+++ b/compute/ncnn/src/srcn/common.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_COMMON_H__
+#define __NNFW_SRCN_COMMON_H__
+
+#include <string.h>
+#include <limits>
+#include <arm_neon.h>
+
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+#define sizeof_RhsScalar 4
+#define sizeof_LhsScalar 4
+#define sizeof_ResScalar 4
+
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+enum shardType_t
+{
+ shardByCol = 0,
+ shardByRow
+};
+
+#ifdef TIZEN
+#define L1_CACHE_SIZE (16536 * 2)
+#define L2_CACHE_SIZE (524288 * 2)
+#define L3_CACHE_SIZE (0) // no L3
+#define MAX_K (512)
+// single-thread
+#define GEN_COL (1440)
+// multi-threads
+#define MAX_COL (90)
+#define MIN_COL (32)
+#elif defined ANDROID
+#define L1_CACHE_SIZE (16536 * 4)
+#define L2_CACHE_SIZE (524288 * 8)
+#define L3_CACHE_SIZE (0) //(524288 * 8) //no L3
+#define MAX_K (512 * 2)
+// single-thread
+#define GEN_COL (1440)
+// multi-threads
+#if __aarch64__
+#define MAX_COL (1024)
+#else
+#define MAX_COL (90)
+#endif
+#define MIN_COL (32)
+#endif
+
+enum
+{
+ USE_COMMON_KENEL = 0,
+ USE_12BIT_KERNEL,
+ USE_NONZERO_KERENL
+};
+
+template <typename T> static T divup(const T &x, const T &y)
+{
+ return static_cast<T>((x + y - 1) / y);
+}
+
+#ifdef NCNN
+static inline size_t alignSize(size_t sz, int n) { return (sz + n - 1) / n * n; }
+
+static inline size_t alignBy2(size_t sz) { return (sz + 1) & -2; }
+#endif // NCNN
+
+static inline int32_t BitNot(int32_t a) { return ~a; }
+
+static inline int32_t MaskIfNonZero(int32_t a)
+{
+ static int32_t zero = 0;
+ return a ? BitNot(zero) : zero;
+}
+
+static inline int32_t BitAnd(int32_t a, int32_t b) { return a & b; }
+
+static inline int32_t ShiftRight(int32_t a, int offset) { return a >> offset; }
+
+static inline int32_t MaskIfLessThan(int32_t a, int32_t b) { return MaskIfNonZero(a < b); }
+
+static inline int32_t MaskIfGreaterThan(int32_t a, int32_t b) { return MaskIfNonZero(a > b); }
+
+static inline int32_t Add(int32_t a, int32_t b) { return a + b; }
+
+static inline int32_t RoundingDivideByPOT(int32_t x, int exponent)
+{
+ const int32_t mask = (1ll << exponent) - 1;
+ const int32_t zero = 0;
+ const int32_t one = 1;
+ const int32_t remainder = BitAnd(x, mask);
+ const int32_t threshold = Add(ShiftRight(mask, 1), BitAnd(MaskIfLessThan(x, zero), one));
+ return Add(ShiftRight(x, exponent), BitAnd(MaskIfGreaterThan(remainder, threshold), one));
+}
+static inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
+{
+ bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
+ int64_t a_64(a);
+ int64_t b_64(b);
+ int64_t ab_64 = a_64 * b_64;
+ int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
+ int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
+ return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
+}
+
+static inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
+ int shift)
+{
+ int left_shift = shift > 0 ? shift : 0;
+ int right_shift = shift > 0 ? 0 : -shift;
+ return RoundingDivideByPOT(
+ SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift);
+}
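+
+// The two helpers above implement the usual fixed-point requantization step:
+// for a Q0.31 multiplier M the result is approximately
+//   MultiplyByQuantizedMultiplier(x, M, shift) ~= round(x * M * 2^shift / 2^31),
+// i.e. x scaled by an arbitrary real factor. A sketch of how such an (M, shift)
+// pair can be derived from a real scale (the helper below is illustrative only,
+// not part of this file's API):
+#if 0
+#include <cmath>
+static void example_quantize_multiplier(double real_scale, int32_t *m, int *shift)
+{
+  // real_scale = q * 2^exp with q in [0.5, 1)
+  int exp = 0;
+  double q = std::frexp(real_scale, &exp);
+  int64_t q31 = static_cast<int64_t>(std::round(q * (1ll << 31)));
+  if (q31 == (1ll << 31)) // rounding can push q up to exactly 1.0
+  {
+    q31 /= 2;
+    ++exp;
+  }
+  *m = static_cast<int32_t>(q31); // Q0.31 multiplier in [2^30, 2^31)
+  *shift = exp;                   // negative whenever real_scale < 1
+}
+#endif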
+
+static inline int32x4_t SaturatingRoundingDoublingHighMulV(int32x4_t a, int32x4_t b)
+{
+ return vqrdmulhq_s32(a, b);
+}
+
+static inline int32x4_t RoundingDivideByPOTV(int32x4_t x, int exponent)
+{
+ const int32x4_t shift_vec = vdupq_n_s32(-exponent);
+ const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+ const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+ return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
+static inline int32x4_t MultiplyByQuantizedMultiplierV(int32x4_t x, int32_t quantized_multiplier,
+ int shift)
+{
+ int left_shift = shift > 0 ? shift : 0;
+ int right_shift = shift > 0 ? 0 : -shift;
+ return RoundingDivideByPOTV(
+ SaturatingRoundingDoublingHighMulV(vrshlq_s32(x, vdupq_n_s32(left_shift)),
+ vdupq_n_s32(quantized_multiplier)),
+ right_shift);
+}
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_COMMON_H__
diff --git a/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc
new file mode 100644
index 000000000..21083f677
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "conv_sgemm_multithreads.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void conv_sgemm_multithreads::param_init()
+{
+#if __aarch64__
+ if (conv_type_ == row_major)
+ {
+ mr_ = 8;
+ nr_ = 12;
+ }
+ else if (conv_type_ == col_major)
+ {
+#ifdef BATCH_DILATION_FIX
+ if (out_mat_.n > 1)
+ {
+
+ mr_ = 24;
+ nr_ = 4;
+ }
+ else
+#endif // BATCH_DILATION_FIX
+ {
+ if (m_ > n_)
+ {
+ mr_ = 24;
+ nr_ = 4;
+ }
+ else
+ {
+ mr_ = 12;
+ nr_ = 8;
+ }
+ }
+ }
+#else // __aarch64__
+ if (conv_type_ == row_major)
+ {
+ mr_ = 6;
+ nr_ = 8;
+ }
+ else if (conv_type_ == col_major)
+ {
+ mr_ = 8;
+ nr_ = 6;
+ }
+#endif // __aarch64__
+ int col = n_;
+
+ if (m_ > n_)
+ {
+ shard_type_ = shardByRow;
+ col = m_;
+ }
+ else
+ {
+ shard_type_ = shardByCol;
+ }
+
+ int th_base = divup(col, num_threads_);
+
+ th_base = MIN(MAX(th_base, MIN_COL), MAX_COL);
+
+ int k_div = (nr_ * sizeof_RhsScalar);
+ int k_sub = (mr_ * nr_ * sizeof_ResScalar);
+
+ const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div * 2), MAX_K);
+ bk_ = MIN(k_cache, k_);
+
+ if (shard_type_ == shardByCol)
+ {
+ int m_sub = (bk_ * nr_ * sizeof_RhsScalar);
+ int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ if (L3_CACHE_SIZE)
+ m_div = (sizeof_LhsScalar * bk_ * 2);
+ int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div);
+ bm_ = MIN(m_cache, m_);
+
+ bn_ = MIN(th_base, n_);
+ if (L3_CACHE_SIZE)
+ {
+ int n_sub = (bk_ * bm_ * sizeof_RhsScalar);
+ int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ int n_cache = divup((L3_CACHE_SIZE - n_sub), n_div);
+ bn_ = MIN(n_cache, bn_);
+ }
+ }
+ else
+ {
+ int n_sub = (bk_ * mr_ * sizeof_LhsScalar);
+ int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ if (L3_CACHE_SIZE)
+ n_div = (sizeof_LhsScalar * bk_ * 2);
+ int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div);
+ bn_ = MIN(n_cache, n_);
+
+ bm_ = MIN(th_base, m_);
+ if (L3_CACHE_SIZE)
+ {
+ int m_sub = (bk_ * bn_ * sizeof_RhsScalar);
+ int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ int m_cache = divup((L3_CACHE_SIZE - m_sub), m_div);
+ bm_ = MIN(m_cache, bm_);
+ }
+ }
+
+ nm_ = divup(m_, bm_);
+ nn_ = divup(n_, bn_);
+ nk_ = divup(k_, bk_);
+
+ rm_ = m_ % bm_;
+ rn_ = n_ % bn_;
+ rk_ = k_ % bk_;
+}
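+
+// In short: bk_ is chosen so that an nr_-wide packed RHS panel of depth bk_
+// (plus an mr_ x nr_ accumulator tile) stays within L1, while bm_ and bn_ size
+// the packed LHS/RHS blocks against L2 (and L3 when present), shared across
+// num_threads_. nm_/nn_/nk_ are the tile counts along M/N/K and rm_/rn_/rk_
+// the edge remainders handled by the last tile in each dimension.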
+
+conv_sgemm_multithreads::conv_sgemm_multithreads(const convMat_t &in_mat,
+ const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, int num_threads,
+ convType_t conv_type)
+
+ : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param),
+ conv_type_(conv_type), num_threads_(num_threads)
+{
+ m_ = out_mat_.c;
+#ifdef NCNN
+#ifdef WITH_DPU
+ np_ = out_mat_.n * alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float));
+ n_ = (np_ + 1) / 2;
+#else // WITH_DPU
+ n_ = out_mat_.n * alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float));
+#endif // WITH_DPU
+#else // NCNN
+#ifdef WITH_DPU
+ np_ = out_mat_.n * out_mat_.w * out_mat_.h;
+ n_ = (np_ + 1) / 2;
+#else // WITH_DPU
+ n_ = out_mat_.n * out_mat_.w * out_mat_.h;
+#endif // WITH_DPU
+#endif // NCNN
+ k_ = in_param_.kernel_h * in_param_.kernel_w * in_mat.c;
+
+ param_init();
+
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ if (shard_type_ == shardByCol)
+ {
+ plhs_buffer_ = new float[lhs_stride * 1 * nm_];
+ prhs_buffer_ = new float[rhs_stride * num_threads_];
+ }
+ else
+ {
+ plhs_buffer_ = new float[lhs_stride * num_threads_];
+ prhs_buffer_ = new float[rhs_stride * 1 * nn_];
+ }
+
+  error_ = 0;
+  if (plhs_buffer_ == NULL || prhs_buffer_ == NULL)
+  {
+    error_ = 1;
+  }
+
+ if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 ||
+ in_param_.stride_h != 1 || in_param_.padding != 0)
+ {
+ need_im2col_ = 1;
+ }
+ else
+ {
+ need_im2col_ = 0;
+ }
+
+  omp_set_num_threads(num_threads_);
+}
+
+conv_sgemm_multithreads::~conv_sgemm_multithreads()
+{
+ if (plhs_buffer_)
+ delete[] plhs_buffer_;
+ if (prhs_buffer_)
+ delete[] prhs_buffer_;
+}
+
+void conv_sgemm_multithreads::run()
+{
+ if (error_)
+ return;
+
+ if (shard_type_ == shardByCol && conv_type_ == col_major)
+ {
+ compute_colmajor_colshard();
+ }
+ else if (shard_type_ == shardByRow && conv_type_ == col_major)
+ {
+ compute_colmajor_rowshard();
+ }
+ else if (shard_type_ == shardByCol && conv_type_ == row_major)
+ {
+ compute_rowmajor_colshard();
+ }
+ else if (shard_type_ == shardByRow && conv_type_ == row_major)
+ {
+ compute_rowmajor_rowshard();
+ }
+}
+
+void conv_sgemm_multithreads::compute_rowmajor_colshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
+ &plhs_buffer_[i * lhs_stride]);
+ }
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ int thread_num = omp_get_thread_num();
+ // float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num];
+ float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num];
+
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ else
+ {
+ _pack_rowmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ }
+ else
+ {
+#ifdef WITH_DPU
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, np_, &in_mat_.data[n_ + l * bk_ * np_ + j * bn_],
+ prhs_ptr);
+#else
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_],
+ prhs_ptr);
+#endif
+ }
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+#ifdef WITH_DPU
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride],
+ prhs_ptr, &out_mat_.data[n_ + i * bm_ * np_ + j * bn_],
+ l, np_, bk);
+#else // WITH_DPU
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride],
+ prhs_ptr, &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_,
+ bk);
+#endif // WITH_DPU
+ }
+ }
+ }
+}
+
+void conv_sgemm_multithreads::compute_rowmajor_rowshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ &prhs_buffer_[j * rhs_stride]);
+ }
+ else
+ {
+ _pack_rowmajor_image_rhs_batch(
+ nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), &prhs_buffer_[j * rhs_stride]);
+ }
+ }
+ else
+ {
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_],
+ &prhs_buffer_[j * rhs_stride]);
+ }
+ }
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ int thread_num = omp_get_thread_num();
+ float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num];
+
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
+ plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr,
+ &prhs_buffer_[j * rhs_stride],
+ &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+}
+
+void conv_sgemm_multithreads::compute_colmajor_colshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
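+  // Column-major counterpart of compute_rowmajor_colshard: same blocking, but using the
+  // column-major packing routines and macro kernel.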
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
+ &plhs_buffer_[i * lhs_stride]);
+ }
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ int thread_num = omp_get_thread_num();
+ float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num];
+
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ else
+ {
+ _pack_colmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ }
+ else
+ {
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_],
+ prhs_ptr);
+ }
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride],
+ prhs_ptr, &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_,
+ bk);
+ }
+ }
+ }
+}
+
+void conv_sgemm_multithreads::compute_colmajor_rowshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
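+  // Column-major counterpart of compute_rowmajor_rowshard.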
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ &prhs_buffer_[j * rhs_stride]);
+ }
+ else
+ {
+ _pack_colmajor_image_rhs_batch(
+ nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), &prhs_buffer_[j * rhs_stride]);
+ }
+ }
+ else
+ {
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_],
+ &prhs_buffer_[j * rhs_stride]);
+ }
+ }
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ int thread_num = omp_get_thread_num();
+ float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num];
+
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
+ plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr,
+ &prhs_buffer_[j * rhs_stride],
+ &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/conv_sgemm_multithreads.h b/compute/ncnn/src/srcn/conv_sgemm_multithreads.h
new file mode 100644
index 000000000..9c9ce7437
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_sgemm_multithreads.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__
+#define __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+class conv_sgemm_multithreads
+{
+public:
+ conv_sgemm_multithreads(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, int num_threads, convType_t conv_type);
+ ~conv_sgemm_multithreads();
+
+ void run();
+
+private:
+ void param_init();
+
+ void compute_rowmajor_colshard();
+ void compute_rowmajor_rowshard();
+ void compute_colmajor_colshard();
+ void compute_colmajor_rowshard();
+
+ const convMat_t in_mat_;
+ const convMat_t weights_mat_;
+ convMat_t out_mat_;
+ const convParams_t in_param_;
+ convType_t conv_type_;
+ int num_threads_;
+
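+  // GEMM dimensions (m_ x k_ weights times k_ x n_ image), cache block sizes (bm_/bn_/bk_),
+  // remainder blocks (rm_/rn_/rk_), block counts (nm_/nn_/nk_) and the micro-kernel register
+  // tile (mr_ x nr_).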
+ int m_;
+ int n_;
+#ifdef WITH_DPU
+ int np_;
+#endif
+ int k_;
+
+ int bm_;
+ int bn_;
+ int bk_;
+
+ int rm_;
+ int rn_;
+ int rk_;
+
+ int nm_;
+ int nn_;
+ int nk_;
+
+ int mr_;
+ int nr_;
+
+ int need_im2col_;
+ shardType_t shard_type_;
+
+ float *prhs_buffer_;
+ float *plhs_buffer_;
+
+ int error_;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__
diff --git a/compute/ncnn/src/srcn/conv_sgemm_singlethread.cc b/compute/ncnn/src/srcn/conv_sgemm_singlethread.cc
new file mode 100644
index 000000000..4cbbf217f
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_sgemm_singlethread.cc
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdexcept>
+
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "conv_sgemm_singlethread.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
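+// Pick the shard direction from the output shape (shard by row when n_ > 3 * m_), choose the
+// micro-kernel tile mr_ x nr_ for the target architecture, then derive the cache block sizes
+// bk_/bm_/bn_ from the L1/L2/L3 budgets.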
+void conv_sgemm_singlethread::param_init()
+{
+ if (n_ > 3 * m_)
+ {
+ shard_type_ = shardByRow;
+ }
+ else
+ {
+ shard_type_ = shardByCol;
+ }
+
+#if __aarch64__
+ if (conv_type_ == row_major)
+ {
+ if (shard_type_ == shardByRow)
+ {
+ mr_ = 8;
+ nr_ = 12;
+ }
+ else
+ {
+ mr_ = 12;
+ nr_ = 8;
+ }
+ }
+ else if (conv_type_ == col_major)
+ {
+#ifndef BATCH_DILATION_FIX
+ mr_ = 12;
+ nr_ = 8;
+#else // BATCH_DILATION_FIX
+ // TODO: batch(dilation) + inw * inh
+ if (out_mat_.n > 1)
+ {
+ mr_ = 24;
+ nr_ = 4;
+ }
+ else
+ {
+ mr_ = 12;
+ nr_ = 8;
+ }
+#endif // BATCH_DILATION_FIX
+ }
+#else // __aarch64__
+ if (conv_type_ == row_major)
+ {
+ mr_ = 6;
+ nr_ = 8;
+ }
+ else if (conv_type_ == col_major)
+ {
+ mr_ = 8;
+ nr_ = 6;
+ }
+#endif // __aarch64__
+
+ int k_div = (nr_ * sizeof_RhsScalar);
+ int k_sub = (mr_ * nr_ * sizeof_ResScalar);
+
+ const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div), MAX_K);
+ bk_ = MIN(k_cache, k_);
+
+ if (shard_type_ == shardByCol)
+ {
+ int m_sub = (bk_ * nr_ * sizeof_RhsScalar);
+ int m_cache = divup((L2_CACHE_SIZE - m_sub), (sizeof_LhsScalar * bk_ * 2));
+ bm_ = MIN(m_cache, m_);
+
+ bn_ = MIN(GEN_COL, n_);
+ if (L3_CACHE_SIZE)
+ {
+ int n_sub = (bk_ * bm_ * sizeof_RhsScalar);
+ int n_cache = divup((L3_CACHE_SIZE - n_sub), (sizeof_LhsScalar * bk_ * 2));
+ bn_ = MIN(n_cache, bn_);
+ }
+ }
+ else
+ {
+ int n_sub = (bk_ * mr_ * sizeof_RhsScalar);
+ int n_cache = divup((L2_CACHE_SIZE - n_sub), (sizeof_LhsScalar * bk_ * 2));
+ bn_ = MIN(n_cache, n_);
+
+ bm_ = MIN(GEN_COL, m_);
+ if (L3_CACHE_SIZE)
+ {
+ int m_sub = (bk_ * bn_ * sizeof_RhsScalar);
+ int m_cache = divup((L3_CACHE_SIZE - m_sub), (sizeof_LhsScalar * bk_ * 2));
+ bm_ = MIN(m_cache, bm_);
+ }
+ }
+
+ nm_ = divup(m_, bm_);
+ nn_ = divup(n_, bn_);
+ nk_ = divup(k_, bk_);
+
+ rm_ = m_ % bm_;
+ rn_ = n_ % bn_;
+ rk_ = k_ % bk_;
+}
+
+conv_sgemm_singlethread::conv_sgemm_singlethread(const convMat_t &in_mat,
+ const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, convType_t conv_type)
+ : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param),
+ conv_type_(conv_type)
+{
+ m_ = out_mat_.c;
+#ifdef NCNN
+ n_ = out_mat_.n * alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float));
+#else
+ n_ = out_mat_.n * out_mat_.w * out_mat_.h;
+#endif
+ k_ = in_param_.kernel_h * in_param_.kernel_w * in_mat.c;
+
+ param_init();
+
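+  // im2col is skipped only for 1x1, stride-1, unpadded, single-batch convolutions, where the
+  // input feature map can be used directly as the GEMM right-hand side.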
+ if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 ||
+ in_param_.stride_h != 1 || in_param_.padding != 0 || out_mat_.n > 1)
+ {
+ need_im2col_ = 1;
+ }
+ else
+ {
+ need_im2col_ = 0;
+ }
+}
+
+conv_sgemm_singlethread::~conv_sgemm_singlethread() {}
+
+void conv_sgemm_singlethread::run()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float *plhs_ptr = new float[mstride * bk_];
+ float *prhs_ptr = new float[nstride * bk_];
+
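+  // One packed LHS panel and one packed RHS panel are reused in turn; the loop order follows
+  // the shard type so that the operand packed outside the innermost loop stays resident
+  // across it.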
+ if (conv_type_ == row_major)
+ {
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ else
+ {
+ _pack_rowmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ }
+ else
+ {
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_],
+ prhs_ptr);
+ }
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
+ plhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
+ plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ else
+ {
+ _pack_rowmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ }
+ else
+ {
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_],
+ prhs_ptr);
+ }
+
+ _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+      throw std::runtime_error{"Invalid shard type!"};
+ }
+ }
+ else if (conv_type_ == col_major)
+ {
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ else
+ {
+ _pack_colmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ }
+ else
+ {
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_],
+ prhs_ptr);
+ }
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
+ plhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
+ plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ if (need_im2col_)
+ {
+ if (out_mat_.n == 1)
+ {
+ _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ else
+ {
+ _pack_colmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_,
+ const_cast<convMat_t *>(&in_mat_), &out_mat_,
+ const_cast<convParams_t *>(&in_param_), prhs_ptr);
+ }
+ }
+ else
+ {
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_],
+ prhs_ptr);
+ }
+
+ _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+      throw std::runtime_error{"Invalid shard type!"};
+ }
+ }
+ else
+ {
+    throw std::runtime_error{"Invalid conv type!"};
+ }
+
+ delete[] plhs_ptr;
+ delete[] prhs_ptr;
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/conv_sgemm_singlethread.h b/compute/ncnn/src/srcn/conv_sgemm_singlethread.h
new file mode 100644
index 000000000..63f8b6e66
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_sgemm_singlethread.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__
+#define __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+class conv_sgemm_singlethread
+{
+public:
+ conv_sgemm_singlethread(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, convType_t conv_type);
+ ~conv_sgemm_singlethread();
+
+ void run();
+
+private:
+ void param_init();
+
+ const convMat_t in_mat_;
+ const convMat_t weights_mat_;
+ convMat_t out_mat_;
+ const convParams_t in_param_;
+ convType_t conv_type_;
+
+ int m_;
+ int n_;
+ int k_;
+
+ int bm_;
+ int bn_;
+ int bk_;
+
+ int rm_;
+ int rn_;
+ int rk_;
+
+ int nm_;
+ int nn_;
+ int nk_;
+
+ int mr_;
+ int nr_;
+
+ int need_im2col_;
+
+ shardType_t shard_type_;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__
diff --git a/compute/ncnn/src/srcn/conv_sparse.cc b/compute/ncnn/src/srcn/conv_sparse.cc
new file mode 100644
index 000000000..10e2a2b93
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_sparse.cc
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <stdexcept>
+
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "conv_sparse.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void conv_sparse::param_init()
+{
+#ifdef NCNN
+ n_ = alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float));
+#else
+ n_ = out_mat_.w * out_mat_.h;
+#endif
+
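+  // Output channels are processed in blocks of BCH; the column block bn_ is bounded by the L1
+  // budget for one packed image row and by the per-thread share of half of the L2 cache.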
+ bch_ = BCH;
+ nch_ = (out_mat_.c + bch_ - 1) / bch_;
+
+ rch_ = out_mat_.c % bch_;
+
+ bn_ = MIN(n_, L1_CACHE_SIZE / (sizeof(float) * 2));
+ bn_ = MIN(bn_, (L2_CACHE_SIZE / 2 - bch_ * sizeof(weight_data_t)) / ((bch_ + 1) * sizeof(float)) /
+ num_threads_);
+ nn_ = (n_ + bn_ - 1) / bn_;
+ rn_ = n_ % bn_;
+
+ if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 ||
+ in_param_.stride_h != 1 || in_param_.padding != 0)
+ {
+ need_im2col_ = 1;
+ }
+ else
+ {
+ need_im2col_ = 0;
+ }
+}
+
+conv_sparse::conv_sparse(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param,
+ const sparse_weight_t *weights, int num_threads, convType_t conv_type)
+ : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), weights_(weights),
+ num_threads_(num_threads), conv_type_(conv_type)
+{
+ param_init();
+}
+
+conv_sparse::~conv_sparse() {}
+
+void conv_sparse::compute_singlethread()
+{
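+  // Sparse weights are stored as (m, k, value) triples grouped per output-channel block; for
+  // each column block of the image, every non-zero weight combines one input row (index k),
+  // scaled by its value, into one output row (index m).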
+ if (need_im2col_)
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+ float prhs_ptr[bn_];
+
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+          // Why is n_ = 64 x 64 so much slower on Tizen?
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+          // Why is n_ = 64 x 64 so much slower on Tizen?
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+}
+
+void conv_sparse::compute_multithreads()
+{
+ omp_set_num_threads(num_threads_);
+
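+  // Parallelize over output-channel blocks when there are enough of them (nch_ >= threads or
+  // nch_ >= nn_); otherwise parallelize over the column blocks inside each channel block.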
+ if (nch_ >= num_threads_ || nch_ >= nn_)
+ {
+ if (need_im2col_)
+ {
+#pragma omp parallel for
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+ float prhs_ptr[bn_];
+
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+#pragma omp parallel for
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (need_im2col_)
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float prhs_ptr[bn];
+
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ }
+}
+
+void conv_sparse::run()
+{
+ if (num_threads_ == 1)
+ compute_singlethread();
+ else if (num_threads_ > 1)
+ compute_multithreads();
+ else
+ throw std::runtime_error{"Invalid thread number."};
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/conv_sparse.h b/compute/ncnn/src/srcn/conv_sparse.h
new file mode 100644
index 000000000..7ac358fd8
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_sparse.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_CONV_SPARSE_H__
+#define __NNFW_SRCN_CONV_SPARSE_H__
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+#define BCH 128
+
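+// One non-zero weight entry: output row m, input row k, and the weight value.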
+typedef struct
+{
+ short m;
+ short k;
+ float data;
+} weight_data_t;
+
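+// All non-zero entries of one output-channel block; mxk is the number of entries in wdata.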
+typedef struct
+{
+ int mxk;
+ weight_data_t *wdata;
+} sparse_weight_t;
+
+class conv_sparse
+{
+public:
+ conv_sparse(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param,
+ const sparse_weight_t *weights, int num_threads, convType_t conv_type);
+ ~conv_sparse();
+
+ void run();
+
+private:
+ void param_init();
+ void compute_singlethread();
+ void compute_multithreads();
+
+ const convMat_t in_mat_;
+ convMat_t out_mat_;
+ const convParams_t in_param_;
+ const sparse_weight_t *weights_;
+ int num_threads_;
+ convType_t conv_type_;
+
+ uint32_t n_;
+ uint32_t bn_;
+ int rn_;
+ int nn_;
+
+ int bch_;
+ int rch_;
+ int nch_;
+
+ int need_im2col_;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_CONV_SPARSE_H__
diff --git a/compute/ncnn/src/srcn/conv_winograd.cc b/compute/ncnn/src/srcn/conv_winograd.cc
new file mode 100644
index 000000000..69649ea2a
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_winograd.cc
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.h"
+#include "conv_winograd.h"
+
+namespace std
+{
+template <typename Dtype> static inline Dtype max(Dtype a, Dtype b)
+{
+ if (a > b)
+ return a;
+ else
+ return b;
+}
+}
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void conv_winograd::param_init()
+{
+ if ((in_param_.kernel_w != in_param_.kernel_h) || (in_param_.stride_w != in_param_.stride_h) ||
+ (in_param_.kernel_w != 3 && in_param_.kernel_w != 5) || (in_param_.stride_w != 1) ||
+ (!winograd_weight_))
+ {
+ error_ = 1;
+ return;
+ }
+
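+  // F(M - N + 1, N) Winograd tiling: each M x M input tile yields an (M - N + 1) x (M - N + 1)
+  // output tile, and ntiles_h_ x ntiles_w_ tiles cover the padded input and the output plane.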
+ int M, N;
+ const int w = in_mat_.w;
+ const int h = in_mat_.h;
+ const int outw = out_mat_.w;
+ const int outh = out_mat_.h;
+ const int pad_w = in_param_.pad_w;
+ const int pad_h = in_param_.pad_h;
+
+ if (in_param_.kernel_w == 3)
+ {
+ M = winograd_para_3x3s1::M;
+ N = winograd_para_3x3s1::N;
+ }
+ else
+ {
+ M = winograd_para_5x5s1::M;
+ N = winograd_para_5x5s1::N;
+ }
+
+ tile_h_in_ = tile_w_in_ = M;
+ tile_h_out_ = tile_h_in_ - N + 1;
+ tile_w_out_ = tile_w_in_ - N + 1;
+ ntiles_h_ = (std::max(h + pad_h - tile_h_in_ + 1, outh) + tile_h_out_ - 1) / tile_h_out_;
+ ntiles_w_ = (std::max(w + pad_w - tile_w_in_ + 1, outw) + tile_w_out_ - 1) / tile_w_out_;
+
+ error_ = 0;
+}
+
+conv_winograd::conv_winograd(const convMat_t &in_mat, convMat_t &out_mat,
+ const convParams_t &in_param, convType_t conv_type,
+ const float *winograd_weight, int num_threads, int inc_stride,
+ int outc_stride, int c_stride)
+ : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), conv_type_(conv_type),
+ winograd_weight_(winograd_weight), num_threads_(num_threads), inc_stride_(inc_stride),
+ outc_stride_(outc_stride), c_stride_(c_stride)
+{
+ param_init();
+}
+
+conv_winograd::~conv_winograd() {}
+
+void conv_winograd::compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans,
+ const int m, const int n, const int k, const float *lhs_data,
+ const float *rhs_data, float *res_data)
+{
+ class sgemm_singlethread sgemm(major_type, ltrans, rtrans, m, n, k, lhs_data, rhs_data, res_data,
+ num_threads_);
+
+ sgemm.run();
+}
+
+void conv_winograd::winograd_input_im2col(float *col_buff)
+{
+ const int w = in_mat_.w;
+ const int h = in_mat_.h;
+ const float *data = in_mat_.data;
+ const int channels = in_mat_.c;
+ const int pad_w = in_param_.pad_w;
+ const int pad_h = in_param_.pad_h;
+
+ if (conv_type_ == row_major)
+ {
+#ifdef NCNN
+ const int n = alignSize(inc_stride_, 16 / sizeof(float));
+#else // NCNN
+ const int n = inc_stride_;
+#endif // NCNN
+ for (int c = 0; c < channels; ++c)
+ {
+ for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h)
+ {
+ for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w)
+ {
+ for (int y = 0; y < tile_h_in_; ++y)
+ {
+ for (int x = 0; x < tile_w_in_; ++x)
+ {
+ int in_y = tile_h * tile_h_out_ + y - pad_h;
+ int in_x = tile_w * tile_w_out_ + x - pad_w;
+
+ if (in_y < 0 || in_x < 0 || in_y >= h || in_x >= w)
+ {
+ col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) *
+ tile_w_in_ +
+ x] = 0;
+ }
+ else
+ {
+ col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) *
+ tile_w_in_ +
+ x] = data[c * n + in_y * w + in_x];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ else if (conv_type_ == col_major)
+ {
+ for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h)
+ {
+ for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w)
+ {
+ for (int y = 0; y < tile_h_in_; ++y)
+ {
+ for (int x = 0; x < tile_w_in_; ++x)
+ {
+ for (int c = 0; c < channels; ++c)
+ {
+ int in_y = tile_h * tile_h_out_ + y - pad_h;
+ int in_x = tile_w * tile_w_out_ + x - pad_w;
+
+ if (in_y < 0 || in_x < 0 || in_y >= h || in_x >= w)
+ {
+ col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) *
+ tile_w_in_ +
+ x] = 0;
+ }
+ else
+ {
+ col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) *
+ tile_w_in_ +
+ x] = data[c + (in_y * w + in_x) * channels];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void conv_winograd::winograd_output_col2im(const float *col_buff)
+{
+ int outh = out_mat_.h;
+ int outw = out_mat_.w;
+ float *data = out_mat_.data;
+ int channels = out_mat_.c;
+
+ if (conv_type_ == row_major)
+ {
+#ifdef NCNN
+ const int n = alignSize(outc_stride_, 16 / sizeof(float));
+#else // NCNN
+ const int n = outc_stride_;
+#endif // NCNN
+ for (int c = 0; c < channels; ++c)
+ {
+ for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h)
+ {
+ for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w)
+ {
+ for (int y = 0; y < tile_h_out_; ++y)
+ {
+ for (int x = 0; x < tile_w_out_; ++x)
+ {
+ int out_y = tile_h * tile_h_out_ + y;
+ int out_x = tile_w * tile_w_out_ + x;
+ if (out_y < outh && out_x < outw)
+ {
+ data[c * n + out_y * outw + out_x] =
+ col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_out_ + y) *
+ tile_w_out_ +
+ x];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ else if (conv_type_ == col_major)
+ {
+ for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h)
+ {
+ for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w)
+ {
+ for (int y = 0; y < tile_h_out_; ++y)
+ {
+ for (int x = 0; x < tile_w_out_; ++x)
+ {
+ for (int c = 0; c < channels; ++c)
+ {
+ int out_y = tile_h * tile_h_out_ + y;
+ int out_x = tile_w * tile_w_out_ + x;
+ if (out_y < outh && out_x < outw)
+ {
+ data[c + (out_y * outw + out_x) * c_stride_] =
+ col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_out_ + y) *
+ tile_w_out_ +
+ x];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void conv_winograd::compute_winograd()
+{
+ // const int w = in_mat_.w;
+ // const int h = in_mat_.h;
+ const int inch = in_mat_.c;
+ // const int outw = out_mat_.w;
+ // const int outh = out_mat_.h;
+ const int outch = out_mat_.c;
+ const int kernel_size = in_param_.kernel_w;
+
+ int M, N;
+ const double *A;
+ const double *B;
+
+ if (kernel_size == 3)
+ {
+ M = winograd_para_3x3s1::M;
+ N = winograd_para_3x3s1::N;
+ B = winograd_para_3x3s1::getB();
+ A = winograd_para_3x3s1::getA();
+ }
+ else
+ {
+ M = winograd_para_5x5s1::M;
+ N = winograd_para_5x5s1::N;
+ B = winograd_para_5x5s1::getB();
+ A = winograd_para_5x5s1::getA();
+ }
+
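+  /*Step 1: the weight transform is precomputed elsewhere; winograd_weight_ is consumed
+    directly in step 3*/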
+  /*Step 2: transform the input image into the Winograd domain*/
+ float *col_buff =
+ new float[std::max(outch, inch) * ntiles_h_ * ntiles_w_ * tile_h_in_ * tile_w_in_];
+
+ int temp1_n = inch * ntiles_h_ * ntiles_w_;
+ float *temp1_ =
+ new float[tile_h_in_ * tile_w_in_ * std::max(outch, inch) * ntiles_h_ * ntiles_w_];
+
+ float *winograd_b = new float[M * M * M * M];
+
+ if ((NULL == col_buff) || (NULL == temp1_) || (NULL == winograd_b))
+ {
+ delete[] col_buff;
+ delete[] temp1_;
+ delete[] winograd_b;
+ return;
+ }
+
+ winograd_input_im2col(col_buff);
+
+ kronecker_product(winograd_b, B, B, M, M, M, M);
+
+ compute_sgemm(rowMajor, trans, trans, tile_h_in_ * tile_w_in_, temp1_n, tile_h_in_ * tile_w_in_,
+ winograd_b, col_buff, temp1_);
+
+ delete[] winograd_b;
+
+ /*Step 3: convolution in winograd domain*/
+ for (int j = 0; j < tile_h_in_ * tile_w_in_; ++j)
+ {
+ compute_sgemm(rowMajor, notrans, notrans, outch, ntiles_h_ * ntiles_w_, inch,
+ winograd_weight_ + j * c_stride_ * inch,
+ temp1_ + j * inch * ntiles_h_ * ntiles_w_,
+ col_buff + j * outch * ntiles_h_ * ntiles_w_);
+ }
+
+  /*Step 4: transform the result back to the spatial domain*/
+ float *winograd_a = new float[M * (M - N + 1) * M * (M - N + 1)];
+ if (NULL == winograd_a)
+ {
+ delete[] col_buff;
+ delete[] temp1_;
+ return;
+ }
+ kronecker_product(winograd_a, A, A, M, M - N + 1, M, M - N + 1);
+ compute_sgemm(rowMajor, trans, notrans, outch * ntiles_h_ * ntiles_w_, tile_h_out_ * tile_w_out_,
+ tile_h_in_ * tile_w_in_, col_buff, winograd_a, temp1_);
+ delete[] winograd_a;
+ delete[] col_buff;
+
+ winograd_output_col2im(temp1_);
+
+ delete[] temp1_;
+}
+
+void conv_winograd::run()
+{
+ if (error_)
+ return;
+
+ compute_winograd();
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/conv_winograd.h b/compute/ncnn/src/srcn/conv_winograd.h
new file mode 100644
index 000000000..76c2601f2
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_winograd.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_CONV_WINOGRAD_H__
+#define __NNFW_SRCN_CONV_WINOGRAD_H__
+
+#include "ncnn/srcn/conv_type.h"
+#include "winograd.h"
+#include "sgemm_singlethread.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+class conv_winograd
+{
+public:
+ conv_winograd(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param,
+ convType_t conv_type, const float *winograd_weight, int num_threads, int inc_stride,
+ int outc_stride, int c_stride);
+ ~conv_winograd();
+
+ void run();
+
+private:
+ void param_init();
+ void compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans, const int m,
+ const int n, const int k, const float *lhs_data, const float *rhs_data,
+ float *res_data);
+ void winograd_input_im2col(float *col_buff);
+ void winograd_output_col2im(const float *col_buff);
+ void compute_winograd();
+
+ const convMat_t in_mat_;
+ convMat_t out_mat_;
+ const convParams_t in_param_;
+ convType_t conv_type_;
+ const float *winograd_weight_;
+ const int num_threads_;
+
+ int tile_w_in_;
+ int tile_h_in_;
+ int tile_w_out_;
+ int tile_h_out_;
+ int ntiles_w_;
+ int ntiles_h_;
+
+ int inc_stride_;
+ int outc_stride_;
+ int c_stride_;
+
+ int error_;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_CONV_WINOGRAD_H__
diff --git a/compute/ncnn/src/srcn/conv_winograd_batch.cc b/compute/ncnn/src/srcn/conv_winograd_batch.cc
new file mode 100644
index 000000000..cba45c648
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_winograd_batch.cc
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.h"
+#include "conv_winograd_batch.h"
+
+namespace std
+{
+template <typename Dtype> static inline Dtype max(Dtype a, Dtype b)
+{
+ if (a > b)
+ return a;
+ else
+ return b;
+}
+}
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void conv_winograd_batch::param_init()
+{
+ if ((in_param_.kernel_w != in_param_.kernel_h) || (in_param_.stride_w != in_param_.stride_h) ||
+ (in_param_.kernel_w != 3 && in_param_.kernel_w != 5) || (in_param_.stride_w != 1) ||
+ (!winograd_weight_))
+ {
+ error_ = 1;
+ return;
+ }
+
+ int M, N;
+ const int w = in_mat_.w;
+ const int h = in_mat_.h;
+ const int outw = out_mat_.w;
+ const int outh = out_mat_.h;
+ const int pad_w = in_param_.pad_w;
+ const int pad_h = in_param_.pad_h;
+
+ if (in_param_.kernel_w == 3)
+ {
+ if (w == 4)
+ {
+ M = winograd_para_3x3s1_2::M;
+ N = winograd_para_3x3s1_2::N;
+ }
+ else
+ {
+ M = winograd_para_3x3s1::M;
+ N = winograd_para_3x3s1::N;
+ }
+ }
+ else
+ {
+ M = winograd_para_5x5s1::M;
+ N = winograd_para_5x5s1::N;
+ }
+
+ tile_h_in_ = tile_w_in_ = M;
+ tile_h_out_ = tile_h_in_ - N + 1;
+ tile_w_out_ = tile_w_in_ - N + 1;
+ ntiles_h_ = (std::max(h + pad_h - tile_h_in_ + 1, outh) + tile_h_out_ - 1) / tile_h_out_;
+ ntiles_w_ = (std::max(w + pad_w - tile_w_in_ + 1, outw) + tile_w_out_ - 1) / tile_w_out_;
+
+ error_ = 0;
+}
+
+conv_winograd_batch::conv_winograd_batch(const convMat_t &in_mat, convMat_t &out_mat,
+ const convParams_t &in_param, convType_t conv_type,
+ const float *winograd_weight, int num_threads)
+ : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), conv_type_(conv_type),
+ winograd_weight_(winograd_weight), num_threads_(num_threads)
+{
+ param_init();
+}
+
+conv_winograd_batch::~conv_winograd_batch() {}
+
+void conv_winograd_batch::compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans,
+ sgemmTrans_t rtrans, const int m, const int n, const int k,
+ const float *lhs_data, const float *rhs_data,
+ float *res_data)
+{
+ class sgemm_singlethread sgemm(major_type, ltrans, rtrans, m, n, k, lhs_data, rhs_data, res_data,
+ num_threads_);
+
+ sgemm.run();
+}
+
+void conv_winograd_batch::winograd_input_im2col(float *col_buff)
+{
+ const int w = in_mat_.w;
+ const int h = in_mat_.h;
+ const float *data = in_mat_.data;
+ const int channels = in_mat_.c;
+ const int batch = in_mat_.n;
+ const int pad_w = in_param_.pad_w;
+ const int pad_h = in_param_.pad_h;
+
+ // TODO: row_major
+ if (conv_type_ == col_major)
+ {
+ for (int n = 0; n < batch; n++)
+ {
+ for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h)
+ {
+ for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w)
+ {
+ for (int y = 0; y < tile_h_in_; ++y)
+ {
+ for (int x = 0; x < tile_w_in_; ++x)
+ {
+ for (int c = 0; c < channels; ++c)
+ {
+ int in_y = tile_h * tile_h_out_ + y - pad_h;
+ int in_x = tile_w * tile_w_out_ + x - pad_w;
+
+ if (in_y < 0 || in_x < 0 || in_y >= h || in_x >= w)
+ {
+ col_buff[((((c * batch + n) * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) *
+ tile_h_in_ +
+ y) *
+ tile_w_in_ +
+ x] = 0;
+ }
+ else
+ {
+ col_buff[((((c * batch + n) * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) *
+ tile_h_in_ +
+ y) *
+ tile_w_in_ +
+ x] = data[((n * h + in_y) * w + in_x) * channels + c];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void conv_winograd_batch::winograd_output_col2im(const float *col_buff)
+{
+ int outh = out_mat_.h;
+ int outw = out_mat_.w;
+ float *data = out_mat_.data;
+ int channels = out_mat_.c;
+ int batch = out_mat_.n;
+
+ // TODO: row_major
+ if (conv_type_ == col_major)
+ {
+ for (int n = 0; n < batch; n++)
+ {
+ for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h)
+ {
+ for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w)
+ {
+ for (int y = 0; y < tile_h_out_; ++y)
+ {
+ for (int x = 0; x < tile_w_out_; ++x)
+ {
+ for (int c = 0; c < channels; ++c)
+ {
+ int out_y = tile_h * tile_h_out_ + y;
+ int out_x = tile_w * tile_w_out_ + x;
+ if (out_y < outh && out_x < outw)
+ {
+ data[((n * outh + out_y) * outw + out_x) * channels + c] =
+ col_buff[((((c * batch + n) * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) *
+ tile_h_out_ +
+ y) *
+ tile_w_out_ +
+ x];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void conv_winograd_batch::compute_winograd()
+{
+ const int w = in_mat_.w;
+ // const int h = in_mat_.h;
+ const int inch = in_mat_.c;
+ // const int outw = out_mat_.w;
+ // const int outh = out_mat_.h;
+ const int outch = out_mat_.c;
+ const int kernel_size = in_param_.kernel_w;
+ const int batch = in_mat_.n;
+
+ int M, N;
+ const double *A;
+ const double *B;
+
+ if (kernel_size == 3)
+ {
+ if (w == 4)
+ {
+ M = winograd_para_3x3s1_2::M;
+ N = winograd_para_3x3s1_2::N;
+ B = winograd_para_3x3s1_2::getB();
+ A = winograd_para_3x3s1_2::getA();
+ }
+ else
+ {
+ M = winograd_para_3x3s1::M;
+ N = winograd_para_3x3s1::N;
+ B = winograd_para_3x3s1::getB();
+ A = winograd_para_3x3s1::getA();
+ }
+ }
+ else
+ {
+ M = winograd_para_5x5s1::M;
+ N = winograd_para_5x5s1::N;
+ B = winograd_para_5x5s1::getB();
+ A = winograd_para_5x5s1::getA();
+ }
+
+  /*Step 2: transform the input image into the Winograd domain*/
+ float *col_buff =
+ new float[std::max(outch, inch) * batch * ntiles_h_ * ntiles_w_ * tile_h_in_ * tile_w_in_];
+
+ int temp1_n = batch * inch * ntiles_h_ * ntiles_w_;
+ float *temp1_ =
+ new float[batch * tile_h_in_ * tile_w_in_ * std::max(outch, inch) * ntiles_h_ * ntiles_w_];
+
+ float *winograd_b = new float[M * M * M * M];
+
+ if ((NULL == col_buff) || (NULL == temp1_) || (NULL == winograd_b))
+ {
+ delete[] col_buff;
+ delete[] temp1_;
+ delete[] winograd_b;
+ return;
+ }
+
+ winograd_input_im2col(col_buff);
+
+ kronecker_product(winograd_b, B, B, M, M, M, M);
+
+ compute_sgemm(rowMajor, trans, trans, tile_h_in_ * tile_w_in_, temp1_n, tile_h_in_ * tile_w_in_,
+ winograd_b, col_buff, temp1_);
+ delete[] winograd_b;
+
+ /*Step 3: convolution in winograd domain*/
+ for (int j = 0; j < tile_h_in_ * tile_w_in_; ++j)
+ {
+ compute_sgemm(rowMajor, notrans, notrans, outch, batch * ntiles_h_ * ntiles_w_, inch,
+ winograd_weight_ + j * outch * inch,
+ temp1_ + j * batch * inch * ntiles_h_ * ntiles_w_,
+ col_buff + j * batch * outch * ntiles_h_ * ntiles_w_);
+ }
+
+  /*Step 4: transform the result back to the spatial domain*/
+ float *winograd_a = new float[M * (M - N + 1) * M * (M - N + 1)];
+ if (NULL == winograd_a)
+ {
+ delete[] col_buff;
+ delete[] temp1_;
+ return;
+ }
+
+ kronecker_product(winograd_a, A, A, M, M - N + 1, M, M - N + 1);
+ compute_sgemm(rowMajor, trans, notrans, batch * outch * ntiles_h_ * ntiles_w_,
+ tile_h_out_ * tile_w_out_, tile_h_in_ * tile_w_in_, col_buff, winograd_a, temp1_);
+ delete[] winograd_a;
+ delete[] col_buff;
+
+ winograd_output_col2im(temp1_);
+
+ delete[] temp1_;
+}
+
+void conv_winograd_batch::run()
+{
+ if (error_)
+ return;
+
+ compute_winograd();
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/conv_winograd_batch.h b/compute/ncnn/src/srcn/conv_winograd_batch.h
new file mode 100644
index 000000000..a022d9c52
--- /dev/null
+++ b/compute/ncnn/src/srcn/conv_winograd_batch.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__
+#define __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__
+
+#include "ncnn/srcn/conv_type.h"
+#include "winograd.h"
+#include "sgemm_singlethread.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+class conv_winograd_batch
+{
+public:
+ conv_winograd_batch(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param,
+ convType_t conv_type, const float *winograd_weight, int num_threads);
+ ~conv_winograd_batch();
+
+ void run();
+
+private:
+ void param_init();
+ void compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans, const int m,
+ const int n, const int k, const float *lhs_data, const float *rhs_data,
+ float *res_data);
+ void winograd_input_im2col(float *col_buff);
+ void winograd_output_col2im(const float *col_buff);
+ void compute_winograd();
+
+ const convMat_t in_mat_;
+ convMat_t out_mat_;
+ const convParams_t in_param_;
+ convType_t conv_type_;
+ const float *winograd_weight_;
+ const int num_threads_;
+
+ int tile_w_in_;
+ int tile_h_in_;
+ int tile_w_out_;
+ int tile_h_out_;
+ int ntiles_w_;
+ int ntiles_h_;
+
+ int error_;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__
diff --git a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc
new file mode 100644
index 000000000..f3ccf13e5
--- /dev/null
+++ b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "deconv_sgemm_multithreads.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void deconv_sgemm_multithreads::param_init()
+{
+#if __aarch64__
+ if (conv_type_ == row_major)
+ {
+ mr_ = 8;
+ nr_ = 12;
+ }
+ else if (conv_type_ == col_major)
+ {
+ mr_ = 12;
+ nr_ = 8;
+ }
+#else // __aarch64__
+ if (conv_type_ == row_major)
+ {
+ mr_ = 6;
+ nr_ = 8;
+ }
+ else if (conv_type_ == col_major)
+ {
+ mr_ = 8;
+ nr_ = 6;
+ }
+#endif // __aarch64__
+
+ int col = n_;
+
+ if (m_ > n_)
+ {
+ shard_type_ = shardByRow;
+ col = m_;
+ }
+ else
+ {
+ shard_type_ = shardByCol;
+ }
+
+ int th_base = divup(col, num_threads_);
+
+ th_base = MIN(MAX(th_base, MIN_COL), MAX_COL);
+
+ int k_div = (nr_ * sizeof_RhsScalar);
+ int k_sub = (mr_ * nr_ * sizeof_ResScalar);
+
+ const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div * 2), MAX_K);
+ bk_ = MIN(k_cache, k_);
+
+ if (shard_type_ == shardByCol)
+ {
+ int m_sub = (bk_ * nr_ * sizeof_RhsScalar);
+ int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ if (L3_CACHE_SIZE)
+ m_div = (sizeof_LhsScalar * bk_ * 2);
+ int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div);
+ bm_ = MIN(m_cache, m_);
+
+ bn_ = MIN(th_base, n_);
+ if (L3_CACHE_SIZE)
+ {
+ int n_sub = (bk_ * bm_ * sizeof_RhsScalar);
+ int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ int n_cache = divup((L3_CACHE_SIZE - n_sub), n_div);
+ bn_ = MIN(n_cache, bn_);
+ }
+ }
+ else
+ {
+ int n_sub = (bk_ * mr_ * sizeof_LhsScalar);
+ int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ if (L3_CACHE_SIZE)
+ n_div = (sizeof_LhsScalar * bk_ * 2);
+ int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div);
+ bn_ = MIN(n_cache, n_);
+
+ bm_ = MIN(th_base, m_);
+ if (L3_CACHE_SIZE)
+ {
+ int m_sub = (bk_ * bn_ * sizeof_RhsScalar);
+ int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
+ int m_cache = divup((L3_CACHE_SIZE - m_sub), m_div);
+ bm_ = MIN(m_cache, bm_);
+ }
+ }
+
+ nm_ = divup(m_, bm_);
+ nn_ = divup(n_, bn_);
+ nk_ = divup(k_, bk_);
+
+ rm_ = m_ % bm_;
+ rn_ = n_ % bn_;
+ rk_ = k_ % bk_;
+}
+
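+// Deconvolution as GEMM: the weights are packed transposed so that the
+// (kernel_h * kernel_w * out_c) x (input spatial) product can be scattered back to the output
+// image with col2im when need_col2im_ is set.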
+deconv_sgemm_multithreads::deconv_sgemm_multithreads(const convMat_t &in_mat,
+ const convMat_t &weights_mat,
+ convMat_t &out_mat,
+ const convParams_t &in_param, int num_threads,
+ convType_t conv_type)
+ : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param),
+ conv_type_(conv_type), num_threads_(num_threads)
+{
+ m_ = in_param_.kernel_h * in_param_.kernel_w * out_mat_.c;
+#ifdef NCNN
+ n_ = alignSize(in_mat_.h * in_mat_.w, 16 / sizeof(float));
+#else // NCNN
+ n_ = in_mat_.w * in_mat_.h;
+#endif // NCNN
+ k_ = in_mat.c;
+
+ param_init();
+
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ if (shard_type_ == shardByCol)
+ {
+ plhs_buffer_ = new float[lhs_stride * 1 * nm_];
+ prhs_buffer_ = new float[rhs_stride * num_threads_];
+ }
+ else
+ {
+ plhs_buffer_ = new float[lhs_stride * num_threads_];
+ prhs_buffer_ = new float[rhs_stride * 1 * nn_];
+ }
+
+ pres_buffer_ = new float[bm_ * bn_ * num_threads_];
+
+  error_ = 0;
+  if (plhs_buffer_ == NULL || prhs_buffer_ == NULL || pres_buffer_ == NULL)
+  {
+    error_ = 1;
+  }
+
+ if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 ||
+ in_param_.stride_h != 1 || in_param_.padding != 0)
+ {
+ need_col2im_ = 1;
+ }
+ else
+ {
+ need_col2im_ = 0;
+ }
+
+  omp_set_num_threads(num_threads_);
+}
+
+deconv_sgemm_multithreads::~deconv_sgemm_multithreads()
+{
+ if (plhs_buffer_)
+ delete[] plhs_buffer_;
+ if (prhs_buffer_)
+ delete[] prhs_buffer_;
+ if (pres_buffer_)
+ delete[] pres_buffer_;
+}
+
+void deconv_sgemm_multithreads::run()
+{
+ if (error_)
+ return;
+
+ if (shard_type_ == shardByCol && conv_type_ == col_major)
+ {
+ compute_colmajor_colshard();
+ }
+ else if (shard_type_ == shardByRow && conv_type_ == col_major)
+ {
+ compute_colmajor_rowshard();
+ }
+ else if (shard_type_ == shardByCol && conv_type_ == row_major)
+ {
+ compute_rowmajor_colshard();
+ }
+ else if (shard_type_ == shardByRow && conv_type_ == row_major)
+ {
+ compute_rowmajor_rowshard();
+ }
+}
+
+void deconv_sgemm_multithreads::compute_rowmajor_colshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
+ &plhs_buffer_[i * lhs_stride]);
+ }
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ int thread_num = omp_get_thread_num();
+ float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num];
+ float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
+
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride],
+ prhs_ptr, pres_ptr, 0, bn, bk);
+
+ if (need_col2im_)
+ _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
+ }
+ }
+ }
+}
+
+void deconv_sgemm_multithreads::compute_rowmajor_rowshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_],
+ &prhs_buffer_[j * rhs_stride]);
+ }
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ int thread_num = omp_get_thread_num();
+ float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num];
+ float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
+
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
+ plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr,
+ &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bn, bk);
+ if (need_col2im_)
+ _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
+ }
+ }
+ }
+}
+
+void deconv_sgemm_multithreads::compute_colmajor_colshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
+ &plhs_buffer_[i * lhs_stride]);
+ }
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ int thread_num = omp_get_thread_num();
+ float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num];
+ float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
+
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride],
+ prhs_ptr, pres_ptr, 0, bm, bk);
+
+ // Need to add lock?
+ if (need_col2im_)
+ _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
+ }
+ }
+ }
+}
+
+void deconv_sgemm_multithreads::compute_colmajor_rowshard()
+{
+ int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
+ int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_],
+ &prhs_buffer_[j * rhs_stride]);
+ }
+
+#pragma omp parallel for
+ for (int i = 0; i < nm_; i++)
+ {
+ int thread_num = omp_get_thread_num();
+ float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num];
+ float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
+
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
+ plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr,
+ &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bm, bk);
+
+ if (need_col2im_)
+ _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
+ }
+ }
+ }
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h
new file mode 100644
index 000000000..762f20380
--- /dev/null
+++ b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__
+#define __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+class deconv_sgemm_multithreads
+{
+public:
+ deconv_sgemm_multithreads(const convMat_t &in_mat, const convMat_t &weights_mat,
+ convMat_t &out_mat, const convParams_t &in_param, int num_threads,
+ convType_t conv_type);
+ ~deconv_sgemm_multithreads();
+
+ void run();
+
+private:
+ void param_init();
+
+ void compute_rowmajor_colshard();
+ void compute_rowmajor_rowshard();
+ void compute_colmajor_colshard();
+ void compute_colmajor_rowshard();
+
+ const convMat_t in_mat_;
+ const convMat_t weights_mat_;
+ convMat_t out_mat_;
+ const convParams_t in_param_;
+ convType_t conv_type_;
+ const int num_threads_;
+
+ int m_;
+ int n_;
+ int k_;
+
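+ // Blocked-GEMM tiling parameters: bm_/bn_/bk_ are the block sizes along
+ // M/N/K, rm_/rn_/rk_ the sizes of the trailing (remainder) blocks,
+ // nm_/nn_/nk_ the number of blocks per dimension, and mr_/nr_ the
+ // micro-kernel register tile dimensions.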
+ int bm_;
+ int bn_;
+ int bk_;
+
+ int rm_;
+ int rn_;
+ int rk_;
+
+ int nm_;
+ int nn_;
+ int nk_;
+
+ int mr_;
+ int nr_;
+
+ int need_col2im_;
+ shardType_t shard_type_;
+
+ float *prhs_buffer_;
+ float *plhs_buffer_;
+ float *pres_buffer_;
+
+ int error_;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__
diff --git a/compute/ncnn/src/srcn/depthwise_conv.cc b/compute/ncnn/src/srcn/depthwise_conv.cc
new file mode 100644
index 000000000..cd092d5ac
--- /dev/null
+++ b/compute/ncnn/src/srcn/depthwise_conv.cc
@@ -0,0 +1,2684 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <arm_neon.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
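+// 3x3 depthwise convolution, stride 1, no padding. AArch32-only NEON inline
+// assembly: the main loop produces two output rows at a time, four pixels per
+// iteration, with an intrinsic tail for the leftover columns and the last odd
+// row. On AArch64 the function is a no-op stub (the parameters are only cast
+// to void).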
+static void depthwise_conv3x3S1_nopad(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? bias.data[c] : 0.0f;
+
+ register float32x4_t weight012 asm("q4") = vld1q_f32(filter);
+ register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3);
+ register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6);
+ register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0);
+
+ float *in_ptr0 = inbuf + 0 * w;
+ float *in_ptr1 = inbuf + 1 * w;
+ float *in_ptr2 = inbuf + 2 * w;
+ float *in_ptr3 = inbuf + 3 * w;
+
+ float *out_ptr0 = outbuf + 0 * outw;
+ float *out_ptr1 = outbuf + 1 * outw;
+
+ int i;
+ for (i = 0; i + 1 < outh; i += 2)
+ {
+ int nn = (outw >> 2) - 1;
+ int remain = outw & 0x03;
+
+ if (nn > 0)
+ {
+ __asm __volatile("pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr0], %[in_ptr0], #16\n"
+
+ "1:\n"
+ "add %[in_ptr0], %[in_ptr0], #16\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr1], %[in_ptr1], #16\n"
+
+ "vand q15, %q[qbias0], %q[qbias0]\n"
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q2, %e[weight345][1]\n"
+ "vmul.f32 q12, q0, %e[weight012][0]\n"
+ "vmul.f32 q13, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr2], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vmla.f32 q15, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr2], %[in_ptr2], #16\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q2, %e[weight678][1]\n"
+ "vmla.f32 q12, q0, %e[weight345][0]\n"
+ "vmla.f32 q13, q2, %e[weight345][1]\n"
+
+ "pld [%[in_ptr3], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr3]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vmla.f32 q15, q3, %f[weight345][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr3], %[in_ptr3], #16\n"
+
+ "vmla.f32 q12, q0, %e[weight678][0]\n"
+ "vmla.f32 q13, q2, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vmla.f32 q15, q3, %f[weight678][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+ "vadd.f32 q15, q15, q12\n"
+ "vadd.f32 q15, q15, q13\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n"
+
+ "bne 1b\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
+
+ [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+ float32x4_t input3 = vld1q_f32(in_ptr3);
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ float32x4_t out1 = vmulq_f32(input1, weight012);
+ out1 = vmlaq_f32(out1, input2, weight345);
+ out1 = vmlaq_f32(out1, input3, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+ out1 = vsetq_lane_f32(bias0, out1, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+ float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1));
+
+ float32x2_t out01 = vpadd_f32(out00, out11);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+ *out_ptr1 = vget_lane_f32(out01, 1);
+
+ in_ptr0++;
+ in_ptr1++;
+ in_ptr2++;
+ in_ptr3++;
+ out_ptr0++;
+ out_ptr1++;
+ }
+
+ in_ptr0 += w + 2;
+ in_ptr1 += w + 2;
+ in_ptr2 += w + 2;
+ in_ptr3 += w + 2;
+
+ out_ptr0 += outw;
+ out_ptr1 += outw;
+ }
+
+ for (; i < outh; i++)
+ {
+ int nn = outw >> 2;
+ int remain = outw & 0x03;
+
+ if (nn > 0)
+ {
+ __asm __volatile("1:\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr0], %[in_ptr0], #16\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmla.f32 q14, q0, %e[weight012][0]\n"
+ "vmla.f32 q14, q2, %e[weight012][1]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr1], %[in_ptr1], #16\n"
+
+ "vmla.f32 q14, q0, %e[weight345][0]\n"
+ "vmla.f32 q14, q2, %e[weight345][1]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr2], %[in_ptr2], #16\n"
+
+ "vmla.f32 q14, q0, %e[weight678][0]\n"
+ "vmla.f32 q14, q2, %e[weight678][1]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+
+ "bne 1b\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0++;
+ in_ptr1++;
+ in_ptr2++;
+ out_ptr0++;
+ }
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ in_ptr2 += 2;
+ }
+ }
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)bias;
+#endif // !__aarch64__
+}
+
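+// 3x3 depthwise convolution, stride 1, with zero padding. The top/bottom
+// output rows and the left/right columns take special-cased paths that
+// substitute zeros for out-of-range taps; the narrow-input (nn == 0) cases are
+// still marked TODO. AArch32 NEON assembly only; no-op stub on AArch64.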
+static void depthwise_conv3x3S1_padding(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? bias.data[c] : 0.0f;
+
+ register float32x4_t weight012 asm("q4") = vld1q_f32(filter);
+ register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3);
+ register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6);
+ register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0);
+
+ float *in_ptr0 = inbuf + 0 * w;
+ float *in_ptr1 = inbuf + 1 * w;
+ float *in_ptr2 = inbuf + 2 * w;
+ float *in_ptr3 = inbuf + 3 * w;
+
+ float *out_ptr0 = outbuf + 0 * outw;
+ float *out_ptr1 = outbuf + 1 * outw;
+
+ int i;
+ for (i = 0; i + 1 < outh; i += 2)
+ {
+ int nn = (outw >> 2) - 1;
+ int remain = (outw & 0x03) + 4;
+ if (i == 0)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q8, #0\n"
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr0], %[in_ptr0], #12\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vand q15, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q2, %e[weight345][0]\n"
+ "vmul.f32 q11, q0, %e[weight345][1]\n"
+ "vmul.f32 q12, q2, %e[weight012][0]\n"
+ "vmul.f32 q13, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vmla.f32 q15, q3, %f[weight012][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr1], %[in_ptr1], #12\n"
+
+ "vmla.f32 q10, q2, %e[weight678][0]\n"
+ "vmla.f32 q11, q0, %e[weight678][1]\n"
+ "vmla.f32 q12, q2, %e[weight345][0]\n"
+ "vmla.f32 q13, q0, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vmla.f32 q15, q3, %f[weight345][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr2], %[in_ptr2], #12\n"
+
+ "vmla.f32 q12, q2, %e[weight678][0]\n"
+ "vmla.f32 q13, q0, %e[weight678][1]\n"
+ "vmla.f32 q15, q3, %f[weight678][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+ "vadd.f32 q15, q15, q12\n"
+ "vadd.f32 q15, q15, q13\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "1:\n"
+ "add %[in_ptr0], %[in_ptr0], #16\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vand q15, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight345][0]\n"
+ "vmul.f32 q11, q2, %e[weight345][1]\n"
+ "vmul.f32 q12, q0, %e[weight012][0]\n"
+ "vmul.f32 q13, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vmla.f32 q15, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr1], %[in_ptr1], #16\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q2, %e[weight678][1]\n"
+ "vmla.f32 q12, q0, %e[weight345][0]\n"
+ "vmla.f32 q13, q2, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vmla.f32 q15, q3, %f[weight345][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr2], %[in_ptr2], #16\n"
+
+ "vmla.f32 q12, q0, %e[weight678][0]\n"
+ "vmla.f32 q13, q2, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vmla.f32 q15, q3, %f[weight678][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+ "vadd.f32 q15, q15, q12\n"
+ "vadd.f32 q15, q15, q13\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n"
+ "bne 1b\n"
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0),
+ [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+
+ for (; remain > 0; remain--)
+ {
+ // TODO: when nn == 0, pad_left comes here.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ input2 = vsetq_lane_f32(0.0f, input2, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight345);
+ out0 = vmlaq_f32(out0, input1, weight678);
+
+ float32x4_t out1 = vmulq_f32(input0, weight012);
+ out1 = vmlaq_f32(out1, input1, weight345);
+ out1 = vmlaq_f32(out1, input2, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+ out1 = vsetq_lane_f32(bias0, out1, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+ float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1));
+
+ float32x2_t out01 = vpadd_f32(out00, out11);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+ *out_ptr1 = vget_lane_f32(out01, 1);
+
+ in_ptr0++;
+ in_ptr1++;
+ in_ptr2++;
+ out_ptr0++;
+ out_ptr1++;
+ }
+
+ in_ptr0 += 1;
+ in_ptr1 += 1;
+ in_ptr2 += 1;
+ in_ptr3 += w;
+ }
+ else if (i == outh - 2)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q8, #0\n"
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr0], %[in_ptr0], #12\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q2, %e[weight012][0]\n"
+ "vmul.f32 q11, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr1], %[in_ptr1], #12\n"
+
+ "vand q15, %q[qbias0], %q[qbias0]\n"
+ "vmla.f32 q10, q2, %e[weight345][0]\n"
+ "vmla.f32 q11, q0, %e[weight345][1]\n"
+ "vmul.f32 q12, q2, %e[weight012][0]\n"
+ "vmul.f32 q13, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr2], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vmla.f32 q15, q3, %f[weight012][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr2], %[in_ptr2], #12\n"
+
+ "vmla.f32 q10, q2, %e[weight678][0]\n"
+ "vmla.f32 q11, q0, %e[weight678][1]\n"
+ "vmla.f32 q12, q2, %e[weight345][0]\n"
+ "vmla.f32 q13, q0, %e[weight345][1]\n"
+
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vmla.f32 q15, q3, %f[weight345][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+ "vadd.f32 q15, q15, q12\n"
+ "vadd.f32 q15, q15, q13\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "1:\n"
+ "add %[in_ptr0], %[in_ptr0], #16\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr1], %[in_ptr1], #16\n"
+
+ "vand q15, %q[qbias0], %q[qbias0]\n"
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q2, %e[weight345][1]\n"
+ "vmul.f32 q12, q0, %e[weight012][0]\n"
+ "vmul.f32 q13, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr2], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vmla.f32 q15, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr2], %[in_ptr2], #16\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q2, %e[weight678][1]\n"
+ "vmla.f32 q12, q0, %e[weight345][0]\n"
+ "vmla.f32 q13, q2, %e[weight345][1]\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vmla.f32 q15, q3, %f[weight345][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+ "vadd.f32 q15, q15, q12\n"
+ "vadd.f32 q15, q15, q13\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n"
+ "bne 1b\n"
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0),
+ [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: when nn == 0, pad_left comes here.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ input2 = vsetq_lane_f32(0.0f, input2, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ float32x4_t out1 = vmulq_f32(input1, weight012);
+ out1 = vmlaq_f32(out1, input2, weight345);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+ out1 = vsetq_lane_f32(bias0, out1, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+ float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1));
+
+ float32x2_t out01 = vpadd_f32(out00, out11);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+ *out_ptr1 = vget_lane_f32(out01, 1);
+
+ in_ptr0++;
+ in_ptr1++;
+ in_ptr2++;
+ out_ptr0++;
+ out_ptr1++;
+ }
+ }
+ else
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q8, #0\n"
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr0], %[in_ptr0], #12\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q2, %e[weight012][0]\n"
+ "vmul.f32 q11, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr1], %[in_ptr1], #12\n"
+
+ "vand q15, %q[qbias0], %q[qbias0]\n"
+ "vmla.f32 q10, q2, %e[weight345][0]\n"
+ "vmla.f32 q11, q0, %e[weight345][1]\n"
+ "vmul.f32 q12, q2, %e[weight012][0]\n"
+ "vmul.f32 q13, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr2], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vmla.f32 q15, q3, %f[weight012][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr2], %[in_ptr2], #12\n"
+
+ "vmla.f32 q10, q2, %e[weight678][0]\n"
+ "vmla.f32 q11, q0, %e[weight678][1]\n"
+ "vmla.f32 q12, q2, %e[weight345][0]\n"
+ "vmla.f32 q13, q0, %e[weight345][1]\n"
+
+ "pld [%[in_ptr3], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr3]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vmla.f32 q15, q3, %f[weight345][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr3], %[in_ptr3], #12\n"
+
+ "vmla.f32 q15, q2, %e[weight678][0]\n"
+ "vmla.f32 q15, q0, %e[weight678][1]\n"
+ "vmla.f32 q15, q3, %f[weight678][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+ "vadd.f32 q15, q15, q12\n"
+ "vadd.f32 q15, q15, q13\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "1:\n"
+ "add %[in_ptr0], %[in_ptr0], #16\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr1], %[in_ptr1], #16\n"
+
+ "vand q15, %q[qbias0], %q[qbias0]\n"
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q2, %e[weight345][1]\n"
+ "vmul.f32 q12, q0, %e[weight012][0]\n"
+ "vmul.f32 q13, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr2], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vmla.f32 q15, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr2], %[in_ptr2], #16\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q2, %e[weight678][1]\n"
+ "vmla.f32 q12, q0, %e[weight345][0]\n"
+ "vmla.f32 q13, q2, %e[weight345][1]\n"
+
+ "pld [%[in_ptr3], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr3]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vmla.f32 q15, q3, %f[weight345][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr3], %[in_ptr3], #16\n"
+
+ "vmla.f32 q15, q0, %e[weight678][0]\n"
+ "vmla.f32 q15, q2, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vmla.f32 q15, q3, %f[weight678][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q15, q15, q12\n"
+ "vadd.f32 q14, q14, q11\n"
+ "vadd.f32 q15, q15, q13\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n"
+ "bne 1b\n"
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3),
+
+ [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: when nn == 0, pad_left comes here.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+ float32x4_t input3 = vld1q_f32(in_ptr3);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ input2 = vsetq_lane_f32(0.0f, input2, 2);
+ input3 = vsetq_lane_f32(0.0f, input3, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ float32x4_t out1 = vmulq_f32(input1, weight012);
+ out1 = vmlaq_f32(out1, input2, weight345);
+ out1 = vmlaq_f32(out1, input3, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+ out1 = vsetq_lane_f32(bias0, out1, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+ float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1));
+
+ float32x2_t out01 = vpadd_f32(out00, out11);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+ *out_ptr1 = vget_lane_f32(out01, 1);
+
+ in_ptr0++;
+ in_ptr1++;
+ in_ptr2++;
+ in_ptr3++;
+ out_ptr0++;
+ out_ptr1++;
+ }
+ in_ptr0 += w + 1;
+ in_ptr1 += w + 1;
+ in_ptr2 += w + 1;
+ in_ptr3 += w + 1;
+ }
+
+ out_ptr0 += outw;
+ out_ptr1 += outw;
+ }
+
+ for (; i < outh; i++)
+ {
+ // TODO: if i == 0, the pad_top case falls through to here and is not yet handled.
+ int nn = (outw >> 2) - 1;
+ int remain = (outw & 0x03) + 4;
+
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q8, #0\n"
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr0], %[in_ptr0], #12\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q2, %e[weight012][0]\n"
+ "vmul.f32 q11, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q2, q8, q0, #3\n"
+ "vext.32 q3, q0, q1, #1\n"
+ "add %[in_ptr1], %[in_ptr1], #12\n"
+
+ "vmla.f32 q10, q2, %e[weight345][0]\n"
+ "vmla.f32 q11, q0, %e[weight345][1]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "1:\n"
+ "add %[in_ptr0], %[in_ptr0], #16\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q2, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+ "add %[in_ptr1], %[in_ptr1], #16\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q2, %e[weight345][1]\n"
+
+ "pld [%[in_ptr0], #192]\n"
+ "vld1.f32 {d0-d2}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q2, q0, q1, #1\n"
+ "vext.32 q3, q0, q1, #2\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: when nn == 0, pad_left comes here.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0++;
+ in_ptr1++;
+ out_ptr0++;
+ out_ptr1++;
+ }
+ }
+ }
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)bias;
+#endif // !__aarch64__
+}
+
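+// 3x3 depthwise convolution, stride 2, no padding. vld2 de-interleaving loads
+// split even/odd input columns so each NEON iteration consumes eight input
+// columns and emits four outputs per row; AArch32 only, no-op stub on AArch64.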
+static void depthwise_conv3x3S2_nopad(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
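+ // Each output row consumes 2 * outw input columns; tailstep advances the
+ // input pointers past the unused right-hand columns and the skipped next row.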
+ const int tailstep = w - 2 * outw + w;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? bias.data[c] : 0.0f;
+
+ register float32x4_t weight012 asm("q4") = vld1q_f32(filter);
+ register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3);
+ register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6);
+ register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0);
+
+ float *in_ptr0 = inbuf + 0 * w;
+ float *in_ptr1 = inbuf + 1 * w;
+ float *in_ptr2 = inbuf + 2 * w;
+
+ float *out_ptr0 = outbuf + 0 * outw;
+
+ int i;
+ for (i = 0; i < outh; i++)
+ {
+ int nn = outw >> 2;
+ int remain = outw & 0x03;
+
+ if (nn > 0)
+ {
+ __asm __volatile("pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q1, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ in_ptr2 += 2;
+ out_ptr0++;
+ }
+
+ in_ptr0 += tailstep;
+ in_ptr1 += tailstep;
+ in_ptr2 += tailstep;
+ }
+ }
+
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)bias;
+#endif // __aarch64__
+}
+
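+// Stride-2 variant for the padding case with no extra top or left zero column:
+// only the last output row (bottom edge) and the right-edge columns in the
+// intrinsic tail substitute zeros for out-of-range taps.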
+static void depthwise_conv3x3S2_padding00(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? bias.data[c] : 0.0f;
+
+ register float32x4_t weight012 asm("q4") = vld1q_f32(filter);
+ register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3);
+ register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6);
+ register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0);
+
+ float *in_ptr0 = inbuf + 0 * w;
+ float *in_ptr1 = inbuf + 1 * w;
+ float *in_ptr2 = inbuf + 2 * w;
+
+ float *out_ptr0 = outbuf + 0 * outw;
+
+ int i;
+ for (i = 0; i < outh; i++)
+ {
+ int nn = (outw >> 2) - 1;
+ int remain = (outw & 0x03) + 4;
+
+ if (i == outh - 1)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ out_ptr0++;
+ }
+ }
+ else
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q1, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ input2 = vsetq_lane_f32(0.0f, input2, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ in_ptr2 += 2;
+ out_ptr0++;
+ }
+
+ in_ptr0 += w;
+ in_ptr1 += w;
+ in_ptr2 += w;
+ }
+ }
+ }
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)bias;
+#endif // !__aarch64__
+}
+
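+// Stride-2 variant that zero-pads the leading input column (a zero is shifted
+// into the leftmost tap of the first vector block) and special-cases the last
+// output row for the bottom edge.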
+static void depthwise_conv3x3S2_padding01(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? bias.data[c] : 0.0f;
+
+ register float32x4_t weight012 asm("q4") = vld1q_f32(filter);
+ register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3);
+ register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6);
+ register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0);
+
+ float *in_ptr0 = inbuf + 0 * w;
+ float *in_ptr1 = inbuf + 1 * w;
+ float *in_ptr2 = inbuf + 2 * w;
+
+ float *out_ptr0 = outbuf + 0 * outw;
+
+ int i;
+ for (i = 0; i < outh; i++)
+ {
+ int nn = (outw >> 2) - 1;
+ int remain = (outw & 0x03) + 4;
+
+ if (i == outh - 1)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q2, #0\n"
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr0], %[in_ptr0], #28\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q3, %e[weight012][0]\n"
+ "vmul.f32 q11, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q1, %f[weight012][0]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr1], %[in_ptr1], #28\n"
+
+ "vmla.f32 q10, q3, %e[weight345][0]\n"
+ "vmla.f32 q11, q0, %e[weight345][1]\n"
+ "vmla.f32 q14, q1, %f[weight345][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: if nn == 0, pad_left comes here.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ out_ptr0++;
+ }
+ }
+ else
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q2, #0\n"
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr0], %[in_ptr0], #28\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q3, %e[weight012][0]\n"
+ "vmul.f32 q11, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q1, %f[weight012][0]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr1], %[in_ptr1], #28\n"
+
+ "vmla.f32 q10, q3, %e[weight345][0]\n"
+ "vmla.f32 q11, q0, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q1, %f[weight345][0]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr2], %[in_ptr2], #28\n"
+
+ "vmla.f32 q10, q3, %e[weight678][0]\n"
+ "vmla.f32 q11, q0, %e[weight678][1]\n"
+ "vmla.f32 q14, q1, %f[weight678][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q1, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: if nn == 0, pad_left comes here.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ input2 = vsetq_lane_f32(0.0f, input2, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ in_ptr2 += 2;
+ out_ptr0++;
+ }
+
+ in_ptr0 += w;
+ in_ptr1 += w;
+ in_ptr2 += w;
+ }
+ }
+ }
+
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)bias;
+#endif // !__aarch64__
+}
+
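+// Stride-2 variant that zero-pads the top edge (the first output row uses only
+// the middle and bottom filter rows) and the bottom edge (the last output row
+// uses only the top and middle filter rows); no leading zero column.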
+static void depthwise_conv3x3S2_padding10(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? bias.data[c] : 0.0f;
+
+ register float32x4_t weight012 asm("q4") = vld1q_f32(filter);
+ register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3);
+ register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6);
+ register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0);
+
+ float *in_ptr0 = inbuf + 0 * w;
+ float *in_ptr1 = inbuf + 1 * w;
+ float *in_ptr2 = inbuf + 2 * w;
+
+ float *out_ptr0 = outbuf + 0 * outw;
+
+ int i;
+ for (i = 0; i < outh; i++)
+ {
+ int nn = (outw >> 2) - 1;
+ int remain = (outw & 0x03) + 4;
+
+ // TODO: handle the degenerate case where i == 0 and i == outh - 1 coincide (outh == 1).
+ if (i == 0)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight345][0]\n"
+ "vmul.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q1, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight345);
+ out0 = vmlaq_f32(out0, input1, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ out_ptr0++;
+ }
+
+ in_ptr2 += w;
+ }
+ else if (i == outh - 1)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ out_ptr0++;
+ }
+ }
+ else
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q1, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ input2 = vsetq_lane_f32(0.0f, input2, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ in_ptr2 += 2;
+ out_ptr0++;
+ }
+
+ in_ptr0 += w;
+ in_ptr1 += w;
+ in_ptr2 += w;
+ }
+ }
+ }
+
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)bias;
+#endif // __aarch64__
+}
+
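+// Stride-2 variant that combines both paddings: a zero is shifted into the
+// leftmost tap of every row block, the first output row drops the top filter
+// row, and the last output row drops the bottom filter row.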
+static void depthwise_conv3x3S2_padding11(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? bias.data[c] : 0.0f;
+
+ register float32x4_t weight012 asm("q4") = vld1q_f32(filter);
+ register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3);
+ register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6);
+ register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0);
+
+ float *in_ptr0 = inbuf + 0 * w;
+ float *in_ptr1 = inbuf + 1 * w;
+ float *in_ptr2 = inbuf + 2 * w;
+
+ float *out_ptr0 = outbuf + 0 * outw;
+
+ int i;
+ for (i = 0; i < outh; i++)
+ {
+ int nn = (outw >> 2) - 1;
+ int remain = (outw & 0x03) + 4;
+
+ // TODO: handle the degenerate case where i == 0 and i == outh - 1 coincide (outh == 1).
+ if (i == 0)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q2, #0\n"
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr0], %[in_ptr0], #28\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q3, %e[weight345][0]\n"
+ "vmul.f32 q11, q0, %e[weight345][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q1, %f[weight345][0]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr1], %[in_ptr1], #28\n"
+
+ "vmla.f32 q10, q3, %e[weight678][0]\n"
+ "vmla.f32 q11, q0, %e[weight678][1]\n"
+ "vmla.f32 q14, q1, %f[weight678][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight345][0]\n"
+ "vmul.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q1, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: if nn == 0, pad_left comes here.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight345);
+ out0 = vmlaq_f32(out0, input1, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ out_ptr0++;
+ }
+
+ in_ptr2 += w;
+ }
+ else if (i == outh - 1)
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q2, #0\n"
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr0], %[in_ptr0], #28\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q3, %e[weight012][0]\n"
+ "vmul.f32 q11, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q1, %f[weight012][0]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr1], %[in_ptr1], #28\n"
+
+ "vmla.f32 q10, q3, %e[weight345][0]\n"
+ "vmla.f32 q11, q0, %e[weight345][1]\n"
+ "vmla.f32 q14, q1, %f[weight345][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: when nn == 0, the left-padding (pad_left) column falls through to this scalar loop.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ out_ptr0++;
+ }
+ }
+ else
+ {
+ if (nn > 0)
+ {
+ __asm __volatile("vmov.i32 q2, #0\n"
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr0], %[in_ptr0], #28\n"
+
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q3, %e[weight012][0]\n"
+ "vmul.f32 q11, q0, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q1, %f[weight012][0]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr1], %[in_ptr1], #28\n"
+
+ "vmla.f32 q10, q3, %e[weight345][0]\n"
+ "vmla.f32 q11, q0, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q1, %f[weight345][0]\n"
+ "vext.32 q3, q2, q0, #3\n"
+ "add %[in_ptr2], %[in_ptr2], #28\n"
+
+ "vmla.f32 q10, q3, %e[weight678][0]\n"
+ "vmla.f32 q11, q0, %e[weight678][1]\n"
+ "vmla.f32 q14, q1, %f[weight678][0]\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "beq 2f\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "1:\n"
+ "vand q14, %q[qbias0], %q[qbias0]\n"
+ "vmul.f32 q10, q0, %e[weight012][0]\n"
+ "vmul.f32 q11, q1, %e[weight012][1]\n"
+
+ "pld [%[in_ptr1], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr1]]\n"
+ "vmla.f32 q14, q3, %f[weight012][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight345][0]\n"
+ "vmla.f32 q11, q1, %e[weight345][1]\n"
+
+ "pld [%[in_ptr2], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr2]]\n"
+ "vmla.f32 q14, q3, %f[weight345][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vmla.f32 q10, q0, %e[weight678][0]\n"
+ "vmla.f32 q11, q1, %e[weight678][1]\n"
+
+ "pld [%[in_ptr0], #256]\n"
+ "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n"
+ "vld1.f32 {d4[0]}, [%[in_ptr0]]\n"
+ "vmla.f32 q14, q3, %f[weight678][0]\n"
+ "vext.32 q3, q0, q2, #1\n"
+
+ "vadd.f32 q14, q14, q10\n"
+ "vadd.f32 q14, q14, q11\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n"
+ "bne 1b\n"
+ "sub %[in_ptr0], %[in_ptr0], #32\n"
+ "2:\n"
+ : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1),
+ [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn)
+ : [weight012] "w"(weight012), [weight345] "w"(weight345),
+ [weight678] "w"(weight678), [qbias0] "w"(qbias0)
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "cc", "memory");
+ }
+ for (; remain > 0; remain--)
+ {
+ // TODO: when nn == 0, the left-padding (pad_left) column falls through to this scalar loop.
+ float32x4_t input0 = vld1q_f32(in_ptr0);
+ float32x4_t input1 = vld1q_f32(in_ptr1);
+ float32x4_t input2 = vld1q_f32(in_ptr2);
+
+ if (remain == 1)
+ {
+ input0 = vsetq_lane_f32(0.0f, input0, 2);
+ input1 = vsetq_lane_f32(0.0f, input1, 2);
+ input2 = vsetq_lane_f32(0.0f, input2, 2);
+ }
+
+ float32x4_t out0 = vmulq_f32(input0, weight012);
+ out0 = vmlaq_f32(out0, input1, weight345);
+ out0 = vmlaq_f32(out0, input2, weight678);
+
+ out0 = vsetq_lane_f32(bias0, out0, 3);
+
+ float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0));
+
+ float32x2_t out01 = vpadd_f32(out00, out00);
+
+ *out_ptr0 = vget_lane_f32(out01, 0);
+
+ in_ptr0 += 2;
+ in_ptr1 += 2;
+ in_ptr2 += 2;
+ out_ptr0++;
+ }
+
+ in_ptr0 += w;
+ in_ptr1 += w;
+ in_ptr2 += w;
+ }
+ }
+ }
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)bias;
+#endif // __aarch64__
+}
+
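+// Depthwise convolution for col-major (channels-innermost) tensors. Each
+// output row is zeroed first, then every kernel tap (kh, kw) is accumulated
+// as an element-wise multiply over the channel vector of the corresponding
+// input position; taps that land in the padded border are skipped. The
+// channel loop is vectorized with NEON, 4 floats at a time, with 2-wide and
+// scalar tails.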
+static void depthwise_conv_colmajor(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convParams_t &in_param)
+{
+#if __aarch64__
+ const int w = in_mat.w;
+ const int h = in_mat.h;
+ const int outw = out_mat.w;
+ const int outh = out_mat.h;
+ const int channels = out_mat.c;
+ const int stridew = in_param.stride_w;
+ const int strideh = in_param.stride_h;
+ const int padding = in_param.padding;
+ const int padw = in_param.pad_w;
+ const int padh = in_param.pad_h;
+
+#pragma omp parallel for
+ for (int oh = 0; oh < outh; oh++)
+ {
+ const float *input_data0 = in_mat.data + (oh * strideh - padh) * w * channels;
+
+ memset(out_mat.data + oh * outw * channels, 0x00, outw * channels * sizeof(float));
+
+ for (int kh = 0; kh < in_param.kernel_h; kh++)
+ {
+ for (int kw = 0; kw < in_param.kernel_w; kw++)
+ {
+ const float *kernel_data = kernel.data + (kh * in_param.kernel_w + kw) * channels;
+ const float *input_data1 = input_data0 + (kh * w + kw) * channels;
+
+ if (padding && ((oh * strideh + kh < padh) || (oh * strideh + kh >= padh + h)))
+ {
+ continue;
+ }
+
+ int ow = 0;
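+ // Walk the output row four columns at a time: groups whose taps lie
+ // entirely in the left/right border are skipped (ow advanced past them),
+ // a group that only starts in the border fast-forwards ow to the first
+ // valid column, and the 2-wide / 1-wide loops below handle the leftovers.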
+ for (; ow + 3 < outw; /*ow += 4*/)
+ {
+ if (((ow + 3) * stridew + kw < padw) || (ow * stridew + kw >= padw + w))
+ {
+ ow += 4;
+ continue;
+ }
+ else if ((ow + 3) * stridew + kw >= padw + w)
+ {
+ break;
+ }
+ else if (ow * stridew + kw < padw)
+ {
+ int delta = (padw - kw) / stridew - ow;
+ delta += (padw - kw) % stridew ? 1 : 0;
+ ow += delta;
+ continue;
+ }
+
+ int nn = channels >> 2;
+ int remain = channels & 0x03;
+
+ const float *input_r0 = input_data1 + (ow * stridew - padw) * channels;
+
+ const float *input_r1 = input_r0 + stridew * channels;
+ const float *input_r2 = input_r1 + stridew * channels;
+ const float *input_r3 = input_r2 + stridew * channels;
+ const float *weights_data = kernel_data;
+ float *output_r0 = out_mat.data + (oh * outw + ow) * channels;
+ float *output_r1 = output_r0 + channels;
+ float *output_r2 = output_r1 + channels;
+ float *output_r3 = output_r2 + channels;
+
+ if (nn > 0)
+ {
+ int _n = (nn + 1) >> 1;
+ int oddn = nn & 1;
+
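+ // The asm block consumes channel quads in pairs: _n is the rounded-up
+ // number of pairs and oddn records whether nn is odd, so the tail after
+ // label "1:" writes back either one quad or two.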
+ asm volatile("subs %[_n], %[_n], #1\n"
+ "ld1 {v4.4s}, [%[weights_data]], #16\n"
+ "ld1 {v5.4s}, [%[input_r0]], #16\n"
+ "ld1 {v6.4s}, [%[input_r1]], #16\n"
+ "ld1 {v7.4s}, [%[input_r2]], #16\n"
+ "ld1 {v8.4s}, [%[input_r3]], #16\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v24.4s, v25.4s}, [%[output_r0]]\n"
+ "ld1 {v26.4s, v27.4s}, [%[output_r1]]\n"
+ "ld1 {v28.4s, v29.4s}, [%[output_r2]]\n"
+ "ld1 {v30.4s, v31.4s}, [%[output_r3]]\n"
+
+ "ld1 {v9.4s}, [%[weights_data]], #16\n"
+ "ld1 {v10.4s}, [%[input_r0]], #16\n"
+ "ld1 {v11.4s}, [%[input_r1]], #16\n"
+ "ld1 {v12.4s}, [%[input_r2]], #16\n"
+ "ld1 {v13.4s}, [%[input_r3]], #16\n"
+
+ "fmla v24.4s, v4.4s, v5.4s\n"
+ "fmla v26.4s, v4.4s, v6.4s\n"
+
+ "fmla v28.4s, v4.4s, v7.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+
+ "ld1 {v4.4s}, [%[weights_data]], #16\n"
+ "ld1 {v5.4s}, [%[input_r0]], #16\n"
+ "ld1 {v6.4s}, [%[input_r1]], #16\n"
+ "ld1 {v7.4s}, [%[input_r2]], #16\n"
+ "ld1 {v8.4s}, [%[input_r3]], #16\n"
+
+ "fmla v25.4s, v9.4s, v10.4s\n"
+ "fmla v27.4s, v9.4s, v11.4s\n"
+
+ "fmla v29.4s, v9.4s, v12.4s\n"
+ "fmla v31.4s, v9.4s, v13.4s\n"
+
+ "st1 {v24.4s, v25.4s}, [%[output_r0]], #32\n"
+ "st1 {v26.4s, v27.4s}, [%[output_r1]], #32\n"
+ "st1 {v28.4s, v29.4s}, [%[output_r2]], #32\n"
+ "st1 {v30.4s, v31.4s}, [%[output_r3]], #32\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v24.4s}, [%[output_r0]]\n"
+ "ld1 {v26.4s}, [%[output_r1]]\n"
+ "ld1 {v28.4s}, [%[output_r2]]\n"
+ "ld1 {v30.4s}, [%[output_r3]]\n"
+ "cmp %[oddn], #1\n"
+
+ "fmla v24.4s, v4.4s, v5.4s\n"
+ "fmla v26.4s, v4.4s, v6.4s\n"
+
+ "fmla v28.4s, v4.4s, v7.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+
+ "st1 {v24.4s}, [%[output_r0]], #16\n"
+ "st1 {v26.4s}, [%[output_r1]], #16\n"
+ "st1 {v28.4s}, [%[output_r2]], #16\n"
+ "st1 {v30.4s}, [%[output_r3]], #16\n"
+
+ "beq 2f\n"
+ "ld1 {v25.4s}, [%[output_r0]]\n"
+ "ld1 {v27.4s}, [%[output_r1]]\n"
+ "ld1 {v29.4s}, [%[output_r2]]\n"
+ "ld1 {v31.4s}, [%[output_r3]]\n"
+
+ "ld1 {v9.4s}, [%[weights_data]], #16\n"
+ "ld1 {v10.4s}, [%[input_r0]], #16\n"
+ "ld1 {v11.4s}, [%[input_r1]], #16\n"
+ "ld1 {v12.4s}, [%[input_r2]], #16\n"
+ "ld1 {v13.4s}, [%[input_r3]], #16\n"
+
+ "fmla v25.4s, v9.4s, v10.4s\n"
+ "fmla v27.4s, v9.4s, v11.4s\n"
+
+ "fmla v29.4s, v9.4s, v12.4s\n"
+ "fmla v31.4s, v9.4s, v13.4s\n"
+
+ "st1 {v25.4s}, [%[output_r0]], #16\n"
+ "st1 {v27.4s}, [%[output_r1]], #16\n"
+ "st1 {v29.4s}, [%[output_r2]], #16\n"
+ "st1 {v31.4s}, [%[output_r3]], #16\n"
+ "2:\n"
+ : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0),
+ [input_r1] "+r"(input_r1), [input_r2] "+r"(input_r2),
+ [input_r3] "+r"(input_r3), [output_r0] "+r"(output_r0),
+ [output_r1] "+r"(output_r1), [output_r2] "+r"(output_r2),
+ [output_r3] "+r"(output_r3), [_n] "+r"(_n)
+ : [oddn] "r"(oddn)
+ : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+ }
+ if (remain >= 2)
+ {
+ asm volatile(
+ "ld1 {v24.2s}, [%[output_r0]]\n"
+ "ld1 {v26.2s}, [%[output_r1]]\n"
+ "ld1 {v28.2s}, [%[output_r2]]\n"
+ "ld1 {v30.2s}, [%[output_r3]]\n"
+ "ld1 {v4.2s}, [%[weights_data]], #8\n"
+ "ld1 {v5.2s}, [%[input_r0]], #8\n"
+
+ "ld1 {v6.2s}, [%[input_r1]], #8\n"
+ "ld1 {v7.2s}, [%[input_r2]], #8\n"
+ "ld1 {v8.2s}, [%[input_r3]], #8\n"
+
+ "fmla v24.2s, v4.2s, v5.2s\n"
+ "fmla v26.2s, v4.2s, v6.2s\n"
+
+ "fmla v28.2s, v4.2s, v7.2s\n"
+ "fmla v30.2s, v4.2s, v8.2s\n"
+
+ "st1 {v24.2s}, [%[output_r0]], #8\n"
+ "st1 {v26.2s}, [%[output_r1]], #8\n"
+ "st1 {v28.2s}, [%[output_r2]], #8\n"
+ "st1 {v30.2s}, [%[output_r3]], #8\n"
+ : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0),
+ [input_r1] "+r"(input_r1), [input_r2] "+r"(input_r2), [input_r3] "+r"(input_r3),
+ [output_r0] "+r"(output_r0), [output_r1] "+r"(output_r1),
+ [output_r2] "+r"(output_r2), [output_r3] "+r"(output_r3)
+ :
+ : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v24", "v26", "v28", "v30");
+ remain -= 2;
+ }
+
+ if (remain > 0)
+ {
+ // remain == 1: all four output columns share the same (last) channel weight.
+ *output_r0++ += (*weights_data) * (*input_r0++);
+ *output_r1++ += (*weights_data) * (*input_r1++);
+ *output_r2++ += (*weights_data) * (*input_r2++);
+ *output_r3++ += (*weights_data++) * (*input_r3++);
+ }
+ ow += 4;
+ }
+
+ for (; ow + 1 < outw; /*ow += 2*/)
+ {
+ if (padding)
+ {
+ if (((ow + 1) * stridew + kw < padw) || (ow * stridew + kw >= padw + w))
+ {
+ ow += 2;
+ continue;
+ }
+ else if ((ow + 1) * stridew + kw >= padw + w)
+ {
+ break;
+ }
+ else if (ow * stridew + kw < padw)
+ {
+ ow++;
+ continue;
+ }
+ }
+
+ int nn = channels >> 2;
+ int remain = channels & 0x03;
+
+ const float *input_r0 = input_data1 + (ow * stridew - padw) * channels;
+
+ const float *input_r1 = input_r0 + stridew * channels;
+ const float *weights_data = kernel_data;
+ float *output_r0 = out_mat.data + (oh * outw + ow) * channels;
+ float *output_r1 = output_r0 + channels;
+
+ if (nn > 0)
+ {
+ int _n = (nn + 1) >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("subs %[_n], %[_n], #1\n"
+ "ld1 {v4.4s}, [%[weights_data]], #16\n"
+ "ld1 {v5.4s}, [%[input_r0]], #16\n"
+ "ld1 {v6.4s}, [%[input_r1]], #16\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v24.4s, v25.4s}, [%[output_r0]]\n"
+ "ld1 {v26.4s, v27.4s}, [%[output_r1]]\n"
+
+ "ld1 {v9.4s}, [%[weights_data]], #16\n"
+ "ld1 {v10.4s}, [%[input_r0]], #16\n"
+ "ld1 {v11.4s}, [%[input_r1]], #16\n"
+
+ "fmla v24.4s, v4.4s, v5.4s\n"
+ "fmla v26.4s, v4.4s, v6.4s\n"
+
+ "ld1 {v4.4s}, [%[weights_data]], #16\n"
+ "ld1 {v5.4s}, [%[input_r0]], #16\n"
+ "ld1 {v6.4s}, [%[input_r1]], #16\n"
+
+ "fmla v25.4s, v9.4s, v10.4s\n"
+ "fmla v27.4s, v9.4s, v11.4s\n"
+
+ "st1 {v24.4s, v25.4s}, [%[output_r0]], #32\n"
+ "st1 {v26.4s, v27.4s}, [%[output_r1]], #32\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v24.4s}, [%[output_r0]]\n"
+ "ld1 {v26.4s}, [%[output_r1]]\n"
+ "cmp %[oddn], #1\n"
+
+ "fmla v24.4s, v4.4s, v5.4s\n"
+ "fmla v26.4s, v4.4s, v6.4s\n"
+
+ "st1 {v24.4s}, [%[output_r0]], #16\n"
+ "st1 {v26.4s}, [%[output_r1]], #16\n"
+
+ "beq 2f\n"
+ "ld1 {v25.4s}, [%[output_r0]]\n"
+ "ld1 {v27.4s}, [%[output_r1]]\n"
+
+ "ld1 {v9.4s}, [%[weights_data]], #16\n"
+ "ld1 {v10.4s}, [%[input_r0]], #16\n"
+ "ld1 {v11.4s}, [%[input_r1]], #16\n"
+
+ "fmla v25.4s, v9.4s, v10.4s\n"
+ "fmla v27.4s, v9.4s, v11.4s\n"
+
+ "st1 {v25.4s}, [%[output_r0]], #16\n"
+ "st1 {v27.4s}, [%[output_r1]], #16\n"
+ "2:\n"
+ : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0),
+ [input_r1] "+r"(input_r1), [output_r0] "+r"(output_r0),
+ [output_r1] "+r"(output_r1), [_n] "+r"(_n)
+ : [oddn] "r"(oddn)
+ : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+ }
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v24.2s}, [%[output_r0]]\n"
+ "ld1 {v26.2s}, [%[output_r1]]\n"
+ "ld1 {v4.2s}, [%[weights_data]], #8\n"
+ "ld1 {v5.2s}, [%[input_r0]], #8\n"
+
+ "ld1 {v6.2s}, [%[input_r1]], #8\n"
+
+ "fmla v24.2s, v4.2s, v5.2s\n"
+ "fmla v26.2s, v4.2s, v6.2s\n"
+
+ "st1 {v24.2s}, [%[output_r0]], #8\n"
+ "st1 {v26.2s}, [%[output_r1]], #8\n"
+ : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0),
+ [input_r1] "+r"(input_r1), [output_r0] "+r"(output_r0),
+ [output_r1] "+r"(output_r1)
+ :
+ : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v24", "v26", "v28",
+ "v30");
+ remain -= 2;
+ }
+
+ if (remain > 0)
+ {
+ *output_r0++ += (*weights_data) * (*input_r0++);
+ *output_r1++ += (*weights_data++) * (*input_r1++);
+ }
+ ow += 2;
+ }
+
+ for (; ow < outw; ow++)
+ {
+ const float *input_data = input_data1 + (ow * stridew - padw) * channels;
+
+ if (padding && ((ow * stridew + kw < padw) || (ow * stridew + kw >= padw + w)))
+ {
+ continue;
+ }
+
+ int nn = channels >> 2;
+ int remain = channels & 0x03;
+
+ const float *weights_data = kernel_data;
+ float *output_data = out_mat.data + (oh * outw + ow) * channels;
+
+ if (nn > 0)
+ {
+ int _n = (nn + 1) >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("subs %[_n], %[_n], #1\n"
+ "ld1 {v4.4s}, [%[weights_data]], #16\n"
+ "ld1 {v5.4s}, [%[input_data]], #16\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v30.4s, v31.4s}, [%[output_data]]\n"
+ "ld1 {v6.4s}, [%[weights_data]], #16\n"
+ "ld1 {v7.4s}, [%[input_data]], #16\n"
+ "fmla v30.4s, v4.4s, v5.4s\n"
+
+ "ld1 {v4.4s}, [%[weights_data]], #16\n"
+ "ld1 {v5.4s}, [%[input_data]], #16\n"
+ "fmla v31.4s, v6.4s, v7.4s\n"
+
+ "st1 {v30.4s, v31.4s}, [%[output_data]], #32\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v30.4s}, [%[output_data]]\n"
+ "cmp %[oddn], #1\n"
+ "fmla v30.4s, v4.4s, v5.4s\n"
+ "st1 {v30.4s}, [%[output_data]], #16\n"
+ "beq 2f\n"
+ "ld1 {v31.4s}, [%[output_data]]\n"
+ "ld1 {v6.4s}, [%[weights_data]], #16\n"
+ "ld1 {v7.4s}, [%[input_data]], #16\n"
+ "fmla v31.4s, v6.4s, v7.4s\n"
+
+ "st1 {v31.4s}, [%[output_data]], #16\n"
+ "2:\n"
+ : [weights_data] "+r"(weights_data), [input_data] "+r"(input_data),
+ [output_data] "+r"(output_data), [_n] "+r"(_n)
+ : [oddn] "r"(oddn)
+ : "cc", "memory", "v4", "v5", "v30", "v31");
+ }
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v30.2s}, [%[output_data]]\n"
+ "ld1 {v4.2s}, [%[weights_data]], #8\n"
+ "ld1 {v5.2s}, [%[input_data]], #8\n"
+
+ "fmla v30.2s, v4.2s, v5.2s\n"
+
+ "st1 {v30.2s}, [%[output_data]], #8\n"
+ : [weights_data] "+r"(weights_data), [input_data] "+r"(input_data),
+ [output_data] "+r"(output_data)
+ :
+ : "cc", "memory", "v4", "v5", "v30");
+ remain -= 2;
+ }
+
+ if (remain > 0)
+ {
+ *output_data++ += (*weights_data++) * (*input_data++);
+ }
+ }
+ }
+ }
+ }
+#else // __aarch64__
+ (void)in_mat;
+ (void)out_mat;
+ (void)kernel;
+ (void)in_param;
+#endif // __aarch64__
+}
+
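+// Dispatcher: col_major input always goes through depthwise_conv_colmajor;
+// row_major input is only accelerated for 3x3 kernels with dilation 1 and
+// stride 1 or 2, picking the padding-specialized variant. Unsupported
+// row_major shapes fall through and leave out_mat untouched.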
+void srcn_depthwise_conv(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convMat_t &bias, const convParams_t &in_param, int num_threads,
+ convType_t conv_type)
+{
+ omp_set_num_threads(num_threads);
+
+ if (conv_type == col_major)
+ {
+ depthwise_conv_colmajor(in_mat, out_mat, weights_mat, in_param);
+ return;
+ }
+
+ else if (conv_type == row_major)
+ {
+ if (in_param.kernel_w == 3 && in_param.kernel_h == 3 && in_param.dilation_w == 1 &&
+ in_param.dilation_h == 1)
+ {
+ if (in_param.stride_w == 1 && in_param.stride_h == 1)
+ {
+ if (in_param.padding == 0)
+ depthwise_conv3x3S1_nopad(in_mat, out_mat, weights_mat, bias);
+ else
+ depthwise_conv3x3S1_padding(in_mat, out_mat, weights_mat, bias);
+ }
+ else if (in_param.stride_w == 2 && in_param.stride_h == 2)
+ {
+ if (in_param.padding == 0)
+ depthwise_conv3x3S2_nopad(in_mat, out_mat, weights_mat, bias);
+ else
+ {
+ if (in_param.pad_w == 0 && in_param.pad_h == 0)
+ depthwise_conv3x3S2_padding00(in_mat, out_mat, weights_mat, bias);
+ else if (in_param.pad_w == 0 && in_param.pad_h == 1)
+ depthwise_conv3x3S2_padding10(in_mat, out_mat, weights_mat, bias);
+ else if (in_param.pad_w == 1 && in_param.pad_h == 0)
+ depthwise_conv3x3S2_padding01(in_mat, out_mat, weights_mat, bias);
+ else if (in_param.pad_w == 1 && in_param.pad_h == 1)
+ depthwise_conv3x3S2_padding11(in_mat, out_mat, weights_mat, bias);
+ }
+ }
+ }
+ }
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/direct_conv_colmajor.cc b/compute/ncnn/src/srcn/direct_conv_colmajor.cc
new file mode 100644
index 000000000..300235222
--- /dev/null
+++ b/compute/ncnn/src/srcn/direct_conv_colmajor.cc
@@ -0,0 +1,5872 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <stdlib.h>
+#include <arm_neon.h>
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+#if __aarch64__
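+// Direct (no im2col) convolution for col-major tensors: the outer loop walks
+// the kernel taps m = kh * kernel_w + kw, clears the output on the first tap,
+// and accumulates kernel[m] against the correspondingly shifted input rows.
+// Output columns and input channels are blocked 4/2/1, and output channels
+// are processed in NEON quads with 2-wide and scalar remainders.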
+static void direct_conv_l(const convMat_t &bottom_blob, convMat_t &top_blob,
+ const convMat_t &_kernel, const int _stride, const int padding,
+ const int pad_top, const int pad_left)
+{
+ const int w = bottom_blob.w;
+ const int h = bottom_blob.h;
+ const int inch = bottom_blob.c;
+ const int outw = top_blob.w;
+ const int outh = top_blob.h;
+ const int outch = top_blob.c;
+ const int kernel_w = _kernel.w;
+ const int kernel_h = _kernel.h;
+
+ for (int m = 0; m < kernel_w * kernel_h; m++)
+ {
+ const float *_kernel0 = _kernel.data + m * inch * outch;
+ const float *img0 =
+ bottom_blob.data + (m / kernel_w - pad_top) * w * inch + (m % kernel_w - pad_left) * inch;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif // _OPENMP
+ for (int p = 0; p < outh; p++)
+ {
+ float *out0 = top_blob.data + p * outw * outch;
+
+ // clear output
+ if (m == 0)
+ {
+ for (int j = 0; j < outw * outch; j++)
+ {
+ *(out0 + j) = 0.f;
+ }
+ }
+
+ if (padding)
+ {
+ if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h))
+ {
+ continue;
+ }
+ }
+
+ const float *img1 = img0 + p * w * inch * _stride;
+
+ int q = 0;
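+ // Output columns are processed four at a time; groups fully inside the
+ // left/right border are stepped over (pointers advanced to match), a group
+ // that only starts in the border fast-forwards q to the first valid column,
+ // and the 2-wide and 1-wide loops below handle what is left.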
+ for (; q + 3 < outw; /*q += 4*/)
+ {
+ if (padding)
+ {
+ if (((q + 3) * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w) >= pad_left + w)
+ {
+ out0 += outch * 4;
+ img1 += inch * _stride * 4;
+ q += 4;
+ continue;
+ }
+ else if ((q + 3) * _stride + m % kernel_w >= pad_left + w)
+ {
+ break;
+ }
+ else if (q * _stride + m % kernel_w < pad_left)
+ {
+ int delta = (pad_left - m % kernel_w) / _stride - q;
+ delta += (pad_left - m % kernel_w) % _stride ? 1 : 0;
+ out0 += outch * delta;
+ img1 += inch * _stride * delta;
+ q += delta;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *_x1 = img1 + inch * _stride;
+ const float *_x2 = img1 + inch * _stride * 2;
+ const float *_x3 = img1 + inch * _stride * 3;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
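+ // Pin the input vectors of the four output columns to fixed NEON registers
+ // (v4, v5, v16, v17) so the asm below can address them as %[rxN].s[lane]
+ // without the compiler allocating a register the block clobbers.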
+ register float32x4_t rx0 asm("v4") = vld1q_f32(_x0);
+ register float32x4_t rx1 asm("v5") = vld1q_f32(_x1);
+ register float32x4_t rx2 asm("v16") = vld1q_f32(_x2);
+ register float32x4_t rx3 asm("v17") = vld1q_f32(_x3);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+ float *outptr2 = out0 + outch * 2;
+ float *outptr3 = out0 + outch * 3;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v8.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v8.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v9.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v9.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v12.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v12.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v13.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v13.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v8.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v8.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v9.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v9.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v12.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v12.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v13.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v13.4s, %[rx3].s[3]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v8.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v8.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v9.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v9.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n), [outptr2] "+r"(outptr2),
+ [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn),
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15", "v30", "v31");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+ "ld1 {v30.2s}, [%[outptr2]]\n"
+ "ld1 {v31.2s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v30.2s, v6.2s, %[rx2].s[0]\n"
+ "fmla v31.2s, v6.2s, %[rx3].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+ "fmla v30.2s, v7.2s, %[rx2].s[1]\n"
+ "fmla v31.2s, v7.2s, %[rx3].s[1]\n"
+ "fmla v14.2s, v8.2s, %[rx0].s[2]\n"
+ "fmla v15.2s, v8.2s, %[rx1].s[2]\n"
+ "fmla v30.2s, v8.2s, %[rx2].s[2]\n"
+ "fmla v31.2s, v8.2s, %[rx3].s[2]\n"
+ "fmla v14.2s, v9.2s, %[rx0].s[3]\n"
+ "fmla v15.2s, v9.2s, %[rx1].s[3]\n"
+ "fmla v30.2s, v9.2s, %[rx2].s[3]\n"
+ "fmla v31.2s, v9.2s, %[rx3].s[3]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+ "st1 {v30.2s}, [%[outptr2]], #8\n"
+ "st1 {v31.2s}, [%[outptr3]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1),
+
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15", "v30",
+ "v31");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x1 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x1 + 3));
+
+ *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x2 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x2 + 3));
+
+ *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x3 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x3 + 3));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ outptr2++;
+ outptr3++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ _x1 += 4;
+ _x2 += 4;
+ _x3 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_f32(_x1);
+ register float32x2_t rx2 asm("v16") = vld1_f32(_x2);
+ register float32x2_t rx3 asm("v17") = vld1_f32(_x3);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+ float *outptr2 = out0 + outch * 2;
+ float *outptr3 = out0 + outch * 3;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile(
+ "cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1),
+ [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn),
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15", "v30", "v31");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+ "ld1 {v30.2s}, [%[outptr2]]\n"
+ "ld1 {v31.2s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v30.2s, v6.2s, %[rx2].s[0]\n"
+ "fmla v31.2s, v6.2s, %[rx3].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+ "fmla v30.2s, v7.2s, %[rx2].s[1]\n"
+ "fmla v31.2s, v7.2s, %[rx3].s[1]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+ "st1 {v30.2s}, [%[outptr2]], #8\n"
+ "st1 {v31.2s}, [%[outptr3]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1),
+
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v14", "v15", "v30", "v31");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1));
+ *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1));
+ *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ outptr2++;
+ outptr3++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ _x1 += 2;
+ _x2 += 2;
+ _x3 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1);
+ register float32x2_t rx2 asm("v16") = vld1_dup_f32(_x2);
+ register float32x2_t rx3 asm("v17") = vld1_dup_f32(_x3);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+ float *outptr2 = out0 + outch * 2;
+ float *outptr3 = out0 + outch * 3;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile(
+ "cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1),
+ [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v10", "v14", "v15", "v30", "v31");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+ "ld1 {v30.2s}, [%[outptr2]]\n"
+ "ld1 {v31.2s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v30.2s, v6.2s, %[rx2].s[0]\n"
+ "fmla v31.2s, v6.2s, %[rx3].s[0]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+ "st1 {v30.2s}, [%[outptr2]], #8\n"
+ "st1 {v31.2s}, [%[outptr3]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1),
+
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v14", "v15", "v30", "v31");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+ *outptr1 += (*kernel0) * (*_x1);
+ *outptr2 += (*kernel0) * (*_x2);
+ *outptr3 += (*kernel0) * (*_x3);
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ outptr2++;
+ outptr3++;
+ }
+
+ _x0 += 1;
+ _x1 += 1;
+ _x2 += 1;
+ _x3 += 1;
+ }
+
+ img1 += inch * 4 * _stride;
+ out0 += outch * 4;
+ q += 4;
+ }
+
+ for (; q + 1 < outw; /*q += 2*/)
+ {
+ if (padding)
+ {
+ if (((q + 1) * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w) >= pad_left + w)
+ {
+ out0 += outch * 2;
+ img1 += inch * _stride * 2;
+ q += 2;
+ continue;
+ }
+ else if ((q + 1) * _stride + m % kernel_w >= pad_left + w)
+ {
+ break;
+ }
+ else if (q * _stride + m % kernel_w < pad_left)
+ {
+ out0 += outch;
+ img1 += inch * _stride;
+ q++;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *_x1 = img1 + inch * _stride;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("v4") = vld1q_f32(_x0);
+ register float32x4_t rx1 asm("v5") = vld1q_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+ "fmla v14.2s, v8.2s, %[rx0].s[2]\n"
+ "fmla v15.2s, v8.2s, %[rx1].s[2]\n"
+ "fmla v14.2s, v9.2s, %[rx0].s[3]\n"
+ "fmla v15.2s, v9.2s, %[rx1].s[3]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x1 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x1 + 3));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ _x1 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+ : "cc", "memory", "x0", "v6", "v7", "v14", "v15");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ _x1 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v10", "v14", "v15");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1)
+ : "cc", "memory", "x0", "v6", "v14", "v15");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+ *outptr1 += (*kernel0) * (*_x1);
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ _x0 += 1;
+ _x1 += 1;
+ }
+
+ img1 += inch * 2 * _stride;
+ out0 += outch * 2;
+ q += 2;
+ }
+
+ for (; q < outw; q++)
+ {
+ if (padding)
+ {
+ if ((q * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w >= pad_left + w))
+ {
+ img1 += inch * _stride;
+ out0 += outch;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("v4") = vld1q_f32(_x0);
+
+ float *outptr0 = out0;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v14.2s, v8.2s, %[rx0].s[2]\n"
+ "fmla v14.2s, v9.2s, %[rx0].s[3]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ }
+
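+      // Two input channels per step: rx0 holds both values (vld1_f32) and the
+      // matching kernel rows go through v6/v7 (v10/v11 in the pipelined copy).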
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_f32(_x0);
+
+ float *outptr0 = out0;
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+ : "cc", "memory", "x0", "v6", "v7", "v14");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ }
+
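+      // Remaining input channel(s), one at a time: the value is broadcast with
+      // vld1_dup_f32 and multiplied into a single kernel row per group.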
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0);
+
+ float *outptr0 = out0;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v10", "v14");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [rx0] "w"(rx0)
+ : "cc", "memory", "x0", "v6", "v14");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+
+ kernel0++;
+ outptr0++;
+ }
+
+ _x0 += 1;
+ }
+
+ img1 += inch * _stride;
+ out0 += outch;
+ }
+ }
+ }
+}
+
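+// Direct convolution variant with the output-row loop outermost (and OpenMP-parallel):
+// each row is cleared once, then every kernel tap m accumulates into it. Columns are
+// processed 4/2/1 at a time and, for each column group, input channels are consumed in
+// blocks of 4/2/1 with NEON fmla-by-lane inner kernels.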
+static void direct_conv_s(const convMat_t &bottom_blob, convMat_t &top_blob,
+ const convMat_t &_kernel, const int _stride, const int padding,
+ const int pad_top, const int pad_left)
+{
+ const int w = bottom_blob.w;
+ const int h = bottom_blob.h;
+ const int inch = bottom_blob.c;
+ const int outw = top_blob.w;
+ const int outh = top_blob.h;
+ const int outch = top_blob.c;
+ const int kernel_w = _kernel.w;
+ const int kernel_h = _kernel.h;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
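+  // One output row per iteration: clear it, then accumulate every kernel tap m.
+  // Taps whose sampled input row falls into the top/bottom padding are skipped below.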
+ for (int p = 0; p < outh; p++)
+ {
+ const float *img0 = bottom_blob.data + (p * _stride - pad_top) * w * inch;
+ float *out = top_blob.data + p * outw * outch;
+
+ // clear output
+ for (int j = 0; j < outw * outch; j++)
+ {
+ *(out + j) = 0.f;
+ }
+
+ for (int m = 0; m < kernel_w * kernel_h; m++)
+ {
+ if (padding)
+ {
+ if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h))
+ {
+ continue;
+ }
+ }
+
+ float *out0 = out;
+ const float *_kernel0 = _kernel.data + m * inch * outch;
+ const float *img1 = img0 + (m / kernel_w) * w * inch + (m % kernel_w - pad_left) * inch;
+
+ int q = 0;
+ for (; q + 3 < outw; /*q += 4*/)
+ {
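+        // Padding handling for the 4-column block: skip all four columns when they all
+        // sample padded input, fall back to the narrower column loops once the block
+        // crosses the right edge, or advance by `delta` columns to step over the left
+        // padding.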
+ if (padding)
+ {
+ if (((q + 3) * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w) >= pad_left + w)
+ {
+ out0 += outch * 4;
+ img1 += inch * _stride * 4;
+ q += 4;
+ continue;
+ }
+ else if ((q + 3) * _stride + m % kernel_w >= pad_left + w)
+ {
+ break;
+ }
+ else if (q * _stride + m % kernel_w < pad_left)
+ {
+ int delta = (pad_left - m % kernel_w) / _stride - q;
+ delta += (pad_left - m % kernel_w) % _stride ? 1 : 0;
+ out0 += outch * delta;
+ img1 += inch * _stride * delta;
+ q += delta;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *_x1 = img1 + inch * _stride;
+ const float *_x2 = img1 + inch * _stride * 2;
+ const float *_x3 = img1 + inch * _stride * 3;
+ const float *kernel0 = _kernel0;
+
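+        // Input channels are consumed 4 at a time here (then 2, then 1 below).
+        // rx0..rx3 each hold four consecutive input-channel values, one register per
+        // output column; the kernel block of tap m is laid out [inch][outch], so rows
+        // for consecutive input channels are `stride` = outch * 4 bytes apart.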
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("v4") = vld1q_f32(_x0);
+ register float32x4_t rx1 asm("v5") = vld1q_f32(_x1);
+ register float32x4_t rx2 asm("v16") = vld1q_f32(_x2);
+ register float32x4_t rx3 asm("v17") = vld1q_f32(_x3);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+ float *outptr2 = out0 + outch * 2;
+ float *outptr3 = out0 + outch * 3;
+
+ int stride = outch << 2;
+
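+          // Software-pipelined loop over groups of 4 output channels: _n counts pairs
+          // of groups and oddn flags a leftover one. Label 0 is the steady state
+          // (kernel rows alternate between v6-v9 and v10-v13 so the next loads can
+          // overlap the fmla chain), label 1 drains the last pair, label 2 handles the
+          // odd group, label 3 is the exit.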
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v8.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v8.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v9.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v9.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v12.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v12.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v13.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v13.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v8.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v8.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v9.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v9.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v12.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v12.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v13.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v13.4s, %[rx3].s[3]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v30.4s, v8.4s, %[rx2].s[2]\n"
+ "fmla v31.4s, v8.4s, %[rx3].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+ "fmla v30.4s, v9.4s, %[rx2].s[3]\n"
+ "fmla v31.4s, v9.4s, %[rx3].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n), [outptr2] "+r"(outptr2),
+ [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn),
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15", "v30", "v31");
+ }
+
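+          // Output-channel tail: two channels with 2-lane NEON here, then a final
+          // scalar channel in plain C++ below.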
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+ "ld1 {v30.2s}, [%[outptr2]]\n"
+ "ld1 {v31.2s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v30.2s, v6.2s, %[rx2].s[0]\n"
+ "fmla v31.2s, v6.2s, %[rx3].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+ "fmla v30.2s, v7.2s, %[rx2].s[1]\n"
+ "fmla v31.2s, v7.2s, %[rx3].s[1]\n"
+ "fmla v14.2s, v8.2s, %[rx0].s[2]\n"
+ "fmla v15.2s, v8.2s, %[rx1].s[2]\n"
+ "fmla v30.2s, v8.2s, %[rx2].s[2]\n"
+ "fmla v31.2s, v8.2s, %[rx3].s[2]\n"
+ "fmla v14.2s, v9.2s, %[rx0].s[3]\n"
+ "fmla v15.2s, v9.2s, %[rx1].s[3]\n"
+ "fmla v30.2s, v9.2s, %[rx2].s[3]\n"
+ "fmla v31.2s, v9.2s, %[rx3].s[3]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+ "st1 {v30.2s}, [%[outptr2]], #8\n"
+ "st1 {v31.2s}, [%[outptr3]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1),
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15", "v30",
+ "v31");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x1 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x1 + 3));
+
+ *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x2 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x2 + 3));
+
+ *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x3 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x3 + 3));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ outptr2++;
+ outptr3++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ _x1 += 4;
+ _x2 += 4;
+ _x3 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_f32(_x1);
+ register float32x2_t rx2 asm("v16") = vld1_f32(_x2);
+ register float32x2_t rx3 asm("v17") = vld1_f32(_x3);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+ float *outptr2 = out0 + outch * 2;
+ float *outptr3 = out0 + outch * 3;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile(
+ "cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v11.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v11.4s, %[rx3].s[1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v30.4s, v7.4s, %[rx2].s[1]\n"
+ "fmla v31.4s, v7.4s, %[rx3].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1),
+ [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn),
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15", "v30", "v31");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+ "ld1 {v30.2s}, [%[outptr2]]\n"
+ "ld1 {v31.2s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v30.2s, v6.2s, %[rx2].s[0]\n"
+ "fmla v31.2s, v6.2s, %[rx3].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+ "fmla v30.2s, v7.2s, %[rx2].s[1]\n"
+ "fmla v31.2s, v7.2s, %[rx3].s[1]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+ "st1 {v30.2s}, [%[outptr2]], #8\n"
+ "st1 {v31.2s}, [%[outptr3]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1),
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v7", "v14", "v15", "v30", "v31");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1));
+ *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1));
+ *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ outptr2++;
+ outptr3++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ _x1 += 2;
+ _x2 += 2;
+ _x3 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1);
+ register float32x2_t rx2 asm("v16") = vld1_dup_f32(_x2);
+ register float32x2_t rx3 asm("v17") = vld1_dup_f32(_x3);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+ float *outptr2 = out0 + outch * 2;
+ float *outptr3 = out0 + outch * 3;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile(
+ "cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v10.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v10.4s, %[rx3].s[0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+ "ld1 {v30.4s}, [%[outptr2]]\n"
+ "ld1 {v31.4s}, [%[outptr3]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v30.4s, v6.4s, %[rx2].s[0]\n"
+ "fmla v31.4s, v6.4s, %[rx3].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "st1 {v30.4s}, [%[outptr2]], #16\n"
+ "st1 {v31.4s}, [%[outptr3]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1),
+ [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v10", "v14", "v15", "v30", "v31");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+ "ld1 {v30.2s}, [%[outptr2]]\n"
+ "ld1 {v31.2s}, [%[outptr3]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v30.2s, v6.2s, %[rx2].s[0]\n"
+ "fmla v31.2s, v6.2s, %[rx3].s[0]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+ "st1 {v30.2s}, [%[outptr2]], #8\n"
+ "st1 {v31.2s}, [%[outptr3]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1),
+ [rx2] "w"(rx2), [rx3] "w"(rx3)
+ : "cc", "memory", "x0", "v6", "v14", "v15", "v30", "v31");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+ *outptr1 += (*kernel0) * (*_x1);
+ *outptr2 += (*kernel0) * (*_x2);
+ *outptr3 += (*kernel0) * (*_x3);
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ outptr2++;
+ outptr3++;
+ }
+
+ _x0 += 1;
+ _x1 += 1;
+ _x2 += 1;
+ _x3 += 1;
+ }
+
+ img1 += inch * 4 * _stride;
+ out0 += outch * 4;
+ q += 4;
+ }
+
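+      // Columns that did not fit the 4-wide block: process pairs here and single
+      // columns below, with the same per-channel structure as above.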
+ for (; q + 1 < outw; /*q += 2*/)
+ {
+ if (padding)
+ {
+ if (((q + 1) * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w) >= pad_left + w)
+ {
+ out0 += outch * 2;
+ img1 += inch * _stride * 2;
+ q += 2;
+ continue;
+ }
+ else if ((q + 1) * _stride + m % kernel_w >= pad_left + w)
+ {
+ break;
+ }
+ else if (q * _stride + m % kernel_w < pad_left)
+ {
+ out0 += outch;
+ img1 += inch * _stride;
+ q++;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *_x1 = img1 + inch * _stride;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("v4") = vld1q_f32(_x0);
+ register float32x4_t rx1 asm("v5") = vld1q_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v12.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v13.4s, %[rx1].s[3]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v15.4s, v8.4s, %[rx1].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+ "fmla v15.4s, v9.4s, %[rx1].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+ "fmla v14.2s, v8.2s, %[rx0].s[2]\n"
+ "fmla v15.2s, v8.2s, %[rx1].s[2]\n"
+ "fmla v14.2s, v9.2s, %[rx0].s[3]\n"
+ "fmla v15.2s, v9.2s, %[rx1].s[3]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x1 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x1 + 3));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ _x1 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v11.4s, %[rx1].s[1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v15.4s, v7.4s, %[rx1].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v15.2s, v7.2s, %[rx1].s[1]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+ : "cc", "memory", "x0", "v6", "v7", "v14", "v15");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ _x1 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0);
+ register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v10.4s, %[rx1].s[0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+ "ld1 {v15.4s}, [%[outptr1]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v15.4s, v6.4s, %[rx1].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "st1 {v15.4s}, [%[outptr1]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v10", "v14", "v15");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+ "ld1 {v15.2s}, [%[outptr1]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v15.2s, v6.2s, %[rx1].s[0]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ "st1 {v15.2s}, [%[outptr1]], #8\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1)
+ : "cc", "memory", "x0", "v6", "v14", "v15");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+ *outptr1 += (*kernel0) * (*_x1);
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ _x0 += 1;
+ _x1 += 1;
+ }
+
+ img1 += inch * 2 * _stride;
+ out0 += outch * 2;
+ q += 2;
+ }
+
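+      // Single-column tail: only outptr0 is live; otherwise identical blocking.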
+ for (; q < outw; q++)
+ {
+ if (padding)
+ {
+ if ((q * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w >= pad_left + w))
+ {
+ img1 += inch * _stride;
+ out0 += outch;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("v4") = vld1q_f32(_x0);
+
+ float *outptr0 = out0;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v12.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v13.4s, %[rx0].s[3]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+ "fmla v14.4s, v8.4s, %[rx0].s[2]\n"
+ "fmla v14.4s, v9.4s, %[rx0].s[3]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v8.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v9.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+ "fmla v14.2s, v8.2s, %[rx0].s[2]\n"
+ "fmla v14.2s, v9.2s, %[rx0].s[3]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+ : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_f32(_x0);
+
+ float *outptr0 = out0;
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v11.4s, %[rx0].s[1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+ "fmla v14.4s, v7.4s, %[rx0].s[1]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+ "add x0, x0, %[stride]\n"
+ "ld1 {v7.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+ "fmla v14.2s, v7.2s, %[rx0].s[1]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+ : "cc", "memory", "x0", "v6", "v7", "v14");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0);
+
+ float *outptr0 = out0;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "beq 1f\n"
+
+ "0:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v10.4s}, [x0]\n"
+
+ "fmla v14.4s, v10.4s, %[rx0].s[0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "ld1 {v6.4s}, [x0]\n"
+
+ "ld1 {v14.4s}, [%[outptr0]]\n"
+
+ "fmla v14.4s, v6.4s, %[rx0].s[0]\n"
+
+ "st1 {v14.4s}, [%[outptr0]], #16\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [oddn] "r"(oddn)
+ : "cc", "memory", "x0", "v6", "v10", "v14");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("ld1 {v14.2s}, [%[outptr0]]\n"
+
+ "mov x0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "ld1 {v6.2s}, [x0]\n"
+
+ "fmla v14.2s, v6.2s, %[rx0].s[0]\n"
+
+ "st1 {v14.2s}, [%[outptr0]], #8\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [rx0] "w"(rx0)
+ : "cc", "memory", "x0", "v6", "v14");
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+
+ kernel0++;
+ outptr0++;
+ }
+
+ _x0 += 1;
+ }
+
+ img1 += inch * _stride;
+ out0 += outch;
+ }
+ }
+ }
+}
+
+#else // __aarch64__
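+// 32-bit ARM build: the same direct convolution written with ARMv7 NEON (q/d registers,
+// vmla.f32 by scalar). Here the kernel-tap loop m is outermost, the output rows are
+// OpenMP-parallel inside it, and the output is cleared only on the first tap (m == 0).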
+static void direct_conv_l(const convMat_t &bottom_blob, convMat_t &top_blob,
+ const convMat_t &_kernel, const int _stride, const int padding,
+ const int pad_top, const int pad_left)
+{
+ const int w = bottom_blob.w;
+ const int h = bottom_blob.h;
+ const int inch = bottom_blob.c;
+ const int outw = top_blob.w;
+ const int outh = top_blob.h;
+ const int outch = top_blob.c;
+ const int kernel_w = _kernel.w;
+ const int kernel_h = _kernel.h;
+
+ for (int m = 0; m < kernel_w * kernel_h; m++)
+ {
+ const float *_kernel0 = _kernel.data + m * inch * outch;
+ const float *img0 =
+ bottom_blob.data + (m / kernel_w - pad_top) * w * inch + (m % kernel_w - pad_left) * inch;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif // _OPENMP
+ for (int p = 0; p < outh; p++)
+ {
+ float *out0 = top_blob.data + p * outw * outch;
+ // clear output.
+ if (m == 0)
+ {
+ for (int j = 0; j < outw * outch; j++)
+ {
+ *(out0 + j) = 0.f;
+ }
+ }
+
+ if (padding)
+ {
+ if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h))
+ {
+ continue;
+ }
+ }
+
+ const float *img1 = img0 + p * w * inch * _stride;
+
+ int q = 0;
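+      // Column pairs with the same padding skips as the AArch64 path; only the order of
+      // the left-padding and right-edge checks differs.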
+ for (; q + 1 < outw; /*q += 2*/)
+ {
+ if (padding)
+ {
+ if (((q + 1) * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w) >= pad_left + w)
+ {
+ out0 += outch * 2;
+ img1 += inch * _stride * 2;
+ q += 2;
+ continue;
+ }
+ else if (q * _stride + m % kernel_w < pad_left)
+ {
+ out0 += outch;
+ img1 += inch * _stride;
+ q++;
+ continue;
+ }
+ else if ((q + 1) * _stride + m % kernel_w >= pad_left + w)
+ {
+ break;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *_x1 = img1 + inch * _stride;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("q4") = vld1q_f32(_x0);
+ register float32x4_t rx1 asm("q5") = vld1q_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ int stride = outch << 2;
+
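+          // ARM32 pipelined inner loop: kernel rows alternate between q6-q9 and
+          // q10-q13, and input lanes are selected with the %e/%f operand modifiers
+          // (low/high d half of the q register holding rx0/rx1).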
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q15, q6, %e[rx1][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q15, q7, %e[rx1][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q15, q8, %f[rx1][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+ "vmla.f32 q15, q9, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q15, q10, %e[rx1][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q15, q11, %e[rx1][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q15, q12, %f[rx1][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+ "vmla.f32 q15, q13, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q15, q6, %e[rx1][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q15, q7, %e[rx1][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q15, q8, %f[rx1][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+ "vmla.f32 q15, q9, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q15, q10, %e[rx1][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q15, q11, %e[rx1][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q15, q12, %f[rx1][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+ "vmla.f32 q15, q13, %f[rx1][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q15, q6, %e[rx1][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q15, q7, %e[rx1][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q15, q8, %f[rx1][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+ "vmla.f32 q15, q9, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15");
+ }
+
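+    // Leftover output channels: pairs use 64-bit (two-float) NEON loads; a final odd channel falls back to scalar C below.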
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+ "vld1.f32 {d30}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18}, [r0]\n"
+
+ "vmla.f32 d28, d12, %e[rx0][0]\n"
+ "vmla.f32 d30, d12, %e[rx1][0]\n"
+ "vmla.f32 d28, d14, %e[rx0][1]\n"
+ "vmla.f32 d30, d14, %e[rx1][1]\n"
+ "vmla.f32 d28, d16, %f[rx0][0]\n"
+ "vmla.f32 d30, d16, %f[rx1][0]\n"
+ "vmla.f32 d28, d18, %f[rx0][1]\n"
+ "vmla.f32 d30, d18, %f[rx1][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ "vst1.f32 {d30}, [%[outptr1]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x1 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x1 + 3));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ _x1 += 4;
+ }
+
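+    // Remaining input channels: handled in pairs, then a final single channel.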
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_f32(_x0);
+ register float32x2_t rx1 asm("d10") = vld1_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+ "vmla.f32 q15, q7, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+ "vmla.f32 q15, q11, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+ "vmla.f32 q15, q7, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+ "vmla.f32 q15, q11, %P[rx1][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+ "vmla.f32 q15, q7, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+ "vld1.f32 {d30}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+ "vmla.f32 d30, d12, %P[rx1][0]\n"
+ "vmla.f32 d28, d14, %P[rx0][1]\n"
+ "vmla.f32 d30, d14, %P[rx1][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ "vst1.f32 {d30}, [%[outptr1]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ _x1 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0);
+ register float32x2_t rx1 asm("d10") = vld1_dup_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q10", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+ "vld1.f32 {d30}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+ "vmla.f32 d30, d12, %P[rx1][0]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ "vst1.f32 {d30}, [%[outptr1]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+ *outptr1 += (*kernel0) * (*_x1);
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ _x0 += 1;
+ _x1 += 1;
+ }
+
+ img1 += inch * 2 * _stride;
+ out0 += outch * 2;
+ q += 2;
+ }
+
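+    // Single-column tail: output columns the pair loop above did not cover.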
+ for (; q < outw; q++)
+ {
+ if (padding)
+ {
+ if ((q * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w) >= pad_left + bottom_blob.w)
+ {
+ img1 += inch * _stride;
+ out0 += outch;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("q4") = vld1q_f32(_x0);
+
+ float *outptr0 = out0;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18}, [r0]\n"
+
+ "vmla.f32 d28, d12, %e[rx0][0]\n"
+ "vmla.f32 d28, d14, %e[rx0][1]\n"
+ "vmla.f32 d28, d16, %f[rx0][0]\n"
+ "vmla.f32 d28, d18, %f[rx0][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_f32(_x0);
+
+ float *outptr0 = out0;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+ "vmla.f32 d28, d14, %P[rx0][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0);
+
+ float *outptr0 = out0;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q10", "q14"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [rx0] "w"(rx0)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+
+ kernel0++;
+ outptr0++;
+ }
+
+ _x0 += 1;
+ }
+
+ img1 += inch * _stride;
+ out0 += outch;
+ }
+ }
+ }
+}
+
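+// Direct convolution over column-major data for smaller problems (selected by the
+// dispatcher in direct_conv_colmajor below). Each kernel tap accumulates its
+// contribution into the current output row, two output columns at a time.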
+static void direct_conv_s(const convMat_t &bottom_blob, convMat_t &top_blob,
+ const convMat_t &_kernel, const int _stride, const int padding,
+ const int pad_top, const int pad_left)
+{
+ const int w = bottom_blob.w;
+ const int h = bottom_blob.h;
+ const int inch = bottom_blob.c;
+ const int outw = top_blob.w;
+ const int outh = top_blob.h;
+ const int outch = top_blob.c;
+ const int kernel_w = _kernel.w;
+ const int kernel_h = _kernel.h;
+
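+  // Output rows are independent, so the row loop is parallelized when OpenMP is enabled.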
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif // _OPENMP
+ for (int p = 0; p < outh; p++)
+ {
+ const float *img0 = bottom_blob.data + (p * _stride - pad_top) * w * inch;
+ float *out = top_blob.data + p * outw * outch;
+
+ // clear output.
+ for (int j = 0; j < outw * outch; j++)
+ {
+ *(out + j) = 0.f;
+ }
+
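+    // Walk the kernel taps: m / kernel_w is the tap row, m % kernel_w the tap column.
+    // Taps that would read from the vertical padding are skipped.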
+ for (int m = 0; m < kernel_w * kernel_h; m++)
+ {
+ if (padding)
+ {
+ if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h))
+ {
+ continue;
+ }
+ }
+
+ float *out0 = out;
+ const float *_kernel0 = _kernel.data + m * inch * outch;
+ const float *img1 = img0 + (m / kernel_w) * w * inch + (m % kernel_w - pad_left) * inch;
+
+ int q = 0;
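+      // Two output columns per iteration; columns falling in the horizontal padding
+      // are skipped, and the loop stops early at the right edge.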
+ for (; q + 1 < outw; /*q += 2*/)
+ {
+ if (padding)
+ {
+ if (((q + 1) * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w >= pad_left + w))
+ {
+ out0 += outch * 2;
+ img1 += inch * _stride * 2;
+ q += 2;
+ continue;
+ }
+ else if (q * _stride + m % kernel_w < pad_left)
+ {
+ out0 += outch;
+ img1 += inch * _stride;
+ q++;
+ continue;
+ }
+ else if ((q + 1) * _stride + m % kernel_w >= pad_left + w)
+ {
+ break;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *_x1 = img1 + inch * _stride;
+
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("q4") = vld1q_f32(_x0);
+ register float32x4_t rx1 asm("q5") = vld1q_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
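+          // Kernel rows of consecutive input channels are outch floats apart; the asm steps by this byte stride.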
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
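+            // Eight output channels per asm iteration (two groups of four); oddn flags a trailing group.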
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q15, q6, %e[rx1][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q15, q7, %e[rx1][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q15, q8, %f[rx1][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+ "vmla.f32 q15, q9, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q15, q10, %e[rx1][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q15, q11, %e[rx1][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q15, q12, %f[rx1][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+ "vmla.f32 q15, q13, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q15, q6, %e[rx1][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q15, q7, %e[rx1][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q15, q8, %f[rx1][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+ "vmla.f32 q15, q9, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q15, q10, %e[rx1][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q15, q11, %e[rx1][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q15, q12, %f[rx1][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+ "vmla.f32 q15, q13, %f[rx1][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q15, q6, %e[rx1][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q15, q7, %e[rx1][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q15, q8, %f[rx1][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+ "vmla.f32 q15, q9, %f[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15");
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+ "vld1.f32 {d30}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18}, [r0]\n"
+
+ "vmla.f32 d28, d12, %e[rx0][0]\n"
+ "vmla.f32 d30, d12, %e[rx1][0]\n"
+ "vmla.f32 d28, d14, %e[rx0][1]\n"
+ "vmla.f32 d30, d14, %e[rx1][1]\n"
+ "vmla.f32 d28, d16, %f[rx0][0]\n"
+ "vmla.f32 d30, d16, %f[rx1][0]\n"
+ "vmla.f32 d28, d18, %f[rx0][1]\n"
+ "vmla.f32 d30, d18, %f[rx1][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ "vst1.f32 {d30}, [%[outptr1]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ remain -= 2;
+ }
+
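+          // A final odd output channel is accumulated in plain C.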
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x1 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x1 + 3));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ _x1 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_f32(_x0);
+ register float32x2_t rx1 asm("d10") = vld1_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+ "vmla.f32 q15, q7, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+ "vmla.f32 q15, q11, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+ "vmla.f32 q15, q7, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+ "vmla.f32 q15, q11, %P[rx1][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+ "vmla.f32 q15, q7, %P[rx1][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+ "vld1.f32 {d30}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+ "vmla.f32 d30, d12, %P[rx1][0]\n"
+ "vmla.f32 d28, d14, %P[rx0][1]\n"
+ "vmla.f32 d30, d14, %P[rx1][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ "vst1.f32 {d30}, [%[outptr1]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+ *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1));
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ _x1 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0);
+ register float32x2_t rx1 asm("d10") = vld1_dup_f32(_x1);
+
+ float *outptr0 = out0;
+ float *outptr1 = out0 + outch;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q15, q10, %P[rx1][0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+ "vld1.f32 {d30-d31}, [%[outptr1]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q15, q6, %P[rx1][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vst1.f32 {d30-d31}, [%[outptr1]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q10", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+ "vld1.f32 {d30}, [%[outptr1]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+ "vmla.f32 d30, d12, %P[rx1][0]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ "vst1.f32 {d30}, [%[outptr1]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1)
+ : [rx0] "w"(rx0), [rx1] "w"(rx1)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+ *outptr1 += (*kernel0) * (*_x1);
+
+ kernel0++;
+ outptr0++;
+ outptr1++;
+ }
+
+ _x0 += 1;
+ _x1 += 1;
+ }
+
+ img1 += inch * 2 * _stride;
+ out0 += outch * 2;
+ q += 2;
+ }
+
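+      // Single-column tail for the columns the pair loop above did not cover.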
+ for (; q < outw; q++)
+ {
+ if (padding)
+ {
+ if ((q * _stride + m % kernel_w < pad_left) ||
+ (q * _stride + m % kernel_w >= pad_left + w))
+ {
+ img1 += inch * _stride;
+ out0 += outch;
+ continue;
+ }
+ }
+
+ const float *_x0 = img1;
+ const float *kernel0 = _kernel0;
+
+ int i = 0;
+ for (; i + 3 < inch; i += 4)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x4_t rx0 asm("q4") = vld1q_f32(_x0);
+
+ float *outptr0 = out0;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+
+ "vmla.f32 q14, q10, %e[rx0][0]\n"
+ "vmla.f32 q14, q11, %e[rx0][1]\n"
+ "vmla.f32 q14, q12, %f[rx0][0]\n"
+ "vmla.f32 q14, q13, %f[rx0][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %e[rx0][0]\n"
+ "vmla.f32 q14, q7, %e[rx0][1]\n"
+ "vmla.f32 q14, q8, %f[rx0][0]\n"
+ "vmla.f32 q14, q9, %f[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d16}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d18}, [r0]\n"
+
+ "vmla.f32 d28, d12, %e[rx0][0]\n"
+ "vmla.f32 d28, d14, %e[rx0][1]\n"
+ "vmla.f32 d28, d16, %f[rx0][0]\n"
+ "vmla.f32 d28, d18, %f[rx0][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) +
+ (*(kernel0 + outch * 2)) * (*(_x0 + 2)) +
+ (*(kernel0 + outch * 3)) * (*(_x0 + 3));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch * 3;
+ _x0 += 4;
+ }
+
+ for (; i + 1 < inch; i += 2)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_f32(_x0);
+
+ float *outptr0 = out0;
+
+ int stride = outch << 2;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+ "vmla.f32 q14, q11, %P[rx0][1]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+ "vmla.f32 q14, q7, %P[rx0][1]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+ "add r0, r0, %[stride]\n"
+ "vld1.f32 {d14}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+ "vmla.f32 d28, d14, %P[rx0][1]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [stride] "r"(stride), [rx0] "w"(rx0)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1));
+
+ kernel0++;
+ outptr0++;
+ }
+
+ kernel0 += outch;
+ _x0 += 2;
+ }
+
+ for (; i < inch; i++)
+ {
+ int nn = outch >> 2;
+ int remain = outch & 0x03;
+
+ register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0);
+
+ float *outptr0 = out0;
+
+ if (nn > 0)
+ {
+ int _n = nn >> 1;
+ int oddn = nn & 1;
+
+ asm volatile("cmp %[_n], #0\n"
+ "beq 2f\n"
+ "subs %[_n], %[_n], #1\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "beq 1f\n"
+
+ "0:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "subs %[_n], %[_n], #1\n"
+ "bne 0b\n"
+
+ "1:\n"
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+
+ "vmla.f32 q14, q10, %P[rx0][0]\n"
+
+ "cmp %[oddn], #1\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+
+ "bne 3f\n"
+
+ "2:\n"
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #16\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+
+ "vld1.f32 {d28-d29}, [%[outptr0]]\n"
+
+ "vmla.f32 q14, q6, %P[rx0][0]\n"
+
+ "vst1.f32 {d28-d29}, [%[outptr0]]!\n"
+ "3:\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n)
+ : [rx0] "w"(rx0), [oddn] "r"(oddn)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q10", "q14"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ }
+
+ if (remain >= 2)
+ {
+ asm volatile("vld1.f32 {d28}, [%[outptr0]]\n"
+
+ "mov r0, %[kernel0]\n"
+ "add %[kernel0], %[kernel0], #8\n"
+ "vld1.f32 {d12}, [r0]\n"
+
+ "vmla.f32 d28, d12, %P[rx0][0]\n"
+
+ "vst1.f32 {d28}, [%[outptr0]]!\n"
+ : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0)
+ : [rx0] "w"(rx0)
+#ifndef _OPENMP
+ : "cc", "memory", "r0", "q6", "q14", "q15"
+#else // _OPENMP
+ : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
+ "q14", "q15"
+#endif // _OPENMP
+ );
+ remain -= 2;
+ }
+
+ if (remain == 1)
+ {
+ *outptr0 += (*kernel0) * (*_x0);
+
+ kernel0++;
+ outptr0++;
+ }
+
+ _x0 += 1;
+ }
+
+ img1 += inch * _stride;
+ out0 += outch;
+ }
+ }
+ }
+}
+#endif // __aarch64__
+
+void direct_conv_colmajor(const convMat_t &bottom_blob, convMat_t &top_blob,
+ const convMat_t &kernel, const convParams_t &params, int num_threads)
+{
+ omp_set_num_threads(num_threads);
+
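+  // Heuristic dispatch on the channel product: small problems use direct_conv_s, larger ones direct_conv_l.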
+ if (bottom_blob.c * top_blob.c < 256 * 256)
+ {
+ direct_conv_s(bottom_blob, top_blob, kernel, params.stride_w, params.padding, params.pad_h,
+ params.pad_w);
+ return;
+ }
+
+ direct_conv_l(bottom_blob, top_blob, kernel, params.stride_w, params.padding, params.pad_h,
+ params.pad_w);
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/direct_conv_colmajor.h b/compute/ncnn/src/srcn/direct_conv_colmajor.h
new file mode 100644
index 000000000..5e15192c9
--- /dev/null
+++ b/compute/ncnn/src/srcn/direct_conv_colmajor.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__
+#define __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__
+
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void direct_conv_colmajor(const convMat_t &, convMat_t &, const convMat_t &, const convParams_t &,
+ int);
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__
diff --git a/compute/ncnn/src/srcn/sgemm_kernel.cc b/compute/ncnn/src/srcn/sgemm_kernel.cc
new file mode 100644
index 000000000..90c3641db
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_kernel.cc
@@ -0,0 +1,2508 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_neon.h>
+
+namespace nnfw
+{
+namespace srcn
+{
+
+#if __aarch64__
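+// 8x12 row-major micro-kernel: accumulates an 8-row by 12-column block of the result.
+// k0 == 0 starts the accumulators at zero; otherwise the existing block is loaded from
+// res_ptr and accumulated into. The k loop is unrolled by two, with oddk covering a
+// trailing single update.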
+static void sgemm_rowmajor_micro_kernel_8x12(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int oddk = (k & 1);
+ int nk = ((k + 1) / 2) - 1;
+
+ const int nstride = stride << 2;
+
+ __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov x0, %[res_ptr]\n"
+ "ld1 {v8.4s, v9.4s, v10.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v11.4s, v12.4s, v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v14.4s, v15.4s, v16.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v17.4s, v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v20.4s, v21.4s, v22.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v23.4s, v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v26.4s, v27.4s, v28.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v29.4s, v30.4s, v31.4s}, [x0]\n"
+ "cbz %w[nk], 4f\n"
+ "b 1f\n"
+
+ "0:\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "cbz %w[nk], 4f\n"
+
+ "1:\n"
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+
+ "fmla v8.4s, v5.4s, v0.s[0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v14.4s, v5.4s, v0.s[2]\n"
+ "fmla v17.4s, v5.4s, v0.s[3]\n"
+ "fmla v9.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v15.4s, v6.4s, v0.s[2]\n"
+ "fmla v18.4s, v6.4s, v0.s[3]\n"
+ "fmla v10.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v16.4s, v7.4s, v0.s[2]\n"
+ "fmla v19.4s, v7.4s, v0.s[3]\n"
+
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v20.4s, v5.4s, v1.s[0]\n"
+ "fmla v23.4s, v5.4s, v1.s[1]\n"
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v26.4s, v5.4s, v1.s[2]\n"
+ "fmla v29.4s, v5.4s, v1.s[3]\n"
+ "fmla v21.4s, v6.4s, v1.s[0]\n"
+ "fmla v24.4s, v6.4s, v1.s[1]\n"
+ "fmla v27.4s, v6.4s, v1.s[2]\n"
+ "fmla v30.4s, v6.4s, v1.s[3]\n"
+ "fmla v22.4s, v7.4s, v1.s[0]\n"
+ "fmla v25.4s, v7.4s, v1.s[1]\n"
+ "subs %w[nk], %w[nk], #1\n"
+ "fmla v28.4s, v7.4s, v1.s[2]\n"
+ "fmla v31.4s, v7.4s, v1.s[3]\n"
+ "bne 1b\n"
+
+ "4:\n"
+ "mov x0, %[res_ptr]\n"
+ "cbnz %[oddk], 2f\n"
+
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v5.4s, v0.s[0]\n"
+ "fmla v9.4s, v6.4s, v0.s[0]\n"
+ "fmla v10.4s, v7.4s, v0.s[0]\n"
+ "st1 {v8.4s, v9.4s, v10.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "st1 {v11.4s, v12.4s, v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v14.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v7.4s, v0.s[2]\n"
+ "st1 {v14.4s, v15.4s, v16.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v17.4s, v5.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v0.s[3]\n"
+ "st1 {v17.4s, v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+
+ "fmla v20.4s, v5.4s, v1.s[0]\n"
+ "fmla v21.4s, v6.4s, v1.s[0]\n"
+ "fmla v22.4s, v7.4s, v1.s[0]\n"
+ "st1 {v20.4s, v21.4s, v22.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v23.4s, v5.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v1.s[1]\n"
+ "st1 {v23.4s, v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v26.4s, v5.4s, v1.s[2]\n"
+ "fmla v27.4s, v6.4s, v1.s[2]\n"
+ "fmla v28.4s, v7.4s, v1.s[2]\n"
+ "st1 {v26.4s, v27.4s, v28.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v29.4s, v5.4s, v1.s[3]\n"
+ "fmla v30.4s, v6.4s, v1.s[3]\n"
+ "fmla v31.4s, v7.4s, v1.s[3]\n"
+ "b 3f\n"
+
+ "2:\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "st1 {v8.4s, v9.4s, v10.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "st1 {v11.4s, v12.4s, v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "st1 {v14.4s, v15.4s, v16.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "st1 {v17.4s, v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "st1 {v20.4s, v21.4s, v22.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "st1 {v23.4s, v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "st1 {v26.4s, v27.4s, v28.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+
+ "3:\n"
+ "st1 {v29.4s, v30.4s, v31.4s}, [x0]\n"
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk)
+ : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride)
+ : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+ "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+}
+
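+// 12x8 counterpart of the micro-kernel above, with the same k0 / oddk handling.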
+static void sgemm_rowmajor_micro_kernel_12x8(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int oddk = (k & 1);
+ int nk = ((k + 1) / 2) - 1;
+
+ const int nstride = stride << 2;
+
+ __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "ld1 {v4.4s, v5.4s}, [%[rhs_ptr]], #32\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov x0, %[res_ptr]\n"
+ "ld1 {v8.4s, v9.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v10.4s, v11.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v12.4s, v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v14.4s, v15.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v16.4s, v17.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v20.4s, v21.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v22.4s, v23.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v26.4s, v27.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v28.4s, v29.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v30.4s, v31.4s}, [x0]\n"
+ "cbz %w[nk], 4f\n"
+ "b 1f\n"
+
+ "0:\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "cbz %w[nk], 4f\n"
+
+ "1:\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v0.s[3]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v0.s[3]\n"
+
+ "fmla v16.4s, v4.4s, v1.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[1]\n"
+ "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v20.4s, v4.4s, v1.s[2]\n"
+ "fmla v22.4s, v4.4s, v1.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[1]\n"
+ "fmla v21.4s, v5.4s, v1.s[2]\n"
+ "fmla v23.4s, v5.4s, v1.s[3]\n"
+
+ "ld1 {v6.4s, v7.4s}, [%[rhs_ptr]], #32\n"
+
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v26.4s, v4.4s, v2.s[1]\n"
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v28.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v2.s[3]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v27.4s, v5.4s, v2.s[1]\n"
+ "fmla v29.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v2.s[3]\n"
+
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v12.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v0.s[3]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v0.s[3]\n"
+
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v20.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v1.s[3]\n"
+
+ "ld1 {v4.4s, v5.4s}, [%[rhs_ptr]], #32\n"
+
+ "fmla v24.4s, v6.4s, v2.s[0]\n"
+ "fmla v26.4s, v6.4s, v2.s[1]\n"
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v28.4s, v6.4s, v2.s[2]\n"
+ "fmla v30.4s, v6.4s, v2.s[3]\n"
+ "fmla v25.4s, v7.4s, v2.s[0]\n"
+ "fmla v27.4s, v7.4s, v2.s[1]\n"
+ "subs %w[nk], %w[nk], #1\n"
+ "fmla v29.4s, v7.4s, v2.s[2]\n"
+ "fmla v31.4s, v7.4s, v2.s[3]\n"
+ "bne 1b\n"
+
+ "4:\n"
+ "mov x0, %[res_ptr]\n"
+ "cbnz %[oddk], 2f\n"
+
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v0.s[3]\n"
+ "fmla v15.4s, v5.4s, v0.s[3]\n"
+
+ "fmla v16.4s, v4.4s, v1.s[0]\n"
+ "fmla v17.4s, v5.4s, v1.s[0]\n"
+ "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v18.4s, v4.4s, v1.s[1]\n"
+ "fmla v19.4s, v5.4s, v1.s[1]\n"
+ "fmla v20.4s, v4.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v1.s[2]\n"
+ "fmla v22.4s, v4.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v1.s[3]\n"
+
+ "ld1 {v6.4s, v7.4s}, [%[rhs_ptr]], #32\n"
+
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "fmla v26.4s, v4.4s, v2.s[1]\n"
+ "fmla v27.4s, v5.4s, v2.s[1]\n"
+ "fmla v28.4s, v4.4s, v2.s[2]\n"
+ "fmla v29.4s, v5.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v2.s[3]\n"
+ "fmla v31.4s, v5.4s, v2.s[3]\n"
+
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "st1 {v8.4s, v9.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "st1 {v10.4s, v11.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v12.4s, v6.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v0.s[2]\n"
+ "st1 {v12.4s, v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v14.4s, v6.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v0.s[3]\n"
+ "st1 {v14.4s, v15.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
+ "st1 {v16.4s, v17.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "st1 {v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v20.4s, v6.4s, v1.s[2]\n"
+ "fmla v21.4s, v7.4s, v1.s[2]\n"
+ "st1 {v20.4s, v21.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v22.4s, v6.4s, v1.s[3]\n"
+ "fmla v23.4s, v7.4s, v1.s[3]\n"
+ "st1 {v22.4s, v23.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+
+ "fmla v24.4s, v6.4s, v2.s[0]\n"
+ "fmla v25.4s, v7.4s, v2.s[0]\n"
+ "st1 {v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v26.4s, v6.4s, v2.s[1]\n"
+ "fmla v27.4s, v7.4s, v2.s[1]\n"
+ "st1 {v26.4s, v27.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v28.4s, v6.4s, v2.s[2]\n"
+ "fmla v29.4s, v7.4s, v2.s[2]\n"
+ "st1 {v28.4s, v29.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v30.4s, v6.4s, v2.s[3]\n"
+ "fmla v31.4s, v7.4s, v2.s[3]\n"
+ "b 3f\n"
+
+ "2:\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "st1 {v8.4s, v9.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "st1 {v10.4s, v11.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "st1 {v12.4s, v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v14.4s, v4.4s, v0.s[3]\n"
+ "fmla v15.4s, v5.4s, v0.s[3]\n"
+ "st1 {v14.4s, v15.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+
+ "fmla v16.4s, v4.4s, v1.s[0]\n"
+ "fmla v17.4s, v5.4s, v1.s[0]\n"
+ "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
+ "st1 {v16.4s, v17.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v18.4s, v4.4s, v1.s[1]\n"
+ "fmla v19.4s, v5.4s, v1.s[1]\n"
+ "st1 {v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v20.4s, v4.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v1.s[2]\n"
+ "st1 {v20.4s, v21.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v22.4s, v4.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v1.s[3]\n"
+ "st1 {v22.4s, v23.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "st1 {v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v26.4s, v4.4s, v2.s[1]\n"
+ "fmla v27.4s, v5.4s, v2.s[1]\n"
+ "st1 {v26.4s, v27.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v28.4s, v4.4s, v2.s[2]\n"
+ "fmla v29.4s, v5.4s, v2.s[2]\n"
+ "st1 {v28.4s, v29.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v30.4s, v4.4s, v2.s[3]\n"
+ "fmla v31.4s, v5.4s, v2.s[3]\n"
+
+ "3:\n"
+ "st1 {v30.4s, v31.4s}, [x0]\n"
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk)
+ : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride)
+ : "x0", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+ "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+}
+
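+// 4x24 micro kernels: v8-v31 hold a 4x24 result tile (six quads per row). The
+// BATCH_DILATION_FIX variant additionally tests each LHS quad for all-zero
+// lanes and skips the corresponding RHS block when it finds one, presumably
+// to avoid work on zero padding introduced by batching/dilation.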
+#ifdef BATCH_DILATION_FIX
+static void sgemm_rowmajor_micro_kernel_4x24(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int oddk = (k & 1);
+ int nk = ((k + 1) / 2) - 1;
+
+ const int nstride = stride << 2;
+
+ __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov x0, %[res_ptr]\n"
+ "mov x1, x0\n"
+ "ld1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n"
+ "ld1 {v11.4s, v12.4s, v13.4s}, [x1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "ld1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n"
+ "ld1 {v17.4s, v18.4s, v19.4s}, [x1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "ld1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n"
+ "ld1 {v23.4s, v24.4s, v25.4s}, [x1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "ld1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n"
+ "ld1 {v29.4s, v30.4s, v31.4s}, [x1]\n"
+ "cbz %w[nk], 4f\n"
+ "b 1f\n"
+
+ "0:\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "cbz %w[nk], 4f\n"
+
+ "1:\n"
+ "mov x0, v0.d[0]\n"
+ "cmp x0, #0\n"
+ "bne 5f\n"
+ "mov x0, v0.d[1]\n"
+ "cmp x0, #0\n"
+ "bne 5f\n"
+ "add %[rhs_ptr], %[rhs_ptr], #96\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "b 6f\n"
+ "5:\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v14.4s, v2.4s, v0.s[1]\n"
+ "fmla v20.4s, v2.4s, v0.s[2]\n"
+ "fmla v26.4s, v2.4s, v0.s[3]\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v15.4s, v3.4s, v0.s[1]\n"
+ "fmla v21.4s, v3.4s, v0.s[2]\n"
+ "fmla v27.4s, v3.4s, v0.s[3]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v16.4s, v4.4s, v0.s[1]\n"
+ "fmla v22.4s, v4.4s, v0.s[2]\n"
+ "fmla v28.4s, v4.4s, v0.s[3]\n"
+
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+
+ "fmla v11.4s, v5.4s, v0.s[0]\n"
+ "fmla v17.4s, v5.4s, v0.s[1]\n"
+ "fmla v23.4s, v5.4s, v0.s[2]\n"
+ "fmla v29.4s, v5.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[0]\n"
+ "fmla v18.4s, v6.4s, v0.s[1]\n"
+ "fmla v24.4s, v6.4s, v0.s[2]\n"
+ "fmla v30.4s, v6.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[0]\n"
+ "fmla v19.4s, v7.4s, v0.s[1]\n"
+ "fmla v25.4s, v7.4s, v0.s[2]\n"
+ "fmla v31.4s, v7.4s, v0.s[3]\n"
+
+ "6:\n"
+ "mov x0, v1.d[0]\n"
+ "cmp x0, #0\n"
+ "bne 7f\n"
+ "mov x0, v1.d[1]\n"
+ "cmp x0, #0\n"
+ "bne 7f\n"
+ "add %[rhs_ptr], %[rhs_ptr], #96\n"
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v8.4s, v2.4s, v1.s[0]\n"
+ "fmla v14.4s, v2.4s, v1.s[1]\n"
+ "fmla v20.4s, v2.4s, v1.s[2]\n"
+ "fmla v26.4s, v2.4s, v1.s[3]\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v9.4s, v3.4s, v1.s[0]\n"
+ "fmla v15.4s, v3.4s, v1.s[1]\n"
+ "fmla v21.4s, v3.4s, v1.s[2]\n"
+ "fmla v27.4s, v3.4s, v1.s[3]\n"
+ "fmla v10.4s, v4.4s, v1.s[0]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v1.s[2]\n"
+ "fmla v28.4s, v4.4s, v1.s[3]\n"
+
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+
+ "fmla v11.4s, v5.4s, v1.s[0]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v1.s[2]\n"
+ "fmla v29.4s, v5.4s, v1.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v1.s[2]\n"
+ "fmla v30.4s, v6.4s, v1.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v1.s[2]\n"
+ "fmla v31.4s, v7.4s, v1.s[3]\n"
+
+ "8:\n"
+ "subs %w[nk], %w[nk], #1\n"
+ "bne 1b\n"
+
+ "4:\n"
+ "mov x0, %[res_ptr]\n"
+ "cbnz %[oddk], 2f\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v2.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[1]\n"
+ "fmla v20.4s, v2.4s, v0.s[2]\n"
+ "fmla v21.4s, v3.4s, v0.s[2]\n"
+ "fmla v22.4s, v4.4s, v0.s[2]\n"
+ "fmla v26.4s, v2.4s, v0.s[3]\n"
+ "fmla v27.4s, v3.4s, v0.s[3]\n"
+ "fmla v28.4s, v4.4s, v0.s[3]\n"
+
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v11.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v5.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v0.s[1]\n"
+ "fmla v23.4s, v5.4s, v0.s[2]\n"
+ "fmla v24.4s, v6.4s, v0.s[2]\n"
+ "fmla v25.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v5.4s, v0.s[3]\n"
+ "fmla v30.4s, v6.4s, v0.s[3]\n"
+ "fmla v31.4s, v7.4s, v0.s[3]\n"
+
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v2.4s, v1.s[0]\n"
+ "fmla v9.4s, v3.4s, v1.s[0]\n"
+ "fmla v10.4s, v4.4s, v1.s[0]\n"
+ "mov x1, x0\n"
+ "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n"
+ "fmla v11.4s, v5.4s, v1.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n"
+ "fmla v14.4s, v2.4s, v1.s[1]\n"
+ "fmla v15.4s, v3.4s, v1.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n"
+ "fmla v20.4s, v2.4s, v1.s[2]\n"
+ "fmla v21.4s, v3.4s, v1.s[2]\n"
+ "fmla v22.4s, v4.4s, v1.s[2]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n"
+ "fmla v23.4s, v5.4s, v1.s[2]\n"
+ "fmla v24.4s, v6.4s, v1.s[2]\n"
+ "fmla v25.4s, v7.4s, v1.s[2]\n"
+ "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n"
+ "fmla v26.4s, v2.4s, v1.s[3]\n"
+ "fmla v27.4s, v3.4s, v1.s[3]\n"
+ "fmla v28.4s, v4.4s, v1.s[3]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n"
+ "fmla v29.4s, v5.4s, v1.s[3]\n"
+ "fmla v30.4s, v6.4s, v1.s[3]\n"
+ "fmla v31.4s, v7.4s, v1.s[3]\n"
+ "b 3f\n"
+
+ "2:\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "mov x1, x0\n"
+ "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n"
+ "fmla v11.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v0.s[0]\n"
+ "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n"
+ "fmla v14.4s, v2.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n"
+ "fmla v17.4s, v5.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v0.s[1]\n"
+ "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n"
+ "fmla v20.4s, v2.4s, v0.s[2]\n"
+ "fmla v21.4s, v3.4s, v0.s[2]\n"
+ "fmla v22.4s, v4.4s, v0.s[2]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n"
+ "fmla v23.4s, v5.4s, v0.s[2]\n"
+ "fmla v24.4s, v6.4s, v0.s[2]\n"
+ "fmla v25.4s, v7.4s, v0.s[2]\n"
+ "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n"
+ "fmla v26.4s, v2.4s, v0.s[3]\n"
+ "fmla v27.4s, v3.4s, v0.s[3]\n"
+ "fmla v28.4s, v4.4s, v0.s[3]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n"
+ "fmla v29.4s, v5.4s, v0.s[3]\n"
+ "fmla v30.4s, v6.4s, v0.s[3]\n"
+ "fmla v31.4s, v7.4s, v0.s[3]\n"
+ "3:\n"
+ "st1 {v29.4s, v30.4s, v31.4s}, [x1]\n"
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk)
+ : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride)
+ : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+ "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+ "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+}
+#else // BATCH_DILATION_FIX
+static void sgemm_rowmajor_micro_kernel_4x24(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int oddk = (k & 1);
+ int nk = ((k + 1) / 2) - 1;
+
+ const int nstride = stride << 2;
+
+ __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+ "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
+ "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
+ "ld1 {v4.4s}, [%[rhs_ptr]], #16\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov x0, %[res_ptr]\n"
+ "mov x1, x0\n"
+ "ld1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n"
+ "ld1 {v11.4s, v12.4s, v13.4s}, [x1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "ld1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n"
+ "ld1 {v17.4s, v18.4s, v19.4s}, [x1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "ld1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n"
+ "ld1 {v23.4s, v24.4s, v25.4s}, [x1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "ld1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n"
+ "ld1 {v29.4s, v30.4s, v31.4s}, [x1]\n"
+ "cbz %w[nk], 4f\n"
+ "b 1f\n"
+
+ "0:\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "cbz %w[nk], 4f\n"
+
+ "1:\n"
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v14.4s, v2.4s, v0.s[1]\n"
+ "fmla v20.4s, v2.4s, v0.s[2]\n"
+ "fmla v26.4s, v2.4s, v0.s[3]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v15.4s, v3.4s, v0.s[1]\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v21.4s, v3.4s, v0.s[2]\n"
+ "fmla v27.4s, v3.4s, v0.s[3]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v16.4s, v4.4s, v0.s[1]\n"
+ "fmla v22.4s, v4.4s, v0.s[2]\n"
+ "fmla v28.4s, v4.4s, v0.s[3]\n"
+
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+
+ "fmla v11.4s, v5.4s, v0.s[0]\n"
+ "fmla v17.4s, v5.4s, v0.s[1]\n"
+ "fmla v23.4s, v5.4s, v0.s[2]\n"
+ "fmla v29.4s, v5.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[0]\n"
+ "fmla v18.4s, v6.4s, v0.s[1]\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v24.4s, v6.4s, v0.s[2]\n"
+ "fmla v30.4s, v6.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[0]\n"
+ "fmla v19.4s, v7.4s, v0.s[1]\n"
+ "fmla v25.4s, v7.4s, v0.s[2]\n"
+ "fmla v31.4s, v7.4s, v0.s[3]\n"
+
+ "fmla v8.4s, v2.4s, v1.s[0]\n"
+ "fmla v14.4s, v2.4s, v1.s[1]\n"
+ "fmla v20.4s, v2.4s, v1.s[2]\n"
+ "fmla v26.4s, v2.4s, v1.s[3]\n"
+ "fmla v9.4s, v3.4s, v1.s[0]\n"
+ "fmla v15.4s, v3.4s, v1.s[1]\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v21.4s, v3.4s, v1.s[2]\n"
+ "fmla v27.4s, v3.4s, v1.s[3]\n"
+ "fmla v10.4s, v4.4s, v1.s[0]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v1.s[2]\n"
+ "fmla v28.4s, v4.4s, v1.s[3]\n"
+
+ "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+
+ "fmla v11.4s, v5.4s, v1.s[0]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v1.s[2]\n"
+ "fmla v29.4s, v5.4s, v1.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+ "fmla v24.4s, v6.4s, v1.s[2]\n"
+ "fmla v30.4s, v6.4s, v1.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "subs %w[nk], %w[nk], #1\n"
+ "fmla v25.4s, v7.4s, v1.s[2]\n"
+ "fmla v31.4s, v7.4s, v1.s[3]\n"
+ "bne 1b\n"
+
+ "4:\n"
+ "mov x0, %[res_ptr]\n"
+ "cbnz %[oddk], 2f\n"
+ "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v2.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[1]\n"
+ "fmla v20.4s, v2.4s, v0.s[2]\n"
+ "fmla v21.4s, v3.4s, v0.s[2]\n"
+ "fmla v22.4s, v4.4s, v0.s[2]\n"
+ "fmla v26.4s, v2.4s, v0.s[3]\n"
+ "fmla v27.4s, v3.4s, v0.s[3]\n"
+ "fmla v28.4s, v4.4s, v0.s[3]\n"
+
+ "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v11.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v5.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v0.s[1]\n"
+ "fmla v23.4s, v5.4s, v0.s[2]\n"
+ "fmla v24.4s, v6.4s, v0.s[2]\n"
+ "fmla v25.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v5.4s, v0.s[3]\n"
+ "fmla v30.4s, v6.4s, v0.s[3]\n"
+ "fmla v31.4s, v7.4s, v0.s[3]\n"
+
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v2.4s, v1.s[0]\n"
+ "fmla v9.4s, v3.4s, v1.s[0]\n"
+ "fmla v10.4s, v4.4s, v1.s[0]\n"
+ "mov x1, x0\n"
+ "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n"
+ "fmla v11.4s, v5.4s, v1.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n"
+ "fmla v14.4s, v2.4s, v1.s[1]\n"
+ "fmla v15.4s, v3.4s, v1.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n"
+ "fmla v20.4s, v2.4s, v1.s[2]\n"
+ "fmla v21.4s, v3.4s, v1.s[2]\n"
+ "fmla v22.4s, v4.4s, v1.s[2]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n"
+ "fmla v23.4s, v5.4s, v1.s[2]\n"
+ "fmla v24.4s, v6.4s, v1.s[2]\n"
+ "fmla v25.4s, v7.4s, v1.s[2]\n"
+ "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n"
+ "fmla v26.4s, v2.4s, v1.s[3]\n"
+ "fmla v27.4s, v3.4s, v1.s[3]\n"
+ "fmla v28.4s, v4.4s, v1.s[3]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n"
+ "fmla v29.4s, v5.4s, v1.s[3]\n"
+ "fmla v30.4s, v6.4s, v1.s[3]\n"
+ "fmla v31.4s, v7.4s, v1.s[3]\n"
+ "b 3f\n"
+
+ "2:\n"
+ "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n"
+
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "mov x1, x0\n"
+ "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n"
+ "fmla v11.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v0.s[0]\n"
+ "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n"
+ "fmla v14.4s, v2.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[1]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n"
+ "fmla v17.4s, v5.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v0.s[1]\n"
+ "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n"
+ "fmla v20.4s, v2.4s, v0.s[2]\n"
+ "fmla v21.4s, v3.4s, v0.s[2]\n"
+ "fmla v22.4s, v4.4s, v0.s[2]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n"
+ "fmla v23.4s, v5.4s, v0.s[2]\n"
+ "fmla v24.4s, v6.4s, v0.s[2]\n"
+ "fmla v25.4s, v7.4s, v0.s[2]\n"
+ "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n"
+ "fmla v26.4s, v2.4s, v0.s[3]\n"
+ "fmla v27.4s, v3.4s, v0.s[3]\n"
+ "fmla v28.4s, v4.4s, v0.s[3]\n"
+ "add x0, x0, %[nstride]\n"
+ "mov x1, x0\n"
+ "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n"
+ "fmla v29.4s, v5.4s, v0.s[3]\n"
+ "fmla v30.4s, v6.4s, v0.s[3]\n"
+ "fmla v31.4s, v7.4s, v0.s[3]\n"
+ "3:\n"
+ "st1 {v29.4s, v30.4s, v31.4s}, [x1]\n"
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk)
+ : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride)
+ : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+ "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+ "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+}
+#endif // BATCH_DILATION_FIX
+
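+// 24x4 micro kernel: v8-v31 each hold one 4-wide result row (24 rows), the
+// LHS column is streamed through v0-v5 and the RHS row through v6/v7.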
+static void sgemm_rowmajor_micro_kernel_24x4(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int oddk = (k & 1);
+ int nk = ((k + 1) / 2) - 1;
+
+ const int nstride = stride << 2;
+
+ __asm __volatile("ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n"
+ "ld1 {v6.4s}, [%[rhs_ptr]], #16\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov x0, %[res_ptr]\n"
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v14.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v15.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v16.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v17.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v18.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v20.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v21.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v22.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v23.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v24.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v26.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v27.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v28.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v29.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v30.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "ld1 {v31.4s}, [x0]\n"
+ "cbz %w[nk], 4f\n"
+ "b 1f\n"
+
+ "0:\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "cbz %w[nk], 4f\n"
+
+ "1:\n"
+ "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v9.4s, v6.4s, v0.s[1]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v11.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v13.4s, v6.4s, v1.s[1]\n"
+ "ld1 {v7.4s}, [%[rhs_ptr]], #16\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v15.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v17.4s, v6.4s, v2.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v19.4s, v6.4s, v2.s[3]\n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v21.4s, v6.4s, v3.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v23.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "fmla v25.4s, v6.4s, v4.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v27.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "fmla v29.4s, v6.4s, v5.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "fmla v31.4s, v6.4s, v5.s[3]\n"
+
+ "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v8.4s, v7.4s, v0.s[0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v12.4s, v7.4s, v1.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ld1 {v6.4s}, [%[rhs_ptr]], #16\n"
+ "fmla v14.4s, v7.4s, v1.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v16.4s, v7.4s, v2.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v18.4s, v7.4s, v2.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v20.4s, v7.4s, v3.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v22.4s, v7.4s, v3.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v24.4s, v7.4s, v4.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v26.4s, v7.4s, v4.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v28.4s, v7.4s, v5.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "subs %w[nk], %w[nk], #1\n"
+ "fmla v30.4s, v7.4s, v5.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "bne 1b\n"
+
+ "4:\n"
+ "mov x0, %[res_ptr]\n"
+ "cbnz %[oddk], 2f\n"
+
+ "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v9.4s, v6.4s, v0.s[1]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v11.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v13.4s, v6.4s, v1.s[1]\n"
+ "ld1 {v7.4s}, [%[rhs_ptr]], #16\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v15.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v17.4s, v6.4s, v2.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v19.4s, v6.4s, v2.s[3]\n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v21.4s, v6.4s, v3.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v23.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "fmla v25.4s, v6.4s, v4.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v27.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "fmla v29.4s, v6.4s, v5.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "fmla v31.4s, v6.4s, v5.s[3]\n"
+
+ "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v8.4s, v7.4s, v0.s[0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "st1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "st1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "st1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v12.4s, v7.4s, v1.s[0]\n"
+ "st1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "st1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v14.4s, v7.4s, v1.s[2]\n"
+ "st1 {v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "st1 {v14.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v16.4s, v7.4s, v2.s[0]\n"
+ "st1 {v15.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "st1 {v16.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v18.4s, v7.4s, v2.s[2]\n"
+ "st1 {v17.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "st1 {v18.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v20.4s, v7.4s, v3.s[0]\n"
+ "st1 {v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "st1 {v20.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v22.4s, v7.4s, v3.s[2]\n"
+ "st1 {v21.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "st1 {v22.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v24.4s, v7.4s, v4.s[0]\n"
+ "st1 {v23.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "st1 {v24.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v26.4s, v7.4s, v4.s[2]\n"
+ "st1 {v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "st1 {v26.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v28.4s, v7.4s, v5.s[0]\n"
+ "st1 {v27.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "st1 {v28.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v30.4s, v7.4s, v5.s[2]\n"
+ "st1 {v29.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "st1 {v30.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "b 3f\n"
+
+ "2:\n"
+ "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v9.4s, v6.4s, v0.s[1]\n"
+ "st1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "st1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v11.4s, v6.4s, v0.s[3]\n"
+ "st1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "st1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v13.4s, v6.4s, v1.s[1]\n"
+ "st1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "st1 {v13.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v15.4s, v6.4s, v1.s[3]\n"
+ "st1 {v14.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "st1 {v15.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v17.4s, v6.4s, v2.s[1]\n"
+ "st1 {v16.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "st1 {v17.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v19.4s, v6.4s, v2.s[3]\n"
+ "st1 {v18.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "st1 {v19.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v21.4s, v6.4s, v3.s[1]\n"
+ "st1 {v20.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "st1 {v21.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v23.4s, v6.4s, v3.s[3]\n"
+ "st1 {v22.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "st1 {v23.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v25.4s, v6.4s, v4.s[1]\n"
+ "st1 {v24.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "st1 {v25.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v27.4s, v6.4s, v4.s[3]\n"
+ "st1 {v26.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "st1 {v27.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v29.4s, v6.4s, v5.s[1]\n"
+ "st1 {v28.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "st1 {v29.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "fmla v31.4s, v6.4s, v5.s[3]\n"
+ "st1 {v30.4s}, [x0]\n"
+ "add x0, x0, %[nstride]\n"
+ "3:\n"
+ "st1 {v31.4s}, [x0]\n"
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk)
+ : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride)
+ : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+ "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+}
+
+#else // __aarch64__
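+// 32-bit NEON micro kernels follow. 6x8: q4-q15 hold a 6x8 result tile (two
+// quads per row); the k loop is unrolled by four, with labels 3-5 handling
+// the one-, two- and three-step remainders.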
+static void sgemm_rowmajor_micro_kernel_6x8(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int nk = k >> 2;
+ int rk = k & 3;
+
+ const int nstride = stride << 2;
+
+ if (rk == 0)
+ {
+ nk--;
+ rk = 4;
+ }
+
+ __asm __volatile("vld1.32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov r0, %[res_ptr]\n"
+
+ "vld1.f32 {d8-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d12-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d16-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d20-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d24-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d28-d31}, [r0]\n"
+ "b 1f\n"
+
+ "0:\n"
+ "vmov.i32 q4, #0\n"
+ "vmov.i32 q5, #0\n"
+ "vmov.i32 q6, #0\n"
+ "pld [%[lhs_ptr], #48]\n"
+ "vmov.i32 q7, #0\n"
+ "pld [%[rhs_ptr], #48]\n"
+ "vmov.i32 q8, #0\n"
+ "pld [%[lhs_ptr], #112]\n"
+ "vmov.i32 q9, #0\n"
+ "pld [%[rhs_ptr], #112]\n"
+ "vmov.i32 q10, #0\n"
+ "vmov.i32 q11, #0\n"
+ "vmov.i32 q12, #0\n"
+ "vmov.i32 q13, #0\n"
+ "pld [%[lhs_ptr], #176]\n"
+ "vmov.i32 q14, #0\n"
+ "pld [%[rhs_ptr], #176]\n"
+ "vmov.i32 q15, #0\n"
+
+ "1:\n"
+ "cmp %[nk], #0\n"
+ "beq 6f\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q6, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q10, q2, d1[1]\n"
+ "vmla.f32 q12, q2, d2[0]\n"
+ "vmla.f32 q14, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q5, q3, d0[0]\n"
+ "vmla.f32 q7, q3, d0[1]\n"
+ "vmla.f32 q9, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q13, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q4, q2, d3[0]\n"
+ "subs %[nk], %[nk], #1\n"
+ "vmla.f32 q6, q2, d3[1]\n"
+ "pld [%[lhs_ptr], #208]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q10, q2, d0[1]\n"
+ "pld [%[rhs_ptr], #192]\n"
+ "vmla.f32 q12, q2, d1[0]\n"
+ "vmla.f32 q14, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q5, q3, d3[0]\n"
+ "vmla.f32 q7, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q9, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q13, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n"
+
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q6, q2, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q10, q2, d3[1]\n"
+ "pld [%[lhs_ptr], #240]\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vmla.f32 q14, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q5, q3, d2[0]\n"
+ "vmla.f32 q7, q3, d2[1]\n"
+ "pld [%[rhs_ptr], #208]\n"
+ "vmla.f32 q9, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q13, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q6, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q10, q2, d2[1]\n"
+ "vmla.f32 q12, q2, d3[0]\n"
+ "vmla.f32 q14, q2, d3[1]\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q5, q3, d1[0]\n"
+ "vmla.f32 q7, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q9, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q13, q3, d3[0]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ "6:\n"
+ "mov r0, %[res_ptr]\n"
+ "subs %[rk], %[rk], #1\n"
+ "beq 3f\n"
+
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q6, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q10, q2, d1[1]\n"
+ "vmla.f32 q12, q2, d2[0]\n"
+ "subs %[rk], %[rk], #1\n"
+ "vmla.f32 q14, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q5, q3, d0[0]\n"
+ "vmla.f32 q7, q3, d0[1]\n"
+ "vmla.f32 q9, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q13, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "beq 4f\n"
+
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q6, q2, d3[1]\n"
+ "subs %[rk], %[rk], #1\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q10, q2, d0[1]\n"
+ "vmla.f32 q12, q2, d1[0]\n"
+ "vmla.f32 q14, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q5, q3, d3[0]\n"
+ "vmla.f32 q7, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q9, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q13, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "beq 5f\n"
+
+ "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q6, q2, d2[1]\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q10, q2, d3[1]\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vmla.f32 q14, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q5, q3, d2[0]\n"
+ "vmla.f32 q7, q3, d2[1]\n"
+ "vmla.f32 q9, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q13, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q5, q3, d1[0]\n"
+ "vst1.32 {d8-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q6, q2, d1[1]\n"
+ "vmla.f32 q7, q3, d1[1]\n"
+ "vst1.32 {d12-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q3, d2[0]\n"
+ "vst1.32 {d16-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q10, q2, d2[1]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d20-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q12, q2, d3[0]\n"
+ "vmla.f32 q13, q3, d3[0]\n"
+ "vst1.32 {d24-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q14, q2, d3[1]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "b 2f\n"
+
+ "3:\n"
+ "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vmla.f32 q5, q3, d0[0]\n"
+ "vst1.32 {d8-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q6, q2, d0[1]\n"
+ "vmla.f32 q7, q3, d0[1]\n"
+ "vst1.32 {d12-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vld1.32 {d2}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q9, q3, d1[0]\n"
+ "vst1.32 {d16-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q10, q2, d1[1]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d20-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q12, q2, d2[0]\n"
+ "vmla.f32 q13, q3, d2[0]\n"
+ "vst1.32 {d24-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q14, q2, d2[1]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "b 2f\n"
+
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q3, d3[0]\n"
+ "vst1.32 {d8-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q6, q2, d3[1]\n"
+ "vmla.f32 q7, q3, d3[1]\n"
+ "vst1.32 {d12-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q3, d0[0]\n"
+ "vst1.32 {d16-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q10, q2, d0[1]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d20-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q12, q2, d1[0]\n"
+ "vmla.f32 q13, q3, d1[0]\n"
+ "vst1.32 {d24-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q14, q2, d1[1]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "b 2f\n"
+
+ "5:\n"
+ "vld1.32 {d0}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q3, d2[0]\n"
+ "vst1.32 {d8-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q6, q2, d2[1]\n"
+ "vmla.f32 q7, q3, d2[1]\n"
+ "vst1.32 {d12-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q3, d3[0]\n"
+ "vst1.32 {d16-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q10, q2, d3[1]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d20-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vmla.f32 q13, q3, d0[0]\n"
+ "vst1.32 {d24-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q14, q2, d0[1]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "2:\n"
+ "vst1.32 {d28-d31}, [r0]\n"
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk), [rk] "+r"(rk)
+ : [k0] "r"(k0), [nstride] "r"(nstride)
+ : "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+ "q11", "q12", "q13", "q14", "q15", "cc");
+}
+
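+// 4x12 micro kernel: result row i is held in q4+i, q8+i and q12+i (three
+// quads per row); the k loop is unrolled by two.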
+static void sgemm_rowmajor_micro_kernel_4x12(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int rk = (k & 1);
+ int nk = (k + 1) / 2;
+
+ const int nstride = stride << 2;
+
+ asm volatile("vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov r1, %[res_ptr]\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "mov r0, r1\n"
+ "vld1.f32 {d8-d9}, [r0]!\n"
+ "add r1, %[nstride]\n"
+ "vld1.f32 {d16-d17}, [r0]!\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "mov r0, r1\n"
+ "vld1.f32 {d10-d11}, [r0]!\n"
+ "add r1, %[nstride]\n"
+ "vld1.f32 {d18-d19}, [r0]!\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+ "mov r0, r1\n"
+ "vld1.f32 {d12-d13}, [r0]!\n"
+ "add r1, %[nstride]\n"
+ "vld1.f32 {d20-d21}, [r0]!\n"
+ "vld1.f32 {d28-d29}, [r0]\n"
+ "mov r0, r1\n"
+ "vld1.f32 {d14-d15}, [r0]!\n"
+ "vld1.f32 {d22-d23}, [r0]!\n"
+ "vld1.f32 {d30-d31}, [r0]\n"
+ "beq 2f\n"
+
+ "b 1f\n"
+
+ "0:\n"
+ "veor q4, q4\n"
+ "subs %[nk],%[nk], #1\n"
+ "vmov.f32 q8, q4\n"
+ "vmov.f32 q12, q4\n"
+ "vmov.f32 q5, q4\n"
+ "vmov.f32 q9, q4\n"
+ "vmov.f32 q13, q4\n"
+ "vmov.f32 q6, q4\n"
+ "vmov.f32 q10, q4\n"
+ "vmov.f32 q14, q4\n"
+ "vmov.f32 q7, q4\n"
+ "vmov.f32 q11, q4\n"
+ "vmov.f32 q15, q4\n"
+
+ "beq 2f\n"
+
+ "1:\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q8, q3, d0[0]\n"
+ "vmla.f32 q9, q3, d0[1]\n"
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vmla.f32 q13, q2, d0[1]\n"
+ "pld [%[lhs_ptr], #208]\n"
+ "vmla.f32 q14, q2, d1[0]\n"
+ "pld [%[rhs_ptr], #192]\n"
+ "vmla.f32 q15, q2, d1[1]\n"
+
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q4, q3, d2[0]\n"
+ "vmla.f32 q5, q3, d2[1]\n"
+ "vmla.f32 q6, q3, d3[0]\n"
+ "vmla.f32 q7, q3, d3[1]\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q10, q2, d3[0]\n"
+ "vmla.f32 q11, q2, d3[1]\n"
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "subs %[nk],%[nk], #1\n"
+ "pld [%[lhs_ptr], #240]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "pld [%[rhs_ptr], #208]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ "2:\n"
+ "cmp %[rk], #1\n"
+ "beq 3f\n"
+
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q8, q3, d0[0]\n"
+ "vmla.f32 q9, q3, d0[1]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vmla.f32 q13, q2, d0[1]\n"
+ "vmla.f32 q14, q2, d1[0]\n"
+ "vmla.f32 q15, q2, d1[1]\n"
+
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+ "vld1.f32 {d0-d1}, [%[rhs_ptr]]!\n"
+ "mov r1, %[res_ptr]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q4, q3, d2[0]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q12, q0, d2[0]\n"
+ "vst1.f32 {d8-d9}, [r0]!\n"
+ "add r1, %[nstride]\n"
+ "vmla.f32 q5, q3, d2[1]\n"
+ "vst1.f32 {d16-d17}, [r0]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.f32 {d24-d25}, [r0]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q13, q0, d2[1]\n"
+ "vst1.f32 {d10-d11}, [r0]!\n"
+ "vmla.f32 q6, q3, d3[0]\n"
+ "add r1, %[nstride]\n"
+ "vst1.f32 {d18-d19}, [r0]!\n"
+ "vmla.f32 q10, q2, d3[0]\n"
+ "vst1.f32 {d26-d27}, [r0]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q14, q0, d3[0]\n"
+ "vst1.f32 {d12-d13}, [r0]!\n"
+ "add r1, %[nstride]\n"
+ "vmla.f32 q7, q3, d3[1]\n"
+ "vst1.f32 {d20-d21}, [r0]!\n"
+ "vmla.f32 q11, q2, d3[1]\n"
+ "vst1.f32 {d28-d29}, [r0]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q15, q0, d3[1]\n"
+ "b 4f\n"
+
+ "3:\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n"
+ "mov r1, %[res_ptr]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vmla.f32 q8, q3, d0[0]\n"
+ "vmla.f32 q12, q1, d0[0]\n"
+ "vst1.f32 {d8-d9}, [r0]!\n"
+ "add r1, %[nstride]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vst1.f32 {d16-d17}, [r0]!\n"
+ "vmla.f32 q9, q3, d0[1]\n"
+ "vst1.f32 {d24-d25}, [r0]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q13, q1, d0[1]\n"
+ "vst1.f32 {d10-d11}, [r0]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "add r1, %[nstride]\n"
+ "vst1.f32 {d18-d19}, [r0]!\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.f32 {d26-d27}, [r0]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q14, q1, d1[0]\n"
+ "vst1.f32 {d12-d13}, [r0]!\n"
+ "add r1, %[nstride]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.f32 {d20-d21}, [r0]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.f32 {d28-d29}, [r0]\n"
+ "mov r0, r1\n"
+ "vmla.f32 q15, q1, d1[1]\n"
+
+ "4:\n"
+ "vst1.f32 {d14-d15}, [r0]!\n"
+ "vst1.f32 {d22-d23}, [r0]!\n"
+ "vst1.f32 {d30-d31}, [r0]\n"
+
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk), [rk] "+r"(rk)
+ : [k0] "r"(k0), [nstride] "r"(nstride)
+ : "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+ "q11", "q12", "q13", "q14", "q15", "cc");
+}
+
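+// 12x4 micro kernel: q4-q15 each hold one 4-wide result row (12 rows); the k
+// loop is unrolled by two.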
+static void sgemm_rowmajor_micro_kernel_12x4(const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k, const int k0,
+ const int stride)
+{
+ int rk = (k & 1);
+ int nk = (k + 1) / 2;
+
+ const int nstride = stride << 2;
+
+ asm volatile("vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+
+ "cmp %[k0], #0\n"
+ "beq 0f\n"
+
+ "mov r0, %[res_ptr]\n"
+ "subs %[nk], %[nk], #1\n"
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d28-d29}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d30-d31}, [r0]\n"
+ "beq 2f\n"
+ "b 1f\n"
+
+ "0:\n"
+ "veor q4, q4\n"
+ "subs %[nk],%[nk], #1\n"
+ "vmov.f32 q5, q4\n"
+ "vmov.f32 q6, q4\n"
+ "vmov.f32 q7, q4\n"
+ "vmov.f32 q8, q4\n"
+ "vmov.f32 q9, q4\n"
+ "vmov.f32 q10, q4\n"
+ "vmov.f32 q11, q4\n"
+ "vmov.f32 q12, q4\n"
+ "vmov.f32 q13, q4\n"
+ "vmov.f32 q14, q4\n"
+ "vmov.f32 q15, q4\n"
+
+ "beq 2f\n"
+
+ "1:\n"
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q10, q2, d3[0]\n"
+ "vmla.f32 q11, q2, d3[1]\n"
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vmla.f32 q13, q2, d0[1]\n"
+ "pld [%[rhs_ptr], #208]\n"
+ "vmla.f32 q14, q2, d1[0]\n"
+ "pld [%[lhs_ptr], #192]\n"
+ "vmla.f32 q15, q2, d1[1]\n"
+
+ "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q4, q3, d2[0]\n"
+ "vmla.f32 q5, q3, d2[1]\n"
+ "vmla.f32 q6, q3, d3[0]\n"
+ "vmla.f32 q7, q3, d3[1]\n"
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q8, q3, d0[0]\n"
+ "vmla.f32 q9, q3, d0[1]\n"
+ "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "subs %[nk],%[nk], #1\n"
+ "pld [%[rhs_ptr], #240]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "pld [%[lhs_ptr], #208]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ "2:\n"
+ "cmp %[rk], #1\n"
+ "beq 3f\n"
+
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vmla.f32 q10, q2, d3[0]\n"
+ "vmla.f32 q11, q2, d3[1]\n"
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vmla.f32 q13, q2, d0[1]\n"
+ "vmla.f32 q14, q2, d1[0]\n"
+ "vmla.f32 q15, q2, d1[1]\n"
+
+ "mov r0, %[res_ptr]\n"
+ "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q4, q3, d2[0]\n"
+ "vst1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q5, q3, d2[1]\n"
+ "vst1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q6, q3, d3[0]\n"
+ "vst1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q7, q3, d3[1]\n"
+ "vst1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q8, q3, d0[0]\n"
+ "vst1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q9, q3, d0[1]\n"
+ "vst1.f32 {d18-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.f32 {d26-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.f32 {d28-d29}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "b 4f\n"
+
+ "3:\n"
+ "mov r0, %[res_ptr]\n"
+ "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vst1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vst1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.f32 {d18-d19}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q10, q2, d3[0]\n"
+ "vst1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q11, q2, d3[1]\n"
+ "vst1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q12, q2, d0[0]\n"
+ "vst1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q13, q2, d0[1]\n"
+ "vst1.f32 {d26-d27}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q14, q2, d1[0]\n"
+ "vst1.f32 {d28-d29}, [r0]\n"
+ "add r0, r0, %[nstride]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+
+ "4:\n"
+ "vst1.f32 {d30-d31}, [r0]\n"
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr),
+ [nk] "+r"(nk), [rk] "+r"(rk)
+ : [k0] "r"(k0), [nstride] "r"(nstride)
+ : "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15", "cc");
+}
+#endif // __aarch64__
+
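+// Dispatch table of row-major micro kernels, indexed by [mr / 2 - 1][nr / 2 - 1]
+// (tile heights and widths from 2 to 24 in steps of two). Entries are 0 where
+// no kernel exists for the current architecture; the macro kernels below
+// return early when they look up a null entry.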
+typedef void (*sgemm_rowmajoy_micro_kernel_func)(const float *, const float *, float *, const int,
+ const int, const int);
+
+static sgemm_rowmajoy_micro_kernel_func sgemm_rowmajoy_micro_kernel_table[12][12] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0,
+#if !__aarch64__
+ sgemm_rowmajor_micro_kernel_4x12,
+#else // !__aarch64__
+ 0,
+#endif // !__aarch64__
+ 0, 0, 0, 0, 0,
+#if __aarch64__
+ sgemm_rowmajor_micro_kernel_4x24
+#else // __aarch64__
+ 0
+#endif // __aarch64__
+ },
+ {0, 0, 0,
+#if !__aarch64__
+ sgemm_rowmajor_micro_kernel_6x8,
+#else // !__aarch64__
+ 0,
+#endif // !__aarch64__
+ 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0,
+#if __aarch64__
+ sgemm_rowmajor_micro_kernel_8x12,
+#else // __aarch64__
+ 0,
+#endif // __aarch64__
+ 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0,
+#if !__aarch64__
+ sgemm_rowmajor_micro_kernel_12x4,
+#else // !__aarch64__
+ 0,
+#endif // !__aarch64__
+ 0,
+#if __aarch64__
+ sgemm_rowmajor_micro_kernel_12x8,
+#else // __aarch64__
+ 0,
+#endif // __aarch64__
+ 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0,
+#if __aarch64__
+ sgemm_rowmajor_micro_kernel_24x4,
+#else // __aarch64__
+ 0,
+#endif // __aarch64__
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+};
+
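+// Macro kernel: tiles the mb x nb result panel into mr x nr micro tiles,
+// iterating over tile columns in the outer loop, and dispatches each tile to
+// the micro kernel. Partial edge tiles are computed into a stack buffer with
+// k0 == 0 and then copied (k0 == 0) or accumulated (k0 != 0) into the result.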
+void _sgemm_rowmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int nstride,
+ const int kstride)
+{
+ const int nm = (mb + mr - 1) / mr;
+ const int nn = (nb + nr - 1) / nr;
+ const int rm = mb % mr;
+ const int rn = nb % nr;
+
+ sgemm_rowmajoy_micro_kernel_func sgemm_rowmajoy_micro_kernel =
+ sgemm_rowmajoy_micro_kernel_table[mr / 2 - 1][nr / 2 - 1];
+ if (!sgemm_rowmajoy_micro_kernel)
+ return;
+
+ for (int j = 0; j < nn; j++)
+ {
+ const int _nr = (j != nn - 1 || rn == 0) ? nr : rn;
+ for (int i = 0; i < nm; i++)
+ {
+ const int _mr = (i != nm - 1 || rm == 0) ? mr : rm;
+ if (_mr == mr && _nr == nr)
+ {
+ sgemm_rowmajoy_micro_kernel(&lhs_ptr[i * mr * kstride], &rhs_ptr[j * nr * kstride],
+ &res_ptr[i * mr * nstride + j * nr], kb, k0, nstride);
+ }
+ else
+ {
+ float res_micro[mr * nr];
+ float *res = &res_ptr[i * mr * nstride + j * nr];
+
+ sgemm_rowmajoy_micro_kernel(&lhs_ptr[i * mr * kstride], &rhs_ptr[j * nr * kstride],
+ res_micro, kb, 0, nr);
+ if (k0 == 0)
+ {
+ for (int pi = 0; pi < _mr; pi++)
+ {
+ for (int pj = 0; pj < _nr; pj++)
+ {
+ res[pi * nstride + pj] = res_micro[pi * nr + pj];
+ }
+ }
+ }
+ else
+ {
+ for (int pi = 0; pi < _mr; pi++)
+ {
+ for (int pj = 0; pj < _nr; pj++)
+ {
+ res[pi * nstride + pj] += res_micro[pi * nr + pj];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
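+// Same as _sgemm_rowmajor_macro_kernel_divnm, but iterating over tile rows in
+// the outer loop.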
+void _sgemm_rowmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int nstride,
+ const int kstride)
+{
+ const int nm = (mb + mr - 1) / mr;
+ const int nn = (nb + nr - 1) / nr;
+ const int rm = mb % mr;
+ const int rn = nb % nr;
+
+ sgemm_rowmajoy_micro_kernel_func sgemm_rowmajoy_micro_kernel =
+ sgemm_rowmajoy_micro_kernel_table[mr / 2 - 1][nr / 2 - 1];
+ if (!sgemm_rowmajoy_micro_kernel)
+ return;
+
+ for (int j = 0; j < nm; j++)
+ {
+ const int _mr = (j != nm - 1 || rm == 0) ? mr : rm;
+ for (int i = 0; i < nn; i++)
+ {
+ const int _nr = (i != nn - 1 || rn == 0) ? nr : rn;
+ if (_mr == mr && _nr == nr)
+ {
+        sgemm_rowmajor_micro_kernel(&lhs_ptr[j * mr * kstride], &rhs_ptr[i * nr * kstride],
+ &res_ptr[j * mr * nstride + i * nr], kb, k0, nstride);
+ }
+ else
+ {
+ float res_micro[mr * nr];
+ float *res = &res_ptr[j * mr * nstride + i * nr];
+
+        sgemm_rowmajor_micro_kernel(&lhs_ptr[j * mr * kstride], &rhs_ptr[i * nr * kstride],
+ res_micro, kb, 0, nr);
+ if (k0 == 0)
+ {
+ for (int pi = 0; pi < _mr; pi++)
+ {
+ for (int pj = 0; pj < _nr; pj++)
+ {
+ res[pi * nstride + pj] = res_micro[pi * nr + pj];
+ }
+ }
+ }
+ else
+ {
+ for (int pi = 0; pi < _mr; pi++)
+ {
+ for (int pj = 0; pj < _nr; pj++)
+ {
+ res[pi * nstride + pj] += res_micro[pi * nr + pj];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
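+// The column-major macro kernels reuse the row-major ones with the operands and the m/n roles
+// swapped: a column-major result is handled as the row-major view of its transpose.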
+void _sgemm_colmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int mstride,
+ const int kstride)
+{
+ _sgemm_rowmajor_macro_kernel_divmn(nr, mr, nb, mb, kb, rhs_ptr, lhs_ptr, res_ptr, k0, mstride,
+ kstride);
+}
+
+void _sgemm_colmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int mstride,
+ const int kstride)
+{
+ _sgemm_rowmajor_macro_kernel_divnm(nr, mr, nb, mb, kb, rhs_ptr, lhs_ptr, res_ptr, k0, mstride,
+ kstride);
+}
+
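+// Sparse AXPY-style kernel: accumulates lhs_data * rhs_ptr[0..nb) into res_ptr[0..nb),
+// processing 8 floats per NEON iteration with a 4-wide block and a scalar loop for the tail.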
+#if __aarch64__
+void _sparse_sgemm_kernel(const int nb, float lhs_data, const float *rhs_ptr, float *res_ptr)
+{
+ int nn = nb >> 3;
+ int rn = nb & 7;
+
+ if (nn > 0)
+ {
+ asm volatile("mov x0, %[res_ptr]\n"
+ "dup v0.2d, %[lhs_data]\n"
+ "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"
+ "ld1 {v2.4s}, [x0], #16\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "beq 2f\n"
+
+ "1:\n"
+ "ld1 {v4.4s}, [x0], #16\n"
+ "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
+
+ "fmla v2.4s, v1.4s, v0.s[0]\n"
+ "st1 {v2.4s}, [%[res_ptr]], #16\n"
+
+ "ld1 {v2.4s}, [x0], #16\n"
+ "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"
+
+ "fmla v4.4s, v3.4s, v0.s[0]\n"
+ "st1 {v4.4s}, [%[res_ptr]], #16\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "bne 1b\n"
+
+ "2:\n"
+ "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
+ "ld1 {v4.4s}, [x0], #16\n"
+
+ "fmla v2.4s, v1.4s, v0.s[0]\n"
+ "st1 {v2.4s}, [%[res_ptr]], #16\n"
+
+ "fmla v4.4s, v3.4s, v0.s[0]\n"
+ "st1 {v4.4s}, [%[res_ptr]], #16\n"
+ : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), [nn] "+r"(nn)
+ : [lhs_data] "r"(lhs_data)
+ : "x0", "v0", "v1", "v2", "v3", "v4", "cc");
+ }
+ if (rn > 0)
+ {
+ int _nn = rn >> 2;
+ int _rn = rn & 3;
+
+ if (_nn > 0)
+ {
+ asm volatile("dup v0.2d, %[lhs_data]\n"
+ "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"
+ "ld1 {v2.4s}, [%[res_ptr]]\n"
+ "fmla v2.4s, v1.4s, v0.s[0]\n"
+ "st1 {v2.4s}, [%[res_ptr]], #16\n"
+ : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr)
+ : [lhs_data] "r"(lhs_data)
+                   : "v0", "v1", "v2", "cc");
+ }
+ if (_rn > 0)
+ {
+ for (int i = 0; i < _rn; i++)
+ {
+ res_ptr[i] += lhs_data * rhs_ptr[i];
+ }
+ }
+ }
+}
+
+#else // __aarch64__
+void _sparse_sgemm_kernel(const int nb, float lhs_data, const float *rhs_ptr, float *res_ptr)
+{
+ int nn = nb >> 3;
+ int rn = nb & 7;
+
+ if (nn > 0)
+ {
+ asm volatile("mov r0, %[res_ptr]\n"
+ "vdup.32 d0, %[lhs_data]\n"
+ "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n"
+ "vld1.f32 {d4-d5}, [r0]!\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "beq 2f\n"
+
+ "1:\n"
+ "vld1.f32 {d8-d9}, [r0]!\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q2, q1, d0[0]\n"
+ "vst1.f32 {d4-d5}, [%[res_ptr]]!\n"
+
+ "vld1.f32 {d4-d5}, [r0]!\n"
+ "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n"
+
+ "vmla.f32 q4, q3, d0[0]\n"
+ "vst1.f32 {d8-d9}, [%[res_ptr]]!\n"
+
+ "subs %[nn], %[nn], #1\n"
+ "bne 1b\n"
+
+ "2:\n"
+ "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n"
+ "vld1.f32 {d8-d9}, [r0]!\n"
+
+ "vmla.f32 q2, q1, d0[0]\n"
+ "vst1.f32 {d4-d5}, [%[res_ptr]]!\n"
+
+ "vmla.f32 q4, q3, d0[0]\n"
+ "vst1.f32 {d8-d9}, [%[res_ptr]]!\n"
+ : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), [nn] "+r"(nn)
+ : [lhs_data] "r"(lhs_data)
+ : "r0", "q0", "q1", "q2", "q3", "q4", "cc");
+ }
+ if (rn > 0)
+ {
+ int _nn = rn >> 2;
+ int _rn = rn & 3;
+
+ if (_nn > 0)
+ {
+ asm volatile("vdup.32 d0, %[lhs_data]\n"
+ "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n"
+ "vld1.f32 {d4-d5}, [%[res_ptr]]\n"
+ "vmla.f32 q2, q1, d0[0]\n"
+ "vst1.f32 {d4-d5}, [%[res_ptr]]!\n"
+ : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr)
+ : [lhs_data] "r"(lhs_data)
+ : "q0", "q1", "q2", "cc");
+ }
+ if (_rn > 0)
+ {
+ for (int i = 0; i < _rn; i++)
+ {
+ res_ptr[i] += lhs_data * rhs_ptr[i];
+ }
+ }
+ }
+}
+#endif // __aarch64__
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/sgemm_kernel.h b/compute/ncnn/src/srcn/sgemm_kernel.h
new file mode 100644
index 000000000..9e220bc33
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_kernel.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_SGEMM_KERNEL_H__
+#define __NNFW_SRCN_SGEMM_KERNEL_H__
+
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void _sgemm_rowmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int nstride,
+ const int kstride);
+
+void _sgemm_rowmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int nstride,
+ const int kstride);
+
+void _sgemm_colmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int mstride,
+ const int kstride);
+
+void _sgemm_colmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb,
+ const int kb, const float *lhs_ptr, const float *rhs_ptr,
+ float *res_ptr, const int k0, const int mstride,
+ const int kstride);
+
+void _sparse_sgemm_kernel(const int nb, float lhs_data, const float *rhs_ptr, float *res_ptr);
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_SGEMM_KERNEL_H__
diff --git a/compute/ncnn/src/srcn/sgemm_pack.cc b/compute/ncnn/src/srcn/sgemm_pack.cc
new file mode 100644
index 000000000..8767f6c0a
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_pack.cc
@@ -0,0 +1,2316 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+#include <arm_neon.h>
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
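+// Packs an mb x kb block of the row-major LHS (leading dimension `stride`) into contiguous
+// mr-wide panels as expected by the micro kernels. Full mr-row groups are transposed four
+// columns at a time with NEON zip instructions; the trailing rm < mr rows are packed with
+// scalar code and zero-padded up to mr.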
+void _pack_rowmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr)
+{
+ const int nm = mb / mr;
+ const int rm = mb % mr;
+
+ switch (mr)
+ {
+#if __aarch64__
+ case 24:
+ for (int i = 0; i < nm; i++)
+ {
+ int nk = kb >> 2;
+ int rk = kb & 0x03;
+
+ const float *lhs_temp = lhs_ptr;
+ const int _stride = stride << 2;
+
+ if (nk > 0)
+ {
+ asm volatile("0:\n"
+ "mov x0, %[lhs_temp]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v8.4s, v10.4s\n"
+ "zip2 v30.4s, v8.4s, v10.4s\n"
+ "zip1 v29.4s, v9.4s, v11.4s\n"
+ "zip2 v31.4s, v9.4s, v11.4s\n"
+ "zip1 v8.4s, v28.4s, v29.4s\n"
+ "zip2 v9.4s, v28.4s, v29.4s\n"
+ "zip1 v10.4s, v30.4s, v31.4s\n"
+ "zip2 v11.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v14.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v15.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v12.4s, v14.4s\n"
+ "zip2 v30.4s, v12.4s, v14.4s\n"
+ "zip1 v29.4s, v13.4s, v15.4s\n"
+ "zip2 v31.4s, v13.4s, v15.4s\n"
+ "zip1 v12.4s, v28.4s, v29.4s\n"
+ "zip2 v13.4s, v28.4s, v29.4s\n"
+ "zip1 v14.4s, v30.4s, v31.4s\n"
+ "zip2 v15.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v16.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v17.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v18.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v19.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v16.4s, v18.4s\n"
+ "zip2 v30.4s, v16.4s, v18.4s\n"
+ "zip1 v29.4s, v17.4s, v19.4s\n"
+ "zip2 v31.4s, v17.4s, v19.4s\n"
+ "zip1 v16.4s, v28.4s, v29.4s\n"
+ "zip2 v17.4s, v28.4s, v29.4s\n"
+ "zip1 v18.4s, v30.4s, v31.4s\n"
+ "zip2 v19.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v20.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v21.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v22.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v23.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v20.4s, v22.4s\n"
+ "zip2 v30.4s, v20.4s, v22.4s\n"
+ "zip1 v29.4s, v21.4s, v23.4s\n"
+ "zip2 v31.4s, v21.4s, v23.4s\n"
+ "zip1 v20.4s, v28.4s, v29.4s\n"
+ "zip2 v21.4s, v28.4s, v29.4s\n"
+ "zip1 v22.4s, v30.4s, v31.4s\n"
+ "zip2 v23.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v24.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v25.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v26.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v27.4s}, [x0]\n"
+
+ "zip1 v28.4s, v24.4s, v26.4s\n"
+ "zip2 v30.4s, v24.4s, v26.4s\n"
+ "zip1 v29.4s, v25.4s, v27.4s\n"
+ "zip2 v31.4s, v25.4s, v27.4s\n"
+ "zip1 v24.4s, v28.4s, v29.4s\n"
+ "zip2 v25.4s, v28.4s, v29.4s\n"
+ "zip1 v26.4s, v30.4s, v31.4s\n"
+ "zip2 v27.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v8.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v12.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v16.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v20.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v24.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v9.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v13.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v17.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v21.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v25.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v10.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v14.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v18.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v22.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v26.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v11.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v15.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v19.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v23.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v27.4s}, [%[plhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+ "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ plhs_ptr[0] = lhs_temp[0];
+ plhs_ptr[1] = lhs_temp[stride];
+ plhs_ptr[2] = lhs_temp[stride << 1];
+ plhs_ptr[3] = lhs_temp[3 * stride];
+ plhs_ptr[4] = lhs_temp[stride << 2];
+ plhs_ptr[5] = lhs_temp[5 * stride];
+ plhs_ptr[6] = lhs_temp[6 * stride];
+ plhs_ptr[7] = lhs_temp[7 * stride];
+ plhs_ptr[8] = lhs_temp[stride << 3];
+ plhs_ptr[9] = lhs_temp[9 * stride];
+ plhs_ptr[10] = lhs_temp[10 * stride];
+ plhs_ptr[11] = lhs_temp[11 * stride];
+          plhs_ptr[12] = lhs_temp[12 * stride];
+ plhs_ptr[13] = lhs_temp[13 * stride];
+ plhs_ptr[14] = lhs_temp[14 * stride];
+ plhs_ptr[15] = lhs_temp[15 * stride];
+ plhs_ptr[16] = lhs_temp[stride << 4];
+ plhs_ptr[17] = lhs_temp[17 * stride];
+ plhs_ptr[18] = lhs_temp[18 * stride];
+ plhs_ptr[19] = lhs_temp[19 * stride];
+ plhs_ptr[20] = lhs_temp[20 * stride];
+ plhs_ptr[21] = lhs_temp[21 * stride];
+ plhs_ptr[22] = lhs_temp[22 * stride];
+ plhs_ptr[23] = lhs_temp[23 * stride];
+ plhs_ptr += mr;
+ lhs_temp++;
+ }
+
+ lhs_ptr += mr * stride;
+ }
+ break;
+ case 16:
+ for (int i = 0; i < nm; i++)
+ {
+ int nk = kb >> 2;
+ int rk = kb & 0x03;
+
+ const float *lhs_temp = lhs_ptr;
+ const int _stride = stride << 2;
+
+ if (nk > 0)
+ {
+ asm volatile("0:\n"
+ "mov x0, %[lhs_temp]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v8.4s, v10.4s\n"
+ "zip2 v30.4s, v8.4s, v10.4s\n"
+ "zip1 v29.4s, v9.4s, v11.4s\n"
+ "zip2 v31.4s, v9.4s, v11.4s\n"
+ "zip1 v8.4s, v28.4s, v29.4s\n"
+ "zip2 v9.4s, v28.4s, v29.4s\n"
+ "zip1 v10.4s, v30.4s, v31.4s\n"
+ "zip2 v11.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v14.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v15.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v12.4s, v14.4s\n"
+ "zip2 v30.4s, v12.4s, v14.4s\n"
+ "zip1 v29.4s, v13.4s, v15.4s\n"
+ "zip2 v31.4s, v13.4s, v15.4s\n"
+ "zip1 v12.4s, v28.4s, v29.4s\n"
+ "zip2 v13.4s, v28.4s, v29.4s\n"
+ "zip1 v14.4s, v30.4s, v31.4s\n"
+ "zip2 v15.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v16.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v17.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v18.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v19.4s}, [x0]\n"
+
+ "zip1 v28.4s, v16.4s, v18.4s\n"
+ "zip2 v30.4s, v16.4s, v18.4s\n"
+ "zip1 v29.4s, v17.4s, v19.4s\n"
+ "zip2 v31.4s, v17.4s, v19.4s\n"
+ "zip1 v16.4s, v28.4s, v29.4s\n"
+ "zip2 v17.4s, v28.4s, v29.4s\n"
+ "zip1 v18.4s, v30.4s, v31.4s\n"
+ "zip2 v19.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v8.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v12.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v16.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v9.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v13.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v17.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v10.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v14.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v18.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v11.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v15.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v19.4s}, [%[plhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29",
+ "v30", "v31");
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ plhs_ptr[0] = lhs_temp[0];
+ plhs_ptr[1] = lhs_temp[stride];
+ plhs_ptr[2] = lhs_temp[stride << 1];
+ plhs_ptr[3] = lhs_temp[3 * stride];
+ plhs_ptr[4] = lhs_temp[stride << 2];
+ plhs_ptr[5] = lhs_temp[5 * stride];
+ plhs_ptr[6] = lhs_temp[6 * stride];
+ plhs_ptr[7] = lhs_temp[7 * stride];
+ plhs_ptr[8] = lhs_temp[stride << 3];
+ plhs_ptr[9] = lhs_temp[9 * stride];
+ plhs_ptr[10] = lhs_temp[10 * stride];
+ plhs_ptr[11] = lhs_temp[11 * stride];
+          plhs_ptr[12] = lhs_temp[12 * stride];
+ plhs_ptr[13] = lhs_temp[13 * stride];
+ plhs_ptr[14] = lhs_temp[14 * stride];
+ plhs_ptr[15] = lhs_temp[15 * stride];
+ plhs_ptr += mr;
+ lhs_temp++;
+ }
+
+ lhs_ptr += mr * stride;
+ }
+ break;
+#endif // __aarch64__
+ case 12:
+ for (int i = 0; i < nm; i++)
+ {
+ int nk = kb >> 2;
+ int rk = kb & 0x03;
+
+ const float *lhs_temp = lhs_ptr;
+ const int _stride = stride << 2;
+
+ if (nk > 0)
+ {
+#if __aarch64__
+ asm volatile("0:\n"
+ "mov x0, %[lhs_temp]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v8.4s, v10.4s\n"
+ "zip2 v30.4s, v8.4s, v10.4s\n"
+ "zip1 v29.4s, v9.4s, v11.4s\n"
+ "zip2 v31.4s, v9.4s, v11.4s\n"
+ "zip1 v8.4s, v28.4s, v29.4s\n"
+ "zip2 v9.4s, v28.4s, v29.4s\n"
+ "zip1 v10.4s, v30.4s, v31.4s\n"
+ "zip2 v11.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v14.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v15.4s}, [x0]\n"
+
+ "zip1 v28.4s, v12.4s, v14.4s\n"
+ "zip2 v30.4s, v12.4s, v14.4s\n"
+ "zip1 v29.4s, v13.4s, v15.4s\n"
+ "zip2 v31.4s, v13.4s, v15.4s\n"
+ "zip1 v12.4s, v28.4s, v29.4s\n"
+ "zip2 v13.4s, v28.4s, v29.4s\n"
+ "zip1 v14.4s, v30.4s, v31.4s\n"
+ "zip2 v15.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v8.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v12.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v9.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v13.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v10.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v14.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v11.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v15.4s}, [%[plhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v28", "v29", "v30", "v31");
+#else // __aarch64__
+ asm volatile("0:\n"
+ "mov r0, %[lhs_temp]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+
+ "vzip.32 q8, q10\n"
+ "vzip.32 q9, q11\n"
+ "vzip.32 q8, q9\n"
+ "vzip.32 q10, q11\n"
+
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d28-d29}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d30-d31}, [r0]\n"
+
+ "vzip.32 q12, q14\n"
+ "vzip.32 q13, q15\n"
+ "vzip.32 q12, q13\n"
+ "vzip.32 q14, q15\n"
+
+ "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d16-d17}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d24-d25}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d18-d19}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d26-d27}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d20-d21}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d28-d29}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d22-d23}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d30-d31}, [%[plhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+#endif // __aarch64__
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ plhs_ptr[0] = lhs_temp[0];
+ plhs_ptr[1] = lhs_temp[stride];
+ plhs_ptr[2] = lhs_temp[stride << 1];
+ plhs_ptr[3] = lhs_temp[3 * stride];
+ plhs_ptr[4] = lhs_temp[stride << 2];
+ plhs_ptr[5] = lhs_temp[5 * stride];
+ plhs_ptr[6] = lhs_temp[6 * stride];
+ plhs_ptr[7] = lhs_temp[7 * stride];
+ plhs_ptr[8] = lhs_temp[stride << 3];
+ plhs_ptr[9] = lhs_temp[9 * stride];
+ plhs_ptr[10] = lhs_temp[10 * stride];
+ plhs_ptr[11] = lhs_temp[11 * stride];
+ plhs_ptr += mr;
+ lhs_temp++;
+ }
+
+ lhs_ptr += mr * stride;
+ }
+ break;
+ case 8:
+ for (int i = 0; i < nm; i++)
+ {
+ int nk = kb >> 2;
+ int rk = kb & 0x03;
+
+ const float *lhs_temp = lhs_ptr;
+ const int _stride = stride << 2;
+
+ if (nk > 0)
+ {
+#if __aarch64__
+ asm volatile("0:\n"
+ "mov x0, %[lhs_temp]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "zip1 v28.4s, v8.4s, v10.4s\n"
+ "zip2 v30.4s, v8.4s, v10.4s\n"
+ "zip1 v29.4s, v9.4s, v11.4s\n"
+ "zip2 v31.4s, v9.4s, v11.4s\n"
+ "zip1 v8.4s, v28.4s, v29.4s\n"
+ "zip2 v9.4s, v28.4s, v29.4s\n"
+ "zip1 v10.4s, v30.4s, v31.4s\n"
+ "zip2 v11.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v8.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v9.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v10.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v11.4s}, [%[plhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v28", "v29", "v30", "v31");
+#else // __aarch64__
+ asm volatile("0:\n"
+ "mov r0, %[lhs_temp]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vzip.32 q8, q10\n"
+ "vzip.32 q9, q11\n"
+ "vzip.32 q8, q9\n"
+ "vzip.32 q10, q11\n"
+
+ "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d16-d17}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d18-d19}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d20-d21}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d22-d23}, [%[plhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
+#endif // __aarch64__
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ plhs_ptr[0] = lhs_temp[0];
+ plhs_ptr[1] = lhs_temp[stride];
+ plhs_ptr[2] = lhs_temp[stride << 1];
+ plhs_ptr[3] = lhs_temp[3 * stride];
+ plhs_ptr[4] = lhs_temp[stride << 2];
+ plhs_ptr[5] = lhs_temp[5 * stride];
+ plhs_ptr[6] = lhs_temp[6 * stride];
+ plhs_ptr[7] = lhs_temp[7 * stride];
+ plhs_ptr += mr;
+ lhs_temp++;
+ }
+
+ lhs_ptr += mr * stride;
+ }
+ break;
+ case 6:
+ for (int i = 0; i < nm; i++)
+ {
+ int nk = kb >> 2;
+ int rk = kb & 0x03;
+
+ const float *lhs_temp = lhs_ptr;
+ const int _stride = stride << 2;
+
+ if (nk > 0)
+ {
+#if __aarch64__
+          // TODO: 4 ---> 6: this aarch64 variant still transposes and stores only rows 0-3
+          // (v8 is loaded but unused and rows 4-5 are dropped); extend it to a full 6-row pack.
+ asm volatile("0:\n"
+ "mov x0, %[lhs_temp]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v8.4s}, [x0]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[plhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+                       : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v28", "v29", "v30",
+                         "v31");
+#else // __aarch64__
+ asm volatile("0:\n"
+ "mov r0, %[lhs_temp]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+ "vzip.32 q8, q9\n"
+
+ "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d16}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d17}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d18}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d19}, [%[plhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9");
+#endif // __aarch64__
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ plhs_ptr[0] = lhs_temp[0];
+ plhs_ptr[1] = lhs_temp[stride];
+ plhs_ptr[2] = lhs_temp[stride << 1];
+ plhs_ptr[3] = lhs_temp[3 * stride];
+ plhs_ptr[4] = lhs_temp[stride << 2];
+ plhs_ptr[5] = lhs_temp[5 * stride];
+ plhs_ptr += mr;
+ lhs_temp++;
+ }
+
+ lhs_ptr += mr * stride;
+ }
+ break;
+ case 4:
+ for (int i = 0; i < nm; i++)
+ {
+ int nk = kb >> 2;
+ int rk = kb & 0x03;
+
+ const float *lhs_temp = lhs_ptr;
+ const int _stride = stride << 2;
+
+ if (nk > 0)
+ {
+#if __aarch64__
+ asm volatile("0:\n"
+ "mov x0, %[lhs_temp]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[plhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[plhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v28", "v29", "v30", "v31");
+#else // __aarch64__
+ asm volatile("0:\n"
+ "mov r0, %[lhs_temp]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+
+ "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[lhs_temp], %[lhs_temp], #16\n"
+ "bne 0b\n"
+ : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7");
+#endif // __aarch64__
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ plhs_ptr[0] = lhs_temp[0];
+ plhs_ptr[1] = lhs_temp[stride];
+ plhs_ptr[2] = lhs_temp[stride << 1];
+ plhs_ptr[3] = lhs_temp[3 * stride];
+ plhs_ptr += mr;
+ lhs_temp++;
+ }
+
+ lhs_ptr += mr * stride;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (rm > 0)
+ {
+ for (int j = 0; j < kb; j++)
+ {
+ for (int i = 0; i < rm; i++)
+ {
+ plhs_ptr[i] = lhs_ptr[i * stride];
+ }
+ for (int i = rm; i < mr; i++)
+ {
+ plhs_ptr[i] = 0.f;
+ }
+ plhs_ptr += mr;
+ lhs_ptr++;
+ }
+ }
+}
+
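+// Packs a kb x nb block of the row-major RHS into contiguous nr-wide column panels. No
+// transpose is needed: each source row already holds nr consecutive elements of a panel, so
+// rows are copied panel by panel and the trailing rn < nr columns are zero-padded up to nr.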
+void _pack_rowmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr)
+{
+ const int nn = nb / nr;
+ const int rn = nb % nr;
+
+ switch (nr)
+ {
+ case 24:
+ for (int j = 0; j < nn; j++)
+ {
+ const float *rhs_temp = rhs_ptr;
+ float32x4_t q0, q1, q2, q3, q4, q5;
+ for (int i = 0; i < kb; i++)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1q_f32(rhs_temp + 4);
+ q2 = vld1q_f32(rhs_temp + 8);
+ q3 = vld1q_f32(rhs_temp + 12);
+ q4 = vld1q_f32(rhs_temp + 16);
+ q5 = vld1q_f32(rhs_temp + 20);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+ vst1q_f32(prhs_ptr + 12, q3);
+ vst1q_f32(prhs_ptr + 16, q4);
+ vst1q_f32(prhs_ptr + 20, q5);
+
+ rhs_temp += stride;
+ prhs_ptr += nr;
+ }
+
+ rhs_ptr += nr;
+ }
+ break;
+ case 16:
+ for (int j = 0; j < nn; j++)
+ {
+ const float *rhs_temp = rhs_ptr;
+ float32x4_t q0, q1, q2, q3;
+ for (int i = 0; i < kb; i++)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1q_f32(rhs_temp + 4);
+ q2 = vld1q_f32(rhs_temp + 8);
+ q3 = vld1q_f32(rhs_temp + 12);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+ vst1q_f32(prhs_ptr + 12, q3);
+
+ rhs_temp += stride;
+ prhs_ptr += nr;
+ }
+
+ rhs_ptr += nr;
+ }
+ break;
+ case 12:
+ for (int j = 0; j < nn; j++)
+ {
+ const float *rhs_temp = rhs_ptr;
+ float32x4_t q0, q1, q2;
+ for (int i = 0; i < kb; i++)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1q_f32(rhs_temp + 4);
+ q2 = vld1q_f32(rhs_temp + 8);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+
+ rhs_temp += stride;
+ prhs_ptr += nr;
+ }
+
+ rhs_ptr += nr;
+ }
+ break;
+ case 8:
+ for (int j = 0; j < nn; j++)
+      {
+ const float *rhs_temp = rhs_ptr;
+ float32x4_t q0, q1, q2, q3;
+
+ int i = 0;
+ for (; i + 1 < kb; i += 2)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1q_f32(rhs_temp + 4);
+ q2 = vld1q_f32(rhs_temp + stride);
+ q3 = vld1q_f32(rhs_temp + stride + 4);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+ vst1q_f32(prhs_ptr + 12, q3);
+
+ rhs_temp += stride << 1;
+ prhs_ptr += nr << 1;
+ }
+
+ for (; i < kb; i++)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1q_f32(rhs_temp + 4);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+
+ rhs_temp += stride;
+ prhs_ptr += nr;
+ }
+
+ rhs_ptr += nr;
+ }
+ break;
+ case 6:
+ for (int j = 0; j < nn; j++)
+      {
+ const float *rhs_temp = rhs_ptr;
+ float32x4_t q0, q2;
+ float32x2_t q1, q3;
+
+ int i = 0;
+ for (; i + 1 < kb; i += 2)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1_f32(rhs_temp + 4);
+
+ q2 = vld1q_f32(rhs_temp + stride);
+ q3 = vld1_f32(rhs_temp + stride + 4);
+ vst1q_f32(prhs_ptr, q0);
+ vst1_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 6, q2);
+ vst1_f32(prhs_ptr + 10, q3);
+
+ rhs_temp += stride << 1;
+ prhs_ptr += nr << 1;
+ }
+
+ for (; i < kb; i++)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1_f32(rhs_temp + 4);
+
+ vst1q_f32(prhs_ptr, q0);
+ vst1_f32(prhs_ptr + 4, q1);
+
+ rhs_temp += stride;
+ prhs_ptr += nr;
+ }
+
+ rhs_ptr += nr;
+ }
+ break;
+ case 4:
+ for (int j = 0; j < nn; j++)
+      {
+ const float *rhs_temp = rhs_ptr;
+ float32x4_t q0, q1, q2, q3;
+
+ int i = 0;
+ for (; i + 3 < kb; i += 4)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1q_f32(rhs_temp + stride);
+ q2 = vld1q_f32(rhs_temp + (stride << 1));
+ q3 = vld1q_f32(rhs_temp + (stride * 3));
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+ vst1q_f32(prhs_ptr + 12, q3);
+
+ rhs_temp += stride << 2;
+ prhs_ptr += nr << 2;
+ }
+ for (; i + 1 < kb; i += 2)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ q1 = vld1q_f32(rhs_temp + stride);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+
+ rhs_temp += stride << 1;
+ prhs_ptr += nr << 1;
+ }
+ for (; i < kb; i++)
+ {
+ q0 = vld1q_f32(rhs_temp);
+ vst1q_f32(prhs_ptr, q0);
+
+ rhs_temp += stride;
+ prhs_ptr += nr;
+ }
+
+ rhs_ptr += nr;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (rn > 0)
+ {
+ for (int i = 0; i < kb; i++)
+ {
+ for (int j = 0; j < rn; j++)
+ {
+ prhs_ptr[j] = rhs_ptr[j];
+ }
+ for (int j = rn; j < nr; j++)
+ {
+ prhs_ptr[j] = 0.f;
+ }
+ prhs_ptr += nr;
+ rhs_ptr += stride;
+ }
+ }
+}
+
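+// Packing a transposed operand is the same as packing the opposite operand without
+// transposition, so these wrappers simply swap the lhs/rhs routines.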
+void _pack_rowmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr)
+{
+ _pack_rowmajor_notrans_rhs(mr, mb, kb, stride, lhs_ptr, plhs_ptr);
+}
+
+void _pack_rowmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr)
+{
+ _pack_rowmajor_notrans_lhs(nr, nb, kb, stride, rhs_ptr, prhs_ptr);
+}
+
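+// Scatters one row of `nb` values (already in output order) into the nr-wide panels of the
+// packed RHS, `stride` floats apart per panel, zero-padding the final partial panel.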
+static inline void _pack_rowmajor_image_subn(const int nr, const int nb, const int stride,
+ const float *buffer, float *prhs_ptr)
+{
+ const int nn = nb / nr;
+ const int rn = nb % nr;
+
+ switch (nr)
+ {
+ case 24:
+ for (int j = 0; j < nn; j++)
+ {
+ float32x4_t q0, q1, q2, q3, q4, q5;
+ q0 = vld1q_f32(buffer);
+ q1 = vld1q_f32(buffer + 4);
+ q2 = vld1q_f32(buffer + 8);
+ q3 = vld1q_f32(buffer + 12);
+ q4 = vld1q_f32(buffer + 16);
+ q5 = vld1q_f32(buffer + 20);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+ vst1q_f32(prhs_ptr + 12, q3);
+ vst1q_f32(prhs_ptr + 16, q4);
+ vst1q_f32(prhs_ptr + 20, q5);
+ prhs_ptr += stride;
+ buffer += nr;
+ }
+ break;
+ case 16:
+ for (int j = 0; j < nn; j++)
+ {
+ float32x4_t q0, q1, q2, q3;
+ q0 = vld1q_f32(buffer);
+ q1 = vld1q_f32(buffer + 4);
+ q2 = vld1q_f32(buffer + 8);
+ q3 = vld1q_f32(buffer + 12);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+ vst1q_f32(prhs_ptr + 12, q3);
+ prhs_ptr += stride;
+ buffer += nr;
+ }
+ break;
+ case 12:
+ for (int j = 0; j < nn; j++)
+ {
+ float32x4_t q0, q1, q2;
+ q0 = vld1q_f32(buffer);
+ q1 = vld1q_f32(buffer + 4);
+ q2 = vld1q_f32(buffer + 8);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ vst1q_f32(prhs_ptr + 8, q2);
+ prhs_ptr += stride;
+ buffer += nr;
+ }
+ break;
+ case 8:
+ for (int j = 0; j < nn; j++)
+ {
+ float32x4_t q0, q1;
+ q0 = vld1q_f32(buffer);
+ q1 = vld1q_f32(buffer + 4);
+ vst1q_f32(prhs_ptr, q0);
+ vst1q_f32(prhs_ptr + 4, q1);
+ prhs_ptr += stride;
+ buffer += nr;
+ }
+ break;
+ case 6:
+ for (int j = 0; j < nn; j++)
+ {
+ float32x4_t q0;
+ float32x2_t q1;
+ q0 = vld1q_f32(buffer);
+ q1 = vld1_f32(buffer + 4);
+ vst1q_f32(prhs_ptr, q0);
+ vst1_f32(prhs_ptr + 4, q1);
+ prhs_ptr += stride;
+ buffer += nr;
+ }
+ break;
+ case 4:
+ for (int j = 0; j < nn; j++)
+ {
+ float32x4_t q0;
+ q0 = vld1q_f32(buffer);
+ vst1q_f32(prhs_ptr, q0);
+ prhs_ptr += stride;
+ buffer += nr;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (rn > 0)
+ {
+ for (int j = 0; j < rn; j++)
+ {
+ prhs_ptr[j] = buffer[j];
+ }
+ for (int j = rn; j < nr; j++)
+ {
+ prhs_ptr[j] = 0.f;
+ }
+ }
+}
+
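+// im2col-style packing: builds the rows of the implicit convolution RHS directly from the
+// input feature map (channel-planar layout), zero-filling positions that fall into the
+// padding, and hands each row to _pack_rowmajor_image_subn so the result lands in nr-wide
+// panels without materializing a full im2col buffer.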
+void _pack_rowmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *prhs_ptr)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int outw = output->w;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+
+ const int in_row0 = n0 / outw * stride_h;
+ const int in_col0 = n0 % outw * stride_w;
+ int seg0 = outw - n0 % outw;
+ if (seg0 > nb)
+ seg0 = nb;
+ int rows = (nb - seg0 + outw - 1) / outw;
+ if (seg0)
+ rows++;
+ const int segn = (nb - seg0) % outw;
+
+ float row_data[nb];
+
+ for (int i = k0; i < kb + k0; i++)
+ {
+ const int ic = i / (kernel_w * kernel_h);
+ const int in_row1 = ((i / kernel_w) % kernel_h) * params->dilation_h + in_row0;
+ const int in_col1 = i % kernel_w * params->dilation_w;
+
+#ifdef NCNN
+ const float *input_data = input->data + ic * alignSize(w * h, 16 / sizeof(float));
+#else // NCNN
+ const float *input_data = input->data + ic * w * h;
+#endif // NCNN
+ float *buffer = row_data;
+ int in_row = in_row1 - pad_h;
+
+ for (int out_rows = rows; out_rows; out_rows--)
+ {
+ int cols = (out_rows != 1 || segn == 0) ? outw : segn;
+ int in_col = in_col1 - pad_w;
+ if (out_rows == rows)
+ {
+ cols = seg0;
+ in_col += in_col0;
+ }
+ if ((unsigned int)in_row < (unsigned int)h)
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ if ((unsigned int)in_col < (unsigned int)w)
+ *(buffer++) = input_data[in_row * w + in_col];
+ else
+ *(buffer++) = 0;
+ in_col += stride_w;
+ }
+ }
+ else
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ *(buffer++) = 0;
+ in_col += stride_w;
+ }
+ }
+
+ in_row += stride_h;
+ }
+
+ _pack_rowmajor_image_subn(nr, nb, nr * kb, row_data, prhs_ptr);
+ prhs_ptr += nr;
+ }
+}
+
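+// Batched variant: splits the requested nb output positions at image boundaries and calls
+// _pack_rowmajor_image_rhs once per image, advancing the input pointer by one image
+// (w * h * c floats, or the NCNN-aligned equivalent) each time.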
+void _pack_rowmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0,
+ const int n0, convMat_t *input, convMat_t *output,
+ convParams_t *params, float *prhs_ptr)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int c = input->c;
+
+#ifdef NCNN
+ const int seg_size = alignSize(output->w * output->h, 16 / sizeof(float));
+#else // NCNN
+ const int seg_size = output->w * output->h;
+#endif // NCNN
+
+#ifdef NCNN
+ float *data = input->data + (alignSize(w * h, 16 / sizeof(float)) * c) * (n0 / seg_size);
+#else // NCNN
+ float *data = input->data + (w * h * c) * (n0 / seg_size);
+#endif // NCNN
+
+ int seg0 = seg_size - n0 % seg_size;
+ if (seg0 > nb)
+ seg0 = nb;
+ int nseg = (nb - seg0 + seg_size - 1) / seg_size;
+ if (seg0)
+ nseg++;
+ const int segn = (nb - seg0) % seg_size;
+ convMat_t _input = {w, h, c, 1, data};
+
+ for (int i = 0; i < nseg; i++)
+ {
+    const int _nb =
+        ((i == 0 && seg0 != 0) ? seg0 : ((i == nseg - 1 && segn != 0) ? segn : seg_size));
+    const int _n0 = (i == 0 ? n0 % seg_size : 0);
+
+ _pack_rowmajor_image_rhs(nr, _nb, kb, k0, _n0, &_input, output, params, prhs_ptr);
+
+#ifdef NCNN
+ _input.data += alignSize(w * h, 16 / sizeof(float)) * c;
+#else // NCNN
+ _input.data += w * h * c;
+#endif // NCNN
+ }
+}
+
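+// col2im-style scatter: accumulates an mb x nb GEMM result tile back into the output feature
+// map (+= at every mapped location) and skips positions that fall outside the output bounds;
+// this is the inverse of the im2col packing above.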
+void _unpack_rowmajor_image_res(const int mb, const int nb, const int m0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *pres_ptr)
+{
+ const int outw = output->w;
+ const int outh = output->h;
+ const int w = input->w;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+
+ const int out_row0 = n0 / w * stride_h;
+ const int out_col0 = n0 % w * stride_w;
+ int seg0 = w - n0 % w;
+ if (seg0 > nb)
+ seg0 = nb;
+ int rows = (nb - seg0 + w - 1) / w;
+ if (seg0)
+ rows++;
+ const int segn = (nb - seg0) % w;
+
+ for (int i = m0; i < mb + m0; i++)
+ {
+ const int oc = i / (kernel_w * kernel_h);
+ const int out_row1 = ((i / kernel_w) % kernel_h) * params->dilation_h + out_row0;
+ const int out_col1 = i % kernel_w * params->dilation_w;
+
+#ifdef NCNN
+ float *output_data = output->data + oc * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *output_data = output->data + oc * outw * outh;
+#endif // NCNN
+ int out_row = out_row1 - pad_h;
+
+ for (int in_rows = rows; in_rows; in_rows--)
+ {
+ int cols = (in_rows != 1 || segn == 0) ? w : segn;
+ int out_col = out_col1 - pad_w;
+ if (in_rows == rows)
+ {
+ cols = seg0;
+ out_col += out_col0;
+ }
+ if ((unsigned int)out_row < (unsigned int)outh)
+ {
+ for (int in_col = cols; in_col; in_col--)
+ {
+ if ((unsigned int)out_col < (unsigned int)outw)
+ output_data[out_row * outw + out_col] += *pres_ptr++;
+ else
+ pres_ptr++;
+ out_col += stride_w;
+ }
+ }
+ else
+ {
+ pres_ptr += cols;
+ }
+ out_row += stride_h;
+ }
+ }
+}
+
+// TODO: add an aarch64 (ARMv8) path for nr == 6 and cover the remaining nr cases.
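+// Transposes an nr x k scratch tile (nr rows of length k, stored contiguously) into packed
+// k x nr panels, using the same 4x4 NEON zip transpose as the LHS packing; the k & 3
+// remainder columns are handled with scalar code.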
+static inline void _pack_colmajor_image_rhs_sub(const int nr, const int k, const float *buffer,
+ float *prhs_ptr)
+{
+ int nk = k >> 2;
+ int rk = k & 0x03;
+
+ const int _stride = k << 2;
+
+ switch (nr)
+ {
+ case 12:
+ if (nk > 0)
+ {
+#if __aarch64__
+ asm volatile("0:\n"
+ "mov x0, %[buffer]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v8.4s, v10.4s\n"
+ "zip2 v30.4s, v8.4s, v10.4s\n"
+ "zip1 v29.4s, v9.4s, v11.4s\n"
+ "zip2 v31.4s, v9.4s, v11.4s\n"
+ "zip1 v8.4s, v28.4s, v29.4s\n"
+ "zip2 v9.4s, v28.4s, v29.4s\n"
+ "zip1 v10.4s, v30.4s, v31.4s\n"
+ "zip2 v11.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v12.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v13.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v14.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v15.4s}, [x0]\n"
+
+ "zip1 v28.4s, v12.4s, v14.4s\n"
+ "zip2 v30.4s, v12.4s, v14.4s\n"
+ "zip1 v29.4s, v13.4s, v15.4s\n"
+ "zip2 v31.4s, v13.4s, v15.4s\n"
+ "zip1 v12.4s, v28.4s, v29.4s\n"
+ "zip2 v13.4s, v28.4s, v29.4s\n"
+ "zip1 v14.4s, v30.4s, v31.4s\n"
+ "zip2 v15.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v8.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v12.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v9.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v13.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v10.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v14.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v11.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v15.4s}, [%[prhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[buffer], %[buffer], #16\n"
+ "bne 0b\n"
+ : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v28", "v29", "v30", "v31");
+#else // __aarch64__
+ asm volatile("0:\n"
+ "mov r0, %[buffer]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+
+ "vzip.32 q8, q10\n"
+ "vzip.32 q9, q11\n"
+ "vzip.32 q8, q9\n"
+ "vzip.32 q10, q11\n"
+
+ "vld1.f32 {d24-d25}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d26-d27}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d28-d29}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d30-d31}, [r0]\n"
+
+ "vzip.32 q12, q14\n"
+ "vzip.32 q13, q15\n"
+ "vzip.32 q12, q13\n"
+ "vzip.32 q14, q15\n"
+
+ "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d16-d17}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d24-d25}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d18-d19}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d26-d27}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d20-d21}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d28-d29}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d22-d23}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d30-d31}, [%[prhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[buffer], %[buffer], #16\n"
+ "bne 0b\n"
+ : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+#endif // __aarch64__
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ prhs_ptr[0] = buffer[0];
+ prhs_ptr[1] = buffer[k];
+ prhs_ptr[2] = buffer[k << 1];
+ prhs_ptr[3] = buffer[3 * k];
+ prhs_ptr[4] = buffer[k << 2];
+ prhs_ptr[5] = buffer[5 * k];
+ prhs_ptr[6] = buffer[6 * k];
+ prhs_ptr[7] = buffer[7 * k];
+ prhs_ptr[8] = buffer[k << 3];
+ prhs_ptr[9] = buffer[9 * k];
+ prhs_ptr[10] = buffer[10 * k];
+ prhs_ptr[11] = buffer[11 * k];
+ prhs_ptr += nr;
+ buffer++;
+ }
+ break;
+
+ case 8:
+ if (nk > 0)
+ {
+#if __aarch64__
+ asm volatile("0:\n"
+ "mov x0, %[buffer]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "ld1 {v8.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v9.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v10.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v11.4s}, [x0]\n"
+
+ "zip1 v28.4s, v8.4s, v10.4s\n"
+ "zip2 v30.4s, v8.4s, v10.4s\n"
+ "zip1 v29.4s, v9.4s, v11.4s\n"
+ "zip2 v31.4s, v9.4s, v11.4s\n"
+ "zip1 v8.4s, v28.4s, v29.4s\n"
+ "zip2 v9.4s, v28.4s, v29.4s\n"
+ "zip1 v10.4s, v30.4s, v31.4s\n"
+ "zip2 v11.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v8.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v9.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v10.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v11.4s}, [%[prhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[buffer], %[buffer], #16\n"
+ "bne 0b\n"
+ : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v28", "v29", "v30", "v31");
+#else // __aarch64__
+ asm volatile("0:\n"
+ "mov r0, %[buffer]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d20-d21}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d22-d23}, [r0]\n"
+
+ "vzip.32 q8, q10\n"
+ "vzip.32 q9, q11\n"
+ "vzip.32 q8, q9\n"
+ "vzip.32 q10, q11\n"
+
+ "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d16-d17}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d18-d19}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d20-d21}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d22-d23}, [%[prhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[buffer], %[buffer], #16\n"
+ "bne 0b\n"
+ : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
+#endif // __aarch64__
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ prhs_ptr[0] = buffer[0];
+ prhs_ptr[1] = buffer[k];
+ prhs_ptr[2] = buffer[k << 1];
+ prhs_ptr[3] = buffer[3 * k];
+ prhs_ptr[4] = buffer[k << 2];
+ prhs_ptr[5] = buffer[5 * k];
+ prhs_ptr[6] = buffer[6 * k];
+ prhs_ptr[7] = buffer[7 * k];
+ prhs_ptr += nr;
+ buffer++;
+ }
+ break;
+#if !__aarch64__
+ case 6:
+ if (nk > 0)
+ {
+ asm volatile("0:\n"
+ "mov r0, %[buffer]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d16-d17}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d18-d19}, [r0]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+ "vzip.32 q8, q9\n"
+
+ "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d16}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d17}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d18}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d19}, [%[prhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[buffer], %[buffer], #16\n"
+ "bne 0b\n"
+ : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9");
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ prhs_ptr[0] = buffer[0];
+ prhs_ptr[1] = buffer[k];
+ prhs_ptr[2] = buffer[k << 1];
+ prhs_ptr[3] = buffer[3 * k];
+ prhs_ptr[4] = buffer[k << 2];
+ prhs_ptr[5] = buffer[5 * k];
+ prhs_ptr += nr;
+ buffer++;
+ }
+ break;
+#endif // !__aarch64__
+ case 4:
+ if (nk > 0)
+ {
+#if __aarch64__
+ asm volatile("0:\n"
+ "mov x0, %[buffer]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "add x0, x0, %[_stride]\n"
+ "ld1 {v7.4s}, [x0]\n"
+
+ "zip1 v28.4s, v4.4s, v6.4s\n"
+ "zip2 v30.4s, v4.4s, v6.4s\n"
+ "zip1 v29.4s, v5.4s, v7.4s\n"
+ "zip2 v31.4s, v5.4s, v7.4s\n"
+ "zip1 v4.4s, v28.4s, v29.4s\n"
+ "zip2 v5.4s, v28.4s, v29.4s\n"
+ "zip1 v6.4s, v30.4s, v31.4s\n"
+ "zip2 v7.4s, v30.4s, v31.4s\n"
+
+ "st1 {v4.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v5.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v6.4s}, [%[prhs_ptr]], #16\n"
+ "st1 {v7.4s}, [%[prhs_ptr]], #16\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[buffer], %[buffer], #16\n"
+ "bne 0b\n"
+ : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v28", "v29", "v30", "v31");
+#else // __aarch64__
+ asm volatile("0:\n"
+ "mov r0, %[buffer]\n"
+
+ "vld1.f32 {d8-d9}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d10-d11}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d12-d13}, [r0]\n"
+ "add r0, r0, %[_stride]\n"
+ "vld1.f32 {d14-d15}, [r0]\n"
+
+ "vzip.32 q4, q6\n"
+ "vzip.32 q5, q7\n"
+ "vzip.32 q4, q5\n"
+ "vzip.32 q6, q7\n"
+
+ "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n"
+ "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n"
+
+ "subs %[nk], %[nk], #1\n"
+ "add %[buffer], %[buffer], #16\n"
+ "bne 0b\n"
+ : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk)
+ : [_stride] "r"(_stride)
+ : "cc", "memory", "r0", "q4", "q5", "q6", "q7");
+#endif // __aarch64__
+ }
+
+ for (int j = 0; j < rk; j++)
+ {
+ prhs_ptr[0] = buffer[0];
+ prhs_ptr[1] = buffer[k];
+ prhs_ptr[2] = buffer[k << 1];
+ prhs_ptr[3] = buffer[3 * k];
+ prhs_ptr += nr;
+ buffer++;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
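+// Column-major packing reuses the row-major routines: packing a column-major matrix is the
+// same as packing the row-major view of its transpose, so the lhs/rhs and trans/notrans roles
+// are simply exchanged.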
+void _pack_colmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr)
+{
+ _pack_rowmajor_notrans_rhs(mr, mb, kb, stride, lhs_ptr, plhs_ptr);
+}
+
+void _pack_colmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr)
+{
+ _pack_rowmajor_notrans_lhs(nr, nb, kb, stride, rhs_ptr, prhs_ptr);
+}
+
+void _pack_colmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr)
+{
+ _pack_rowmajor_notrans_lhs(mr, mb, kb, stride, lhs_ptr, plhs_ptr);
+}
+
+void _pack_colmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr)
+{
+ _pack_rowmajor_notrans_rhs(nr, nb, kb, stride, rhs_ptr, prhs_ptr);
+}
+
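+// im2col packing for the channels-last (HWC) input layout: gathers nr output positions at a
+// time into a small scratch tile and transposes it into packed panels via
+// _pack_colmajor_image_rhs_sub, splitting the kb rows at channel-group boundaries so each
+// pass covers a single (ky, kx) kernel offset.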
+void _pack_colmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *prhs_ptr)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int c = input->c;
+ const int outw = output->w;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+ const float *input_data = input->data;
+
+ int c0 = c - k0 % c;
+ if (c0 > kb)
+ c0 = kb;
+ int nc = (kb - c0 + c - 1) / c;
+ if (c0)
+ nc++;
+ const int cn = (kb - c0) % c;
+
+ int seg0 = outw - n0 % outw;
+ if (seg0 > nb)
+ seg0 = nb;
+ int rows = (nb - seg0 + outw - 1) / outw;
+ if (seg0)
+ rows++;
+ const int segn = (nb - seg0) % outw;
+
+ const int in_row0 = n0 / outw * stride_h;
+ const int in_col0 = n0 % outw * stride_w;
+
+ for (int i = 0; i < nc; i++)
+ {
+ const int channels = (i == 0 && c0 != 0) ? c0 : ((i == nc - 1 && cn != 0) ? cn : c);
+ const int c1 = (i == 0) ? k0 % c : 0;
+
+ float tmp_data[channels * nr];
+ int nindex = 0;
+ float *buffer = tmp_data;
+ float *prhs_tmp = prhs_ptr;
+
+ const int in_row1 = (k0 / c + i) / kernel_w % kernel_h * params->dilation_h + in_row0;
+ const int in_col1 = (k0 / c + i) % kernel_w * params->dilation_w;
+
+ int in_row = in_row1 - pad_h;
+
+ for (int out_rows = rows; out_rows; out_rows--)
+ {
+ int cols = (out_rows != 1 || segn == 0) ? outw : segn;
+ int in_col = in_col1 - pad_w;
+ if (out_rows == rows)
+ {
+ cols = seg0;
+ in_col += in_col0;
+ }
+ if ((unsigned int)in_row < (unsigned int)h)
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ if ((unsigned int)in_col < (unsigned int)w)
+ {
+ for (int j = c1; j < c1 + channels; j++)
+ {
+ *(buffer++) = input_data[(in_row * w + in_col) * c + j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < channels; j++)
+ {
+ *(buffer++) = 0;
+ }
+ }
+ in_col += stride_w;
+
+ nindex++;
+ if (nindex == nr)
+ {
+ nindex = 0;
+ buffer = tmp_data;
+ _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp);
+ prhs_tmp += kb * nr;
+ }
+ }
+ }
+ else
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ for (int j = 0; j < channels; j++)
+ {
+ *(buffer++) = 0;
+ }
+ in_col += stride_w;
+
+ nindex++;
+ if (nindex == nr)
+ {
+ nindex = 0;
+ buffer = tmp_data;
+ _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp);
+ prhs_tmp += kb * nr;
+ }
+ }
+ }
+
+ in_row += stride_h;
+ }
+
+ if (nindex > 0)
+ {
+ float *data = tmp_data;
+ for (int i = 0; i < channels; i++)
+ {
+ for (int j = 0; j < nindex; j++)
+ {
+ prhs_tmp[j] = data[j * channels];
+ }
+ for (int j = nindex; j < nr; j++)
+ {
+ prhs_tmp[j] = 0.f;
+ }
+ prhs_tmp += nr;
+ data++;
+ }
+ }
+
+ prhs_ptr += channels * nr;
+ }
+}
+
+void _pack_colmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0,
+ const int n0, convMat_t *input, convMat_t *output,
+ convParams_t *params, float *prhs_ptr)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int c = input->c;
+ const int outw = output->w;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+
+ int c0 = c - k0 % c;
+ if (c0 > kb)
+ c0 = kb;
+ int nc = (kb - c0 + c - 1) / c;
+ if (c0)
+ nc++;
+ const int cn = (kb - c0) % c;
+
+ const int seg_size = output->w * output->h;
+
+ const float *indata = input->data + (w * h * c) * (n0 / seg_size);
+
+ int bseg0 = seg_size - n0 % seg_size;
+ if (bseg0 > nb)
+ bseg0 = nb;
+ int bnseg = (nb - bseg0 + seg_size - 1) / seg_size;
+ if (bseg0)
+ bnseg++;
+ const int bsegn = (nb - bseg0) % seg_size;
+
+ for (int ll = 0; ll < nc; ll++)
+ {
+ const float *input_data = indata;
+
+ const int channels = (ll == 0 && c0 != 0) ? c0 : ((ll == nc - 1 && cn != 0) ? cn : c);
+ const int c1 = (ll == 0) ? k0 % c : 0;
+
+ int nindex = 0;
+ float *prhs_tmp = prhs_ptr;
+ float tmp_data[channels * nr];
+ float *buffer = tmp_data;
+
+ for (int i = 0; i < bnseg; i++)
+ {
+ const int _nb =
+ ((i == 0 && bseg0 != 0) ? bseg0 : ((i == bnseg - 1 && bsegn != 0) ? bsegn : seg_size));
+ const int _n0 = (i == 0 ? n0 % seg_size : 0);
+
+ int seg0 = outw - _n0 % outw;
+ if (seg0 > _nb)
+ seg0 = _nb;
+ int rows = (_nb - seg0 + outw - 1) / outw;
+ if (seg0)
+ rows++;
+ const int segn = (_nb - seg0) % outw;
+
+ const int in_row0 = _n0 / outw * stride_h;
+ const int in_col0 = _n0 % outw * stride_w;
+
+ const int in_row1 = (k0 / c + ll) / kernel_w % kernel_h + in_row0;
+ const int in_col1 = (k0 / c + ll) % kernel_w;
+
+ int in_row = in_row1;
+
+ for (int out_rows = rows; out_rows; out_rows--)
+ {
+ int cols = (out_rows != 1 || segn == 0) ? outw : segn;
+ int in_col = in_col1;
+ if (out_rows == rows)
+ {
+ cols = seg0;
+ in_col += in_col0;
+ }
+ if ((unsigned int)in_row < (unsigned int)h)
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ if ((unsigned int)in_col < (unsigned int)w)
+ {
+ for (int j = c1; j < c1 + channels; j++)
+ {
+ *(buffer++) = input_data[(in_row * w + in_col) * c + j];
+ }
+ }
+ else
+ {
+ for (int j = 0; j < channels; j++)
+ {
+ *(buffer++) = 0;
+ }
+ }
+ in_col += stride_w;
+
+ nindex++;
+ if (nindex == nr)
+ {
+ nindex = 0;
+ buffer = tmp_data;
+ _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp);
+ prhs_tmp += kb * nr;
+ }
+ }
+ }
+ else
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ for (int j = 0; j < channels; j++)
+ {
+ *(buffer++) = 0;
+ }
+ in_col += stride_w;
+
+ nindex++;
+ if (nindex == nr)
+ {
+ nindex = 0;
+ buffer = tmp_data;
+ _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp);
+ prhs_tmp += kb * nr;
+ }
+ }
+ }
+
+ in_row += stride_h;
+ }
+
+ input_data += w * h * c;
+ }
+
+ if (nindex > 0)
+ {
+ float *data = tmp_data;
+ for (int ii = 0; ii < channels; ii++)
+ {
+ for (int jj = 0; jj < nindex; jj++)
+ {
+ prhs_tmp[jj] = data[jj * channels];
+ }
+ for (int jj = nindex; jj < nr; jj++)
+ {
+ prhs_tmp[jj] = 0.f;
+ }
+ prhs_tmp += nr;
+ data++;
+ }
+ }
+
+ prhs_ptr += channels * nr;
+ }
+}
+
+void _unpack_colmajor_image_res(const int mb, const int nb, const int m0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *pres_ptr)
+{
+ const int w = input->w;
+ const int outw = output->w;
+ const int outh = output->h;
+ const int outc = output->c;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+ float *output_data = output->data;
+
+ int c0 = outc - m0 % outc;
+ if (c0 > mb)
+ c0 = mb;
+ int nc = (mb - c0 + outc - 1) / outc;
+ if (c0)
+ nc++;
+ const int cn = (mb - c0) % outc;
+
+ int seg0 = w - n0 % w;
+ if (seg0 > nb)
+ seg0 = nb;
+ int rows = (nb - seg0 + w - 1) / w;
+ if (seg0)
+ rows++;
+ const int segn = (nb - seg0) % w;
+
+ const int out_row0 = n0 / w * stride_h;
+ const int out_col0 = n0 % w * stride_w;
+
+ for (int i = 0; i < nc; i++)
+ {
+ const int channels = (i == 0 && c0 != 0) ? c0 : ((i == nc - 1 && cn != 0) ? cn : outc);
+ const int c1 = (i == 0) ? m0 % outc : 0;
+
+ float *buffer = pres_ptr;
+
+ const int out_row1 = (m0 / outc + i) / kernel_w % kernel_h * params->dilation_h + out_row0;
+ const int out_col1 = (m0 / outc + i) % kernel_w * params->dilation_w;
+
+ int out_row = out_row1 - pad_h;
+
+ for (int in_rows = rows; in_rows; in_rows--)
+ {
+ int cols = (in_rows != 1 || segn == 0) ? w : segn;
+ int out_col = out_col1 - pad_w;
+ if (in_rows == rows)
+ {
+ cols = seg0;
+ out_col += out_col0;
+ }
+ if ((unsigned int)out_row < (unsigned int)outh)
+ {
+ for (int in_col = cols; in_col; in_col--)
+ {
+ if ((unsigned int)out_col < (unsigned int)outw)
+ {
+ for (int j = c1; j < c1 + channels; j++)
+ {
+            // Note: potential data race when multiple threads accumulate into the same output
+            // #pragma omp atomic // not used: low performance
+ output_data[(out_row * outw + out_col) * outc + j] += *(buffer + j - c1);
+ }
+ }
+ buffer += mb;
+ out_col += stride_w;
+ }
+ }
+ else
+ {
+ buffer += cols * mb;
+ }
+ out_row += stride_h;
+ }
+
+ pres_ptr += channels;
+ }
+}
+
+void _sparse_pack_rowmajor_image(const int nb, const int k0, const int n0, convMat_t *input,
+ convMat_t *output, convParams_t *params, float *prhs_ptr)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int outw = output->w;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+
+ const int in_row0 = n0 / outw * stride_h;
+ const int in_col0 = n0 % outw * stride_w;
+ int seg0 = outw - n0 % outw;
+ if (seg0 > nb)
+ seg0 = nb;
+ int rows = (nb - seg0 + outw - 1) / outw;
+ if (seg0)
+ rows++;
+ const int segn = (nb - seg0) % outw;
+
+ const int ic = k0 / (kernel_w * kernel_h);
+ const int in_row1 = ((k0 / kernel_w) % kernel_h) * params->dilation_h + in_row0;
+ const int in_col1 = k0 % kernel_w * params->dilation_w;
+
+#ifdef NCNN
+ const float *input_data = input->data + ic * alignSize(w * h, 16 / sizeof(float));
+#else // NCNN
+ const float *input_data = input->data + ic * w * h;
+#endif // NCNN
+
+ int in_row = in_row1 - pad_h;
+
+ for (int out_rows = rows; out_rows; out_rows--)
+ {
+ int cols = (out_rows != 1 || segn == 0) ? outw : segn;
+ int in_col = in_col1 - pad_w;
+ if (out_rows == rows)
+ {
+ cols = seg0;
+ in_col += in_col0;
+ }
+ if ((unsigned int)in_row < (unsigned int)h)
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ if ((unsigned int)in_col < (unsigned int)w)
+ *(prhs_ptr++) = input_data[in_row * w + in_col];
+ else
+ *(prhs_ptr++) = 0;
+ in_col += stride_w;
+ }
+ }
+ else
+ {
+ for (int out_col = cols; out_col; out_col--)
+ {
+ *(prhs_ptr++) = 0;
+ in_col += stride_w;
+ }
+ }
+
+ in_row += stride_h;
+ }
+}
+
+} // namespace srcn
+} // namespace nnfw
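All of the packing routines in this file produce the same basic layout for the micro-kernels: a block is cut into panels of mr rows (LHS) or nr columns (RHS), each panel stored contiguously along k, and the tail panel is zero-padded to full width. The sketch below illustrates that layout for a row-major RHS block; it is an illustration only, not the library's implementation (the function name, the std::vector return and the plain scalar loops are assumptions made for clarity).

// Sketch: pack a kb x nb row-major block of B (leading dimension ldb) into
// nr-wide column panels, padding the last panel with zeros. Each panel is
// stored k-major so a micro-kernel can read nr consecutive floats per k step.
#include <algorithm>
#include <vector>

std::vector<float> pack_rhs_panels(const float *B, int kb, int nb, int ldb, int nr)
{
  const int panels = (nb + nr - 1) / nr;
  std::vector<float> packed(static_cast<size_t>(panels) * kb * nr, 0.0f);
  for (int p = 0; p < panels; ++p)
  {
    float *dst = &packed[static_cast<size_t>(p) * kb * nr];
    const int cols = std::min(nr, nb - p * nr); // the tail panel may be narrower
    for (int k = 0; k < kb; ++k)
      for (int j = 0; j < cols; ++j)
        dst[k * nr + j] = B[k * ldb + p * nr + j];
  }
  return packed;
}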
diff --git a/compute/ncnn/src/srcn/sgemm_pack.h b/compute/ncnn/src/srcn/sgemm_pack.h
new file mode 100644
index 000000000..d64843ebb
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_pack.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_SGEMM_PACK_H__
+#define __NNFW_SRCN_SGEMM_PACK_H__
+
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void _pack_rowmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr);
+void _pack_rowmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr);
+void _pack_rowmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr);
+void _pack_rowmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr);
+void _pack_rowmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *prhs_ptr);
+void _pack_rowmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0,
+ const int n0, convMat_t *input, convMat_t *output,
+ convParams_t *params, float *prhs_ptr);
+
+void _unpack_rowmajor_image_res(const int mb, const int nb, const int m0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *pres_ptr);
+
+void _pack_colmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr);
+void _pack_colmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr);
+void _pack_colmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride,
+ const float *lhs_ptr, float *plhs_ptr);
+void _pack_colmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride,
+ const float *rhs_ptr, float *prhs_ptr);
+
+void _pack_colmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *prhs_ptr);
+
+void _pack_colmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0,
+ const int n0, convMat_t *input, convMat_t *output,
+ convParams_t *params, float *prhs_ptr);
+
+void _unpack_colmajor_image_res(const int mb, const int nb, const int m0, const int n0,
+ convMat_t *input, convMat_t *output, convParams_t *params,
+ float *pres_ptr);
+
+void _sparse_pack_rowmajor_image(const int nb, const int k0, const int n0, convMat_t *input,
+ convMat_t *output, convParams_t *params, float *prhs_ptr);
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_SGEMM_PACK_H__
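The _pack_*_image_* functions declared above fuse panel packing with an im2col-style lowering: the RHS is read straight from the convolution input through kernel position, stride, dilation and padding instead of from a pre-materialized matrix. As a rough reference for what is being lowered, a naive, unpanelled im2col for a CHW input is sketched below (dilation and batching omitted; the name and layout are illustrative assumptions, not the library's API).

// Sketch: naive im2col for a CHW input. The output is a (C*KH*KW) x (OH*OW)
// row-major matrix; taps that fall into the padding are left as zero.
#include <vector>

std::vector<float> im2col_chw(const float *in, int C, int H, int W, int KH, int KW, int stride,
                              int pad, int OH, int OW)
{
  std::vector<float> col(static_cast<size_t>(C) * KH * KW * OH * OW, 0.0f);
  for (int c = 0; c < C; ++c)
    for (int kh = 0; kh < KH; ++kh)
      for (int kw = 0; kw < KW; ++kw)
      {
        float *dst = &col[((static_cast<size_t>(c) * KH + kh) * KW + kw) * OH * OW];
        for (int oh = 0; oh < OH; ++oh)
          for (int ow = 0; ow < OW; ++ow)
          {
            const int ih = oh * stride - pad + kh;
            const int iw = ow * stride - pad + kw;
            if ((unsigned int)ih < (unsigned int)H && (unsigned int)iw < (unsigned int)W)
              dst[oh * OW + ow] = in[(c * H + ih) * W + iw];
          }
      }
  return col;
}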
diff --git a/compute/ncnn/src/srcn/sgemm_singlethread.cc b/compute/ncnn/src/srcn/sgemm_singlethread.cc
new file mode 100644
index 000000000..3de3e1214
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_singlethread.cc
@@ -0,0 +1,689 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdexcept>
+
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "sgemm_singlethread.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
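+// param_init() picks the register tile (mr_ x nr_) for the target architecture
+// and then sizes the cache blocks: bk_ is derived from the L1 budget, while
+// bm_/bn_ come from L2 (and optionally L3) depending on whether the work is
+// sharded by rows or by columns. nm_/nn_/nk_ are the resulting block counts
+// and rm_/rn_/rk_ the sizes of the remainder blocks.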
+void sgemm_singlethread::param_init()
+{
+ if (n_ >= m_)
+ {
+ shard_type_ = shardByRow;
+ }
+ else
+ {
+ shard_type_ = shardByCol;
+ }
+
+#if __aarch64__
+ if (major_type_ == rowMajor)
+ {
+ if (shard_type_ == shardByRow)
+ {
+ mr_ = 8;
+ nr_ = 12;
+ }
+ else
+ {
+ mr_ = 12;
+ nr_ = 8;
+ }
+ }
+ else if (major_type_ == colMajor)
+ {
+ mr_ = 12;
+ nr_ = 8;
+ }
+#else // __aarch64__
+ if (major_type_ == rowMajor)
+ {
+    // FIXME: this special case works around an issue that is not yet understood.
+ if (ltrans_ == notrans && rtrans_ == trans)
+ {
+ mr_ = 4;
+ nr_ = 12;
+ }
+ else
+ {
+ mr_ = 6;
+ nr_ = 8;
+ }
+ }
+ else if (major_type_ == colMajor)
+ {
+ mr_ = 8;
+ nr_ = 6;
+ }
+#endif // __aarch64__
+
+ int k_div = (nr_ * sizeof_RhsScalar);
+ int k_sub = (mr_ * nr_ * sizeof_ResScalar);
+
+ int gen_col = GEN_COL / cache_div_;
+ int min_k = MAX_K / cache_div_;
+
+ const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div), min_k);
+ bk_ = MIN(k_cache, k_);
+
+ if (shard_type_ == shardByCol)
+ {
+ int m_sub = (bk_ * nr_ * sizeof_RhsScalar);
+ int m_div = (sizeof_LhsScalar * bk_ * 2 * cache_div_);
+ if (L3_CACHE_SIZE)
+ m_div = (sizeof_LhsScalar * bk_ * 2);
+ int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div);
+ bm_ = MIN(m_cache, m_);
+
+ bn_ = MIN(gen_col, n_);
+ if (L3_CACHE_SIZE)
+ {
+ int n_sub = (bk_ * bm_ * sizeof_RhsScalar);
+ int n_cache = divup((L3_CACHE_SIZE - n_sub), (sizeof_LhsScalar * bk_ * 2));
+ bn_ = MIN(n_cache, bn_);
+ }
+ }
+ else
+ {
+ int n_sub = (bk_ * mr_ * sizeof_RhsScalar);
+ int n_div = (sizeof_LhsScalar * bk_ * 2 * cache_div_);
+ if (L3_CACHE_SIZE)
+ n_div = (sizeof_LhsScalar * bk_ * 2);
+ int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div);
+ bn_ = MIN(n_cache, n_);
+
+ bm_ = MIN(gen_col, m_);
+ if (L3_CACHE_SIZE)
+ {
+ int m_sub = (bk_ * bn_ * sizeof_RhsScalar);
+ int m_cache = divup((L3_CACHE_SIZE - m_sub), (sizeof_LhsScalar * bk_ * 2));
+ bm_ = MIN(m_cache, bm_);
+ }
+ }
+
+ nm_ = divup(m_, bm_);
+ nn_ = divup(n_, bn_);
+ nk_ = divup(k_, bk_);
+
+ rm_ = m_ % bm_;
+ rn_ = n_ % bn_;
+ rk_ = k_ % bk_;
+}
+
+sgemm_singlethread::sgemm_singlethread(sgemmType_t major_type, sgemmTrans_t ltrans,
+ sgemmTrans_t rtrans, const int m, const int n, const int k,
+ const float *lhs_data, const float *rhs_data,
+ float *res_data, int cache_div)
+ : lhs_data_(lhs_data), rhs_data_(rhs_data), res_data_(res_data), major_type_(major_type),
+ ltrans_(ltrans), rtrans_(rtrans), m_(m), n_(n), k_(k), cache_div_(cache_div)
+{
+ param_init();
+}
+
+sgemm_singlethread::~sgemm_singlethread() {}
+
+void sgemm_singlethread::run()
+{
+ if (major_type_ == rowMajor)
+ {
+ if (ltrans_ == notrans && rtrans_ == notrans)
+ {
+ compute_rowmajor_nn();
+ }
+ else if (ltrans_ == notrans && rtrans_ == trans)
+ {
+ compute_rowmajor_nt();
+ }
+ else if (ltrans_ == trans && rtrans_ == notrans)
+ {
+ compute_rowmajor_tn();
+ }
+ else if (ltrans_ == trans && rtrans_ == trans)
+ {
+ compute_rowmajor_tt();
+ }
+ else
+ {
+ throw std::runtime_error{"error trans type."};
+ }
+ }
+ else if (major_type_ == colMajor)
+ {
+ if (ltrans_ == notrans && rtrans_ == notrans)
+ {
+ compute_colmajor_nn();
+ }
+ else if (ltrans_ == notrans && rtrans_ == trans)
+ {
+ compute_colmajor_nt();
+ }
+ else if (ltrans_ == trans && rtrans_ == notrans)
+ {
+ compute_colmajor_tn();
+ }
+ else if (ltrans_ == trans && rtrans_ == trans)
+ {
+ compute_colmajor_tt();
+ }
+ else
+ {
+ throw std::runtime_error{"error trans type."};
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error major type."};
+ }
+}
+
+void sgemm_singlethread::compute_rowmajor_nn()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+void sgemm_singlethread::compute_rowmajor_nt()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+void sgemm_singlethread::compute_rowmajor_tn()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+void sgemm_singlethread::compute_rowmajor_tt()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+void sgemm_singlethread::compute_colmajor_nn()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+void sgemm_singlethread::compute_colmajor_nt()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+void sgemm_singlethread::compute_colmajor_tn()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+void sgemm_singlethread::compute_colmajor_tt()
+{
+ int mstride = (bm_ + mr_ - 1) / mr_ * mr_;
+ int nstride = (bn_ + nr_ - 1) / nr_ * nr_;
+
+ float plhs_ptr[mstride * bk_];
+ float prhs_ptr[nstride * bk_];
+
+ if (shard_type_ == shardByCol)
+ {
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else if (shard_type_ == shardByRow)
+ {
+ for (int i = 0; i < nm_; i++)
+ {
+ const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
+
+ for (int l = 0; l < nk_; l++)
+ {
+ const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
+
+ _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr);
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+
+ _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr);
+
+ _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr,
+ &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk);
+ }
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error{"error shard type."};
+ }
+}
+
+} // namespace srcn
+} // namespace nnfw
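The eight compute_* methods above share a single cache-blocked loop nest and differ only in which pack routine they call and which leading dimensions they pass. Below is a self-contained sketch of that structure for the row-major, no-transpose, shard-by-column case, with the pack steps and the assembly macro kernel replaced by a naive inner loop; writing the result on the first k block and accumulating on later ones mirrors how the macro kernels appear to use the k-block index l. This is an illustration under those assumptions, not the library's kernel.

// Sketch: three-level blocking (n, k, m) with block sizes bn/bk/bm, as in the
// shardByCol path. The real code packs the current B and A blocks into panel
// form and calls _sgemm_rowmajor_macro_kernel_divnm instead of the scalar loop.
#include <algorithm>

void blocked_sgemm_rowmajor_nn(int m, int n, int k, const float *A, const float *B, float *C,
                               int bm, int bn, int bk)
{
  for (int j0 = 0; j0 < n; j0 += bn)
  {
    const int jb = std::min(bn, n - j0);
    for (int l0 = 0; l0 < k; l0 += bk)
    {
      const int lb = std::min(bk, k - l0);
      // real code: pack B[l0.., j0..] into nr-wide panels here
      for (int i0 = 0; i0 < m; i0 += bm)
      {
        const int ib = std::min(bm, m - i0);
        // real code: pack A[i0.., l0..] into mr-high panels, then run the macro kernel
        for (int i = 0; i < ib; ++i)
          for (int j = 0; j < jb; ++j)
          {
            float acc = (l0 == 0) ? 0.0f : C[(i0 + i) * n + (j0 + j)];
            for (int l = 0; l < lb; ++l)
              acc += A[(i0 + i) * k + (l0 + l)] * B[(l0 + l) * n + (j0 + j)];
            C[(i0 + i) * n + (j0 + j)] = acc;
          }
      }
    }
  }
}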
diff --git a/compute/ncnn/src/srcn/sgemm_singlethread.h b/compute/ncnn/src/srcn/sgemm_singlethread.h
new file mode 100644
index 000000000..47954e028
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_singlethread.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_SGEMM_SINGLETHREAD_H__
+#define __NNFW_SRCN_SGEMM_SINGLETHREAD_H__
+
+#include "common.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+typedef enum { rowMajor = 0, colMajor } sgemmType_t;
+
+typedef enum { trans = 0, notrans } sgemmTrans_t;
+
+class sgemm_singlethread
+{
+public:
+ sgemm_singlethread(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans, const int m,
+ const int n, const int k, const float *lhs_data, const float *rhs_data,
+ float *res_data, int cache_div);
+ ~sgemm_singlethread();
+
+ void run();
+
+private:
+ void param_init();
+
+ void compute_rowmajor_nn();
+ void compute_rowmajor_nt();
+ void compute_rowmajor_tn();
+ void compute_rowmajor_tt();
+
+ void compute_colmajor_nn();
+ void compute_colmajor_nt();
+ void compute_colmajor_tn();
+ void compute_colmajor_tt();
+
+ const float *lhs_data_;
+ const float *rhs_data_;
+ float *res_data_;
+
+ sgemmType_t major_type_;
+ sgemmTrans_t ltrans_;
+ sgemmTrans_t rtrans_;
+
+ int m_;
+ int n_;
+ int k_;
+
+ int bm_;
+ int bn_;
+ int bk_;
+
+ int rm_;
+ int rn_;
+ int rk_;
+
+ int nm_;
+ int nn_;
+ int nk_;
+
+ int mr_;
+ int nr_;
+
+ shardType_t shard_type_;
+ int cache_div_;
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_SGEMM_SINGLETHREAD_H__
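For reference, a minimal usage sketch built from the constructor declared above and the calls made in sgemm_test.cc later in this commit (the buffer shapes and the cache_div value of 1 follow the test code; the fill values and function name are assumptions):

// Usage sketch: multiply a row-major m x k LHS by a k x n RHS into an m x n
// result on a single thread.
#include <vector>
#include "sgemm_singlethread.h"

void run_sgemm_example(int m, int n, int k)
{
  std::vector<float> A(static_cast<size_t>(m) * k, 0.001f); // lhs, row-major m x k
  std::vector<float> B(static_cast<size_t>(k) * n, 0.002f); // rhs, row-major k x n
  std::vector<float> C(static_cast<size_t>(m) * n, 0.0f);   // result, row-major m x n

  nnfw::srcn::sgemm_singlethread gemm(nnfw::srcn::rowMajor, nnfw::srcn::notrans,
                                      nnfw::srcn::notrans, m, n, k, A.data(), B.data(), C.data(),
                                      1 /* cache_div, as used by the tests */);
  gemm.run();
}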
diff --git a/compute/ncnn/src/srcn/sgemm_test.cc b/compute/ncnn/src/srcn/sgemm_test.cc
new file mode 100644
index 000000000..1b10970bb
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_test.cc
@@ -0,0 +1,1883 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "ncnn/srcn/conv_type.h"
+#include "srcn/srcn_conv.h"
+//#include "srcn_sgemm.h"
+#include "conv_sgemm_singlethread.h"
+#include "conv_sgemm_multithreads.h"
+//#include "conv_sgemm_batch.h"
+#include "sgemm_singlethread.h"
+#include "conv_winograd.h"
+#include "winograd.h"
+
+//#include "conv_gpu.h"
+//#include "convolutiondepthwise_3x3.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+static void direct_conv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+ convParams_t *params)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int inch = input->c;
+ const int outw = output->w;
+ const int outh = output->h;
+ const int outch = output->c;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+ const int dilation_w = params->dilation_w;
+ const int dilation_h = params->dilation_h;
+ const float *input_data = input->data;
+ const float *filter_data = filter->data;
+ float *output_data = output->data;
+
+ for (int out_c = 0; out_c < outch; out_c++)
+ {
+ for (int out_row = 0; out_row < outh; out_row++)
+ {
+ for (int out_col = 0; out_col < outw; out_col++)
+ {
+ const int in_col0 = (out_col * stride_w) - pad_w;
+ const int in_row0 = (out_row * stride_h) - pad_h;
+ float sum = 0.f;
+ for (int in_c = 0; in_c < inch; in_c++)
+ {
+ for (int filter_y = 0; filter_y < kernel_h; filter_y++)
+ {
+ for (int filter_x = 0; filter_x < kernel_w; filter_x++)
+ {
+ const int in_col = in_col0 + filter_x * dilation_w;
+ const int in_row = in_row0 + filter_y * dilation_h;
+
+ if (((unsigned int)in_col < (unsigned int)w) &&
+ ((unsigned int)in_row < (unsigned int)h))
+ {
+ float input_value = input_data[(in_c * h + in_row) * w + in_col];
+ float filter_value =
+ filter_data[((out_c * inch + in_c) * kernel_h + filter_y) * kernel_w +
+ filter_x];
+ sum += (input_value * filter_value);
+ }
+ }
+ }
+ }
+ output_data[(out_c * outh + out_row) * outw + out_col] = sum;
+ }
+ }
+ }
+}
+
+static void direct_deconv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+ convParams_t *params)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int inch = input->c;
+ const int outw = output->w;
+ const int outh = output->h;
+ const int outch = output->c;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+ const int dilation_w = params->dilation_w;
+ const int dilation_h = params->dilation_h;
+ const float *input_data = input->data;
+ const float *filter_data = filter->data;
+ float *output_data = output->data;
+
+ for (int i = 0; i < outw * outh * outch; i++)
+ {
+ output_data[i] = 0;
+ }
+
+ for (int in_c = 0; in_c < inch; in_c++)
+ {
+ for (int in_row = 0; in_row < h; in_row++)
+ {
+ for (int in_col = 0; in_col < w; in_col++)
+ {
+ const int out_col0 = (in_col * stride_w) - pad_w;
+ const int out_row0 = (in_row * stride_h) - pad_h;
+ float in_value = input_data[(in_c * h + in_row) * w + in_col];
+ for (int out_c = 0; out_c < outch; out_c++)
+ {
+ for (int filter_y = 0; filter_y < kernel_h; filter_y++)
+ {
+ for (int filter_x = 0; filter_x < kernel_w; filter_x++)
+ {
+ const int out_col = out_col0 + filter_x * dilation_w;
+ const int out_row = out_row0 + filter_y * dilation_h;
+
+ if (((unsigned int)out_col < (unsigned int)outw) &&
+ ((unsigned int)out_row < (unsigned int)outh))
+ {
+ float filter_value =
+ filter_data[((in_c * outch + out_c) * kernel_h + filter_y) * kernel_w +
+ filter_x];
+ output_data[(out_c * outh + out_row) * outw + out_col] += filter_value * in_value;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void direct_sgemm_rowmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B,
+ float *C)
+{
+ float *aa, *bb;
+
+ if (Atrans == trans)
+ {
+ aa = (float *)malloc(m * k * sizeof(float));
+ if (!aa)
+ return;
+
+ for (int i = 0; i < k; i++)
+ {
+ for (int j = 0; j < m; j++)
+ {
+ aa[j * k + i] = A[i * m + j];
+ }
+ }
+ }
+ else
+ {
+ aa = A;
+ }
+
+ if (Btrans == trans)
+ {
+ bb = (float *)malloc(n * k * sizeof(float));
+ if (!bb)
+ return;
+
+ for (int i = 0; i < n; i++)
+ {
+ for (int j = 0; j < k; j++)
+ {
+ bb[j * n + i] = B[i * k + j];
+ }
+ }
+ }
+ else
+ {
+ bb = B;
+ }
+
+ for (int i = 0; i < m; i++)
+ {
+ for (int j = 0; j < n; j++)
+ {
+ float res = 0.f;
+ for (int l = 0; l < k; l++)
+ {
+ res += aa[i * k + l] * bb[l * n + j];
+ }
+      C[i * n + j] = res;
+    }
+  }
+
+  // Release the temporary transposed copies.
+  if (Atrans == trans)
+    free(aa);
+  if (Btrans == trans)
+    free(bb);
+}
+
+/*static void direct_sgemm_kernel(const int k, const int lhs_stride, const int rhs_stride, const int
+res_stride,
+ const float *lhs_ptr, const float *rhs_ptr, float *res_ptr)
+{
+ int lstride = lhs_stride << 2;
+ int rstride = rhs_stride << 2;
+ int estride = res_stride << 2;
+ int rstep = rstride << 2;
+
+ int nk = (k >> 2) - 1;
+
+ __asm __volatile (
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ "mov x0, %[lhs_ptr]\n"
+ "add %[lhs_ptr], %[lhs_ptr], #16\n"
+ "ld1 {v0.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+ "ld1 {v1.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+ "ld1 {v2.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+ "ld1 {v3.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+
+ "mov x1, %[rhs_ptr]\n"
+ "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n"
+ "ld1 {v8.4s, v9.4s}, [x1]\n"
+ "add x1, x1, %[rstride]\n"
+ "ld1 {v10.4s, v11.4s}, [x1]\n"
+ "add x1, x1, %[rstride]\n"
+
+ "1:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v16.4s, v10.4s, v0.s[1]\n"
+ "fmla v17.4s, v11.4s, v0.s[1]\n"
+ "fmla v18.4s, v8.4s, v1.s[0]\n"
+ "fmla v19.4s, v9.4s, v1.s[0]\n"
+ "fmla v18.4s, v10.4s, v1.s[1]\n"
+ "fmla v19.4s, v11.4s, v1.s[1]\n"
+ "ld1 {v12.4s, v13.4s}, [x1]\n"
+ "fmla v20.4s, v8.4s, v2.s[0]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v9.4s, v2.s[0]\n"
+ "ld1 {v14.4s, v15.4s}, [x1]\n"
+ "fmla v20.4s, v10.4s, v2.s[1]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v11.4s, v2.s[1]\n"
+ "fmla v22.4s, v8.4s, v3.s[0]\n"
+ "fmla v23.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v3.s[1]\n"
+ "fmla v23.4s, v11.4s, v3.s[1]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "fmla v16.4s, v12.4s, v0.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v13.4s, v0.s[2]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "fmla v16.4s, v14.4s, v0.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v15.4s, v0.s[3]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "fmla v18.4s, v12.4s, v1.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v13.4s, v1.s[2]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "fmla v18.4s, v14.4s, v1.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v15.4s, v1.s[3]\n"
+ "fmla v20.4s, v12.4s, v2.s[2]\n"
+ "fmla v21.4s, v13.4s, v2.s[2]\n"
+ "fmla v20.4s, v14.4s, v2.s[3]\n"
+ "fmla v21.4s, v15.4s, v2.s[3]\n"
+ "fmla v22.4s, v12.4s, v3.s[2]\n"
+ "fmla v23.4s, v13.4s, v3.s[2]\n"
+ "fmla v22.4s, v14.4s, v3.s[3]\n"
+ "fmla v23.4s, v15.4s, v3.s[3]\n"
+
+ "mov x0, %[lhs_ptr]\n"
+ "add %[lhs_ptr], %[lhs_ptr], #16\n"
+
+ "fmla v24.4s, v8.4s, v4.s[0]\n"
+ "fmla v25.4s, v9.4s, v4.s[0]\n"
+ "ld1 {v0.4s}, [x0]\n"
+ "fmla v24.4s, v10.4s, v4.s[1]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v25.4s, v11.4s, v4.s[1]\n"
+ "ld1 {v1.4s}, [x0]\n"
+ "fmla v26.4s, v8.4s, v5.s[0]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v27.4s, v9.4s, v5.s[0]\n"
+ "ld1 {v2.4s}, [x0]\n"
+ "fmla v26.4s, v10.4s, v5.s[1]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v27.4s, v11.4s, v5.s[1]\n"
+ "ld1 {v3.4s}, [x0]\n"
+ "fmla v28.4s, v8.4s, v6.s[0]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v29.4s, v9.4s, v6.s[0]\n"
+ "fmla v28.4s, v10.4s, v6.s[1]\n"
+ "fmla v29.4s, v11.4s, v6.s[1]\n"
+ "fmla v30.4s, v8.4s, v7.s[0]\n"
+ "fmla v31.4s, v9.4s, v7.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[1]\n"
+ "fmla v31.4s, v11.4s, v7.s[1]\n"
+
+ "mov x1, %[rhs_ptr]\n"
+ "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n"
+
+ "fmla v24.4s, v12.4s, v4.s[2]\n"
+ "fmla v25.4s, v13.4s, v4.s[2]\n"
+ "ld1 {v8.4s, v9.4s}, [x1]\n"
+ "fmla v24.4s, v14.4s, v4.s[3]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v25.4s, v15.4s, v4.s[3]\n"
+ "ld1 {v10.4s, v11.4s}, [x1]\n"
+ "fmla v26.4s, v12.4s, v5.s[2]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v27.4s, v13.4s, v5.s[2]\n"
+ "fmla v26.4s, v14.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v5.s[3]\n"
+ "fmla v28.4s, v12.4s, v6.s[2]\n"
+ "fmla v29.4s, v13.4s, v6.s[2]\n"
+ "fmla v28.4s, v14.4s, v6.s[3]\n"
+ "fmla v29.4s, v15.4s, v6.s[3]\n"
+ "fmla v30.4s, v12.4s, v7.s[2]\n"
+ "fmla v31.4s, v13.4s, v7.s[2]\n"
+ "subs %w[nk], %w[nk], #1\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "bne 1b\n"
+
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v16.4s, v10.4s, v0.s[1]\n"
+ "fmla v17.4s, v11.4s, v0.s[1]\n"
+ "fmla v18.4s, v8.4s, v1.s[0]\n"
+ "fmla v19.4s, v9.4s, v1.s[0]\n"
+ "fmla v18.4s, v10.4s, v1.s[1]\n"
+ "fmla v19.4s, v11.4s, v1.s[1]\n"
+ "ld1 {v12.4s, v13.4s}, [x1]\n"
+ "fmla v20.4s, v8.4s, v2.s[0]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v9.4s, v2.s[0]\n"
+ "ld1 {v14.4s, v15.4s}, [x1]\n"
+ "fmla v20.4s, v10.4s, v2.s[1]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v11.4s, v2.s[1]\n"
+ "fmla v22.4s, v8.4s, v3.s[0]\n"
+ "fmla v23.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v3.s[1]\n"
+ "fmla v23.4s, v11.4s, v3.s[1]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "fmla v16.4s, v12.4s, v0.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v13.4s, v0.s[2]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "fmla v16.4s, v14.4s, v0.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v15.4s, v0.s[3]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "fmla v18.4s, v12.4s, v1.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v13.4s, v1.s[2]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "fmla v18.4s, v14.4s, v1.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v15.4s, v1.s[3]\n"
+ "fmla v20.4s, v12.4s, v2.s[2]\n"
+ "fmla v21.4s, v13.4s, v2.s[2]\n"
+ "fmla v20.4s, v14.4s, v2.s[3]\n"
+ "fmla v21.4s, v15.4s, v2.s[3]\n"
+ "fmla v22.4s, v12.4s, v3.s[2]\n"
+ "fmla v23.4s, v13.4s, v3.s[2]\n"
+ "fmla v22.4s, v14.4s, v3.s[3]\n"
+ "fmla v23.4s, v15.4s, v3.s[3]\n"
+
+ "mov x0, %[res_ptr]\n"
+ "fmla v24.4s, v8.4s, v4.s[0]\n"
+ "fmla v25.4s, v9.4s, v4.s[0]\n"
+ "st1 {v16.4s, v17.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v24.4s, v10.4s, v4.s[1]\n"
+ "fmla v25.4s, v11.4s, v4.s[1]\n"
+ "st1 {v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v26.4s, v8.4s, v5.s[0]\n"
+ "fmla v27.4s, v9.4s, v5.s[0]\n"
+ "st1 {v20.4s, v21.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v26.4s, v10.4s, v5.s[1]\n"
+ "fmla v27.4s, v11.4s, v5.s[1]\n"
+ "st1 {v22.4s, v23.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v28.4s, v8.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v6.s[0]\n"
+ "fmla v28.4s, v10.4s, v6.s[1]\n"
+ "fmla v29.4s, v11.4s, v6.s[1]\n"
+ "fmla v30.4s, v8.4s, v7.s[0]\n"
+ "fmla v31.4s, v9.4s, v7.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[1]\n"
+ "fmla v31.4s, v11.4s, v7.s[1]\n"
+
+ "fmla v24.4s, v12.4s, v4.s[2]\n"
+ "fmla v25.4s, v13.4s, v4.s[2]\n"
+ "fmla v24.4s, v14.4s, v4.s[3]\n"
+ "fmla v25.4s, v15.4s, v4.s[3]\n"
+ "fmla v26.4s, v12.4s, v5.s[2]\n"
+ "fmla v27.4s, v13.4s, v5.s[2]\n"
+ "st1 {v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v26.4s, v14.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v5.s[3]\n"
+ "fmla v28.4s, v12.4s, v6.s[2]\n"
+ "fmla v29.4s, v13.4s, v6.s[2]\n"
+ "st1 {v26.4s, v27.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v28.4s, v14.4s, v6.s[3]\n"
+ "fmla v29.4s, v15.4s, v6.s[3]\n"
+ "fmla v30.4s, v12.4s, v7.s[2]\n"
+ "fmla v31.4s, v13.4s, v7.s[2]\n"
+ "st1 {v28.4s, v29.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "st1 {v30.4s, v31.4s}, [x0]\n"
+ :[lhs_ptr] "+r" (lhs_ptr), [rhs_ptr] "+r" (rhs_ptr), [res_ptr] "+r" (res_ptr),
+ [nk] "+r" (nk)
+ : [lstride] "r" (lstride), [rstride] "r" (rstride), [estride] "r" (estride), [rstep] "r"
+(rstep)
+ : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ );
+}*/
+
+static void direct_conv_colmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+ convParams_t *params)
+{
+ const int w = input->w;
+ const int h = input->h;
+ const int inch = input->c;
+ const int outw = output->w;
+ const int outh = output->h;
+ const int outch = output->c;
+ const int kernel_w = params->kernel_w;
+ const int kernel_h = params->kernel_h;
+ const int stride_w = params->stride_w;
+ const int stride_h = params->stride_h;
+ const int pad_w = params->pad_w;
+ const int pad_h = params->pad_h;
+ const int dilation_w = params->dilation_w;
+ const int dilation_h = params->dilation_h;
+ const float *input_data = input->data;
+ const float *filter_data = filter->data;
+ float *output_data = output->data;
+
+ for (int out_row = 0; out_row < outh; out_row++)
+ {
+ for (int out_col = 0; out_col < outw; out_col++)
+ {
+ const int in_col0 = (out_col * stride_w) - pad_w;
+ const int in_row0 = (out_row * stride_h) - pad_h;
+
+ for (int out_c = 0; out_c < outch; out_c++)
+ {
+ float sum = 0.f;
+ for (int filter_y = 0; filter_y < kernel_h; filter_y++)
+ {
+ for (int filter_x = 0; filter_x < kernel_w; filter_x++)
+ {
+ const int in_col = in_col0 + filter_x * dilation_w;
+ const int in_row = in_row0 + filter_y * dilation_h;
+
+ if (((unsigned int)in_col < (unsigned int)w) &&
+ ((unsigned int)in_row < (unsigned int)h))
+ {
+ for (int in_c = 0; in_c < inch; in_c++)
+ {
+ float input_value = input_data[(in_row * w + in_col) * inch + in_c];
+ float filter_value =
+ filter_data[((filter_y * kernel_w + filter_x) * inch + in_c) * outch + out_c];
+ sum += (input_value * filter_value);
+ }
+ }
+ }
+ }
+ output_data[(out_row * outw + out_col) * outch + out_c] = sum;
+ }
+ }
+ }
+}
+
+static void direct_sgemm_colmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B,
+ float *C)
+{
+ float *aa, *bb;
+
+ if (Atrans)
+ {
+ aa = (float *)malloc(m * k * sizeof(float));
+ if (!aa)
+ return;
+
+ for (int i = 0; i < k; i++)
+ {
+ for (int j = 0; j < m; j++)
+ {
+ aa[i * m + j] = A[j * k + i];
+ }
+ }
+ }
+ else
+ {
+ aa = A;
+ }
+
+ if (Btrans)
+ {
+ bb = (float *)malloc(n * k * sizeof(float));
+ if (!bb)
+ return;
+
+ for (int i = 0; i < n; i++)
+ {
+ for (int j = 0; j < k; j++)
+ {
+ bb[i * k + j] = B[j * n + i];
+ }
+ }
+ }
+ else
+ {
+ bb = B;
+ }
+
+ for (int i = 0; i < m; i++)
+ {
+ for (int j = 0; j < n; j++)
+ {
+ float res = 0.f;
+ for (int l = 0; l < k; l++)
+ {
+ res += bb[j * k + l] * aa[l * m + i];
+ }
+      C[j * m + i] = res;
+    }
+  }
+
+  // Release the temporary transposed copies.
+  if (Atrans)
+    free(aa);
+  if (Btrans)
+    free(bb);
+}
+
+#if 0
+static int test_sgemm(int m, int n, int k, int loops)
+{
+ struct timeval start, end;
+ float total_time = 0.f;
+
+ const int mb = 180;
+ const int nb = 1440;
+ const int kb = 512;
+
+ const int mr = 4;
+ const int nr = 12;
+
+#if 0
+ const int pm = (m + mr - 1) / mr * mr;
+ const int pn = (n + nr - 1) / nr * nr;
+ const int pk = k;
+#else
+ const int pm = (mb + mr - 1) / mr * mr;
+ const int pn = (nb + nr - 1) / nr * nr;
+ const int pk = kb;
+#endif
+ const int nm = (m + mb - 1) / mb;
+ const int nn = (n + nb - 1) / nb;
+ const int nk = (k + kb - 1) / kb;
+
+ const int rm = m % mb;
+ const int rn = n % nb;
+ const int rk = k % kb;
+
+ float *A = (float *)malloc(m * k * sizeof(float));
+ if(!A) return 0;
+
+ for(int i = 0 ; i < m * k; i++)
+ {
+ A[i] = 0.001 + i * 0.000001;
+ }
+
+ float *B = (float *)malloc(k * n * sizeof(float));
+ if(!B) return 0;
+
+ for(int i = 0 ; i < n * k; i++)
+ {
+ B[i] = 0.001 - i * 0.000001;
+ }
+
+ float *C = (float *)malloc(m * n * sizeof(float));
+ if(!C) return 0;
+
+#if 0
+ float *PA = (float *)malloc(pm * pk * sizeof(float));
+ if(!PA) return 0;
+
+ float *PB = (float *)malloc(pk * pn * sizeof(float));
+ if(!PB) return 0;
+#else
+ float PA[pm * pk];
+ float PB[pk * pn];
+#endif
+
+ for(int nloop = 0; nloop < loops; nloop++)
+
+ {
+ gettimeofday(&start, NULL);
+
+ //pack_rowmajor_notrans_lhs(mr, m, k, k, A, PA);
+ //pack_rowmajor_notrans_rhs(nr, n, k, n, B, PB);
+#if 1
+ for (int j = 0; j < nn; j++)
+ {
+ const int _nb = (j != nn - 1 || rn == 0) ? nb : rn;
+ for (int l = 0; l < nk; l++)
+ {
+ const int _kb = (l != nk - 1 || rk == 0) ? kb : rk;
+ pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + j * nb], PB);
+ for(int i = 0; i < nm; i++)
+ {
+ const int _mb = (i != nm - 1 || rm == 0) ? mb : rm;
+ pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[i * mb * k + l * kb], PA);
+ sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, PA, PB, &C[i * mb * n + j * nb], l, n, _kb);
+ //sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk);
+ }
+ }
+ }
+#else
+ for (int j = 0; j < nm; j++)
+ {
+ const int _mb = (j != nm - 1 || rm == 0) ? mb : rm;
+ for (int l = 0; l < nk; l++)
+ {
+ const int _kb = (l != nk - 1 || rk == 0) ? kb : rk;
+ pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[j * mb * k + l * kb], PA);
+ for(int i = 0; i < nn; i++)
+ {
+ const int _nb = (i != nn - 1 || rn == 0) ? nb : rn;
+ pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + i * nb], PB);
+ sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, PA, PB, &C[j * mb * n + i * nb], l, n, _kb);
+ //sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk);
+ }
+ }
+ }
+#endif
+ gettimeofday(&end, NULL);
+ total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec))/1000;
+ }
+
+ int div = m * n < 16 ? m * n : 16;
+ int num = m * n > 64 ? 64 : m * n;
+
+ float *c_ptr = &C[0];
+ for(int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if((i + 1) % div == 0) printf("\n");
+ }
+
+ printf("\n");
+
+ c_ptr = &C[m * n - num];
+ for(int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if((i + 1) % div == 0) printf("\n");
+ }
+
+ printf("\n");
+
+ long long total_size = (long long)m *n * k * 2;
+ printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops , total_size, (double)total_size/(total_time / loops)/1000000);
+
+ free(A);
+ free(B);
+ free(C);
+
+ //free(PA);
+ //free(PB);
+
+}
+#endif
+
+static int test_sgemm(int m, int n, int k, int type, int loops)
+{
+ struct timeval start, end;
+ float total_time = 0.f;
+
+ // printf("1.\n");
+
+ float *A = (float *)malloc(m * k * sizeof(float));
+ if (!A)
+ return 0;
+
+ for (int i = 0; i < m * k; i++)
+ {
+ A[i] = 0.001 + i * 0.001; // i * 0.000001;
+ }
+
+ float *B = (float *)malloc(k * n * sizeof(float));
+ if (!B)
+ return 0;
+
+ for (int i = 0; i < n * k; i++)
+ {
+ B[i] = 0.001 - i * 0.001; // - i * 0.000001;
+ }
+
+ float *C = (float *)malloc(m * n * sizeof(float));
+ if (!C)
+ return 0;
+
+  for (int nloop = 0; nloop < loops; nloop++)
+  {
+ gettimeofday(&start, NULL);
+
+ if (type == 0)
+ {
+ // direct_sgemm_rowmajor(notrans, notrans, m, n, k, A, B, C);
+ direct_sgemm_colmajor(notrans, notrans, m, n, k, A, B, C);
+ }
+
+ else if (type == 1)
+ {
+ class sgemm_singlethread my_gemm(colMajor, notrans, notrans, m, n, k, A, B, C, 1);
+ my_gemm.run();
+ }
+
+ /*else if(type == 2)
+ {
+ for(int i = 0; i < m / 8; i++)
+ {
+ for(int j = 0; j < n / 8; j++)
+ {
+ direct_sgemm_kernel(k, k, n, n, A + i * 8 * k, B + j * 8, C + i * 8 * n + j * 8);
+ }
+ }
+ }*/
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+
+ int div = m * n < 16 ? m * n : 16;
+ int num = m * n > 64 ? 64 : m * n;
+
+ float *c_ptr = &C[0];
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ c_ptr = &C[m * n - num];
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ long long total_size = (long long)m * n * k * 2;
+  printf("Average time: %.2f ms, total FLOPs: %lld (GFLOPS: %.2f)\n", total_time / loops,
+         total_size, (double)total_size / (total_time / loops) / 1000000);
+
+ free(A);
+ free(B);
+ free(C);
+
+ return 0;
+}
+
+void weight_tensorflow2caffe(float *out, float *in, int H, int W, int C, int N)
+{ // HWCN ---> NCHW
+ for (int h = 0; h < H; ++h)
+ {
+ for (int w = 0; w < W; ++w)
+ {
+ for (int c = 0; c < C; ++c)
+ {
+ for (int n = 0; n < N; ++n)
+ {
+ int index_in = h * W * C * N + w * C * N + c * N + n;
+ int index_out = n * C * H * W + c * H * W + h * W + w;
+ // printf("%3d <--- %3d\n", index_out, index_in);
+ out[index_out] = in[index_in];
+ }
+ }
+ }
+ }
+}
+
+void trans_weight2winograd(const convMat_t &_kernel, float **winograd_weight)
+{
+ const double *G;
+ const int kernel_size = _kernel.h;
+ const int channels = _kernel.c;
+ const int num_output = _kernel.n;
+
+ int tile_h_in_, tile_w_in_;
+ int M, N;
+
+  /* Step 1: transform the weights into the Winograd domain. */
+ if (kernel_size == 3)
+ {
+ M = winograd_para_3x3s1::M;
+ N = winograd_para_3x3s1::N;
+ G = winograd_para_3x3s1::getG();
+ }
+ else
+ {
+ M = winograd_para_5x5s1::M;
+ N = winograd_para_5x5s1::N;
+ G = winograd_para_5x5s1::getG();
+ }
+
+ tile_h_in_ = tile_w_in_ = M;
+
+ float *winograd_g = new float[M * M * N * N];
+ if (NULL == winograd_g)
+ return;
+ kronecker_product(winograd_g, G, G, M, N, M, N);
+
+ *winograd_weight = new float[tile_h_in_ * tile_w_in_ * channels * num_output];
+
+ if (NULL == *winograd_weight)
+ return;
+
+ float *weight_data_tran = new float[_kernel.h * _kernel.w * _kernel.c * _kernel.n];
+ if (NULL == weight_data_tran)
+ return;
+ weight_tensorflow2caffe(weight_data_tran, _kernel.data, kernel_size, kernel_size, channels,
+ num_output);
+
+ class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_,
+ channels * num_output, kernel_size * kernel_size, winograd_g,
+ weight_data_tran, *winograd_weight, 1);
+
+ sgemm.run();
+
+ delete[] weight_data_tran;
+
+  /* Once the weights are in the Winograd domain, the original weight data is no longer needed. */
+ delete[] winograd_g;
+}
+
+static int test_conv(const int w, const int h, const int kernel_size, const int stride,
+ const int inch, const int outch, const int padding, const int conv_type,
+ const int thread_num, const int loops)
+{
+ struct timeval start, end;
+ float total_time = 0.f;
+
+ struct timeval start1, end1;
+ float total_time1 = 0.f;
+
+ const int dilation = 1;
+
+ const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+ convMat_t input;
+ convMat_t output;
+ convMat_t filter;
+ convParams_t params;
+
+ int pad_l, pad_r, pad_t, pad_b;
+ if (padding)
+ {
+ int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+ int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+ pad_l = pad_w / 2;
+ pad_r = pad_w - pad_l;
+ pad_t = pad_h / 2;
+ pad_b = pad_h - pad_t;
+ }
+ else
+ {
+ pad_l = pad_r = pad_t = pad_b = 0;
+ }
+
+ input.w = w;
+ input.h = h;
+ input.c = inch;
+ input.n = 1;
+#ifdef NCNN
+ input.data =
+ (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float));
+#else
+ input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float));
+#endif
+
+ if (!input.data)
+ return 0;
+
+ output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+ output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+ output.c = outch;
+ output.n = 1;
+#ifdef NCNN
+ output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c *
+ sizeof(float));
+#else
+ output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float));
+#endif
+
+ if (!output.data)
+ return 0;
+
+ for (int i = 0; i < output.w * output.h * output.c; i++)
+ {
+ output.data[i] = 0;
+ }
+
+ filter.w = kernel_size;
+ filter.h = kernel_size;
+ filter.c = inch;
+ filter.n = outch;
+ filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float));
+ if (!filter.data)
+ return 0;
+
+ for (int i = 0; i < input.w * input.h * input.c; i++)
+ {
+ input.data[i] = 0.001 + i * 0.000001;
+ }
+
+#if 1
+ for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+ {
+ filter.data[i] = 0.001 - i * 0.000001;
+ }
+#else
+ for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+ {
+ if ((i + 1) % 15 == 0)
+ filter.data[i] = 0.001 - i * 0.000001;
+ else
+ filter.data[i] = 0;
+ }
+#endif
+ params.kernel_w = kernel_size;
+ params.kernel_h = kernel_size;
+ params.stride_w = stride;
+ params.stride_h = stride;
+ params.padding = padding;
+ params.pad_w = pad_l;
+ params.pad_h = pad_t;
+ params.dilation_w = dilation;
+ params.dilation_h = dilation;
+
+ const int m = output.c;
+ const int n = output.w * output.h;
+ const int k = params.kernel_h * params.kernel_w * input.c;
+
+ // ocl_context_t context;
+ size_t local_min[2];
+ /**
+ if(conv_type == 14 || conv_type == 15 || conv_type == 6)
+ {
+ if(init_gpu(&context) < 0) return -1;
+ //if(conv_type ==14 || conv_type == 5) sgemm_ocltune(&context, m, n, (k < 1024 ? k :
+ 1024), local_min);
+ //else if(conv_type == 6)
+ {
+ if(kernel_size == 3) directconv_3x3S1_tune(&context, &input, &filter, &output,
+ local_min);
+ else if(kernel_size == 1) directconv_1x1S1_tune(&context, &input, &filter, &output,
+ local_min);
+ }
+ //local_min[0] = 1; local_min[1] = 1;
+ }
+ **/
+ if (conv_type == 0)
+ {
+ for (int nloop = 0; nloop < loops; nloop++)
+ {
+ gettimeofday(&start, NULL);
+
+ direct_conv_rowmajor(&input, &output, &filter, &params);
+ // direct_conv_colmajor(&input, &output, &filter, &params);
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+ }
+ else if (conv_type == 1)
+ {
+ for (int nloop = 0; nloop < loops; nloop++)
+ {
+ // printf("nloop = %d, thread_num = %d\n", nloop, thread_num);
+ // class srcn_sgemm my_gemm(input, filter, output, params, thread_num, col_major);
+ gettimeofday(&start, NULL);
+
+ /*if(thread_num == 1)
+ {
+ class conv_sgemm_singlethread my_gemm(input, filter, output, params, col_major);
+ my_gemm.run();
+ }
+ else
+ {
+ class conv_sgemm_multithreads my_gemm(input, filter, output, params, thread_num,
+ col_major);
+ my_gemm.run();
+ }*/
+
+ srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major);
+
+ // printf("sync\n");
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+ }
+ else if (conv_type == 2)
+ {
+ float *winograd_weight;
+
+ // trans_weight2winograd(filter, &winograd_weight);
+
+ winogradParams_t wparams = {params.kernel_w,
+ params.kernel_h,
+ params.stride_w,
+ params.stride_h,
+ params.dilation_w,
+ params.dilation_h,
+ 1,
+ w,
+ h,
+ input.c,
+ output.c,
+ thread_num,
+ col_major,
+ filter.data};
+ winograd_weight = trans_weight2winograd(wparams);
+
+ for (int nloop = 0; nloop < loops; nloop++)
+ {
+ gettimeofday(&start, NULL);
+
+ // class conv_winograd my_sgemm(input, output, params, col_major, winograd_weight, thread_num,
+ // w * h, n);
+ // my_sgemm.run();
+
+ srcn_convolution2D(input, filter, output, params, winograd_weight, thread_num, row_major);
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+ }
+ else if (conv_type == 3)
+ {
+ void *sparse_weight = trans_weight2sparse(filter);
+
+ for (int nloop = 0; nloop < loops; nloop++)
+ {
+ gettimeofday(&start, NULL);
+
+ srcn_sparse_convolution2D(input, output, params, sparse_weight, thread_num, row_major);
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+
+ sparse_release(outch, sparse_weight);
+  }
+  /**
+else if(conv_type == 4)
+{
+#if 0
+ cl_int err;
+ convlib::load_opencl("./libmali.so");
+ const int mpad = (m + 4 - 1) / 4 * 4;
+ const int npad = (n + 4 - 1) / 4 * 4;
+ cl_mem lhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE |
+CL_MEM_ALLOC_HOST_PTR, mpad * k * sizeof(float), NULL, &err);
+ if(err != CL_SUCCESS)
+ {
+ printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+ return -1;
+ }
+
+ cl_image_format rhs_format = {CL_RGBA, CL_FLOAT};
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D,
+ (size_t)npad / 4,
+ (size_t)k,
+ 0, 0,
+ 0,
+ 0, 0, 0, 0
+ };
+ cl_mem rhs_gpu = convlib::clCreateImage(context.context, CL_MEM_READ_ONLY |
+CL_MEM_ALLOC_HOST_PTR, &rhs_format, &desc, NULL, &err);
+ if(err != CL_SUCCESS)
+ {
+ printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+ return -1;
+ }
+
+ cl_mem rhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE |
+CL_MEM_ALLOC_HOST_PTR, npad * k * sizeof(float), NULL, &err);
+ if(err != CL_SUCCESS)
+ {
+ printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+ return -1;;
+ }
+
+ cl_mem res_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE |
+CL_MEM_ALLOC_HOST_PTR, mpad * npad * sizeof(float), NULL, &err);
+ if(err != CL_SUCCESS)
+ {
+ printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+ return -1;
+ }
+#endif
+ for(int nloop = 0; nloop < loops + 1; nloop++)
+ {
+ gettimeofday(&start, NULL);
+
+ //cl_mem _res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, &params, local_min,
+lhs_gpu, rhs_gpu, res_gpu);
+
+ //get_result_gpu(&context, output.data + gpu_data_off, _res_gpu, m, n);
+ srcn_convolution2D_gpu(input, filter, output, params, row_major);
+
+ gettimeofday(&end, NULL);
+
+ if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000
++ start.tv_usec))/1000;
+ }
+}
+else if(conv_type == 5)
+{
+
+ for(int nloop = 0; nloop < loops + 1; nloop++)
+ {
+ gettimeofday(&start, NULL);
+
+ //cl_mem res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, &params, local_min);
+
+ //clFlush(context.cmdQueue);
+ gettimeofday(&start1, NULL);
+ #if 1
+ srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major
+
+ #endif
+ //usleep(80 * 1000);
+ gettimeofday(&end1, NULL);
+ total_time1 += ((end1.tv_sec * 1000000 + end1.tv_usec) - (start1.tv_sec * 1000000 +
+start1.tv_usec))/1000;
+
+ //get_result_gpu(&context, output.data + gpu_data_off, res_gpu, m, n);
+
+ srcn_convolution2D_dpu(input, filter, output, params, row_major);
+
+ gettimeofday(&end, NULL);
+ if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000
++ start.tv_usec))/1000;
+ }
+}
+else if(conv_type == 6)
+{
+ for(int nloop = 0; nloop < loops; nloop++)
+ {
+ gettimeofday(&start, NULL);
+
+ if(kernel_size == 3 && stride == 1 && padding == 0)
+ {
+ conv2D_gpu_directconv_3x3S1(&context, &input, &filter, &output, &params, local_min);
+ }
+ else if(kernel_size == 1 && stride == 1 && padding == 0)
+ {
+ conv2D_gpu_directconv_1x1S1(&context, &input, &filter, &output, &params, local_min);
+ }
+
+ gettimeofday(&end, NULL);
+ total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 +
+start.tv_usec))/1000;
+ }
+}**/
+
+ int div = m * n < 16 ? m * n : 16;
+ int num = m * n > 64 ? 64 : m * n;
+
+ if (conv_type < 4)
+ printf("[CPU RESULT]\n");
+ else if (conv_type == 4)
+ printf("[GPU RESULT]\n");
+ else if (conv_type == 5)
+ printf("[DPU RESULT]\n");
+ float *c_ptr = output.data;
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ c_ptr = &output.data[m * n - num];
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ long long total_size = (long long)m * n * k * 2;
+ printf(
+ "AVER Time consuming: %.2fms, CPU Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n",
+ total_time / loops, total_time1 / loops, total_size,
+ (double)total_size / (total_time / loops) / 1000000);
+
+ free(input.data);
+ free(output.data);
+ free(filter.data);
+
+ return 0;
+}
+
+static int test_deconv(const int w, const int h, const int kernel_size, const int stride,
+ const int inch, const int outch, const int padding, const int conv_type,
+ const int thread_num, const int loops)
+{
+ struct timeval start, end;
+ float total_time = 0.f;
+
+ const int dilation = 1;
+
+ const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+ convMat_t input;
+ convMat_t output;
+ convMat_t filter;
+ convParams_t params;
+
+ int pad_l, pad_r, pad_t, pad_b;
+ if (padding)
+ {
+ int pad_w = kernel_dilation - 1;
+ int pad_h = kernel_dilation - 1;
+ pad_l = pad_w / 2;
+ pad_r = pad_w - pad_l;
+ pad_t = pad_h / 2;
+ pad_b = pad_h - pad_t;
+ }
+ else
+ {
+ pad_l = pad_r = pad_t = pad_b = 0;
+ }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  input.n = 1;
+  input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float));
+ if (!input.data)
+ return 0;
+
+ // output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+ // output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+  output.w = stride * (w - 1) + kernel_dilation - (pad_l + pad_r);
+  output.h = stride * (h - 1) + kernel_dilation - (pad_t + pad_b);
+  output.c = outch;
+  output.n = 1;
+  output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float));
+ if (!output.data)
+ return 0;
+
+ filter.w = kernel_size;
+ filter.h = kernel_size;
+ filter.c = outch;
+ filter.n = inch;
+ filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float));
+ if (!filter.data)
+ return 0;
+
+ for (int i = 0; i < input.w * input.h * input.c; i++)
+ {
+ input.data[i] = 0.001 + i * 0.000001;
+ }
+
+ for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+ {
+ filter.data[i] = 0.001 - i * 0.000001;
+ }
+
+ params.kernel_w = kernel_size;
+ params.kernel_h = kernel_size;
+ params.stride_w = stride;
+ params.stride_h = stride;
+ params.padding = padding;
+ params.pad_w = pad_l;
+ params.pad_h = pad_t;
+ params.dilation_w = dilation;
+ params.dilation_h = dilation;
+
+ const int m = params.kernel_h * params.kernel_w * output.c;
+ const int n = input.w * input.h;
+ const int k = input.c;
+
+ if (conv_type == 0)
+ {
+ for (int nloop = 0; nloop < loops; nloop++)
+
+ {
+ gettimeofday(&start, NULL);
+
+ direct_deconv_rowmajor(&input, &output, &filter, &params);
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+ }
+ else if (conv_type == 1)
+ {
+ for (int nloop = 0; nloop < loops; nloop++)
+
+ {
+ gettimeofday(&start, NULL);
+
+ for (int i = 0; i < output.w * output.h * output.c; i++)
+ {
+ output.data[i] = 0;
+ }
+
+ srcn_deconvolution2D(input, filter, output, params, thread_num, row_major);
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+ }
+
+ const int output_size = output.w * output.h * output.c;
+
+ int div = output_size < 16 ? output_size : 16;
+ int num = output_size > 64 ? 64 : output_size;
+
+ float *c_ptr = output.data;
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ c_ptr = &output.data[output_size - num];
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ long long total_size = (long long)m * n * k * 2;
+ printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+ total_size, (double)total_size / (total_time / loops) / 1000000);
+
+ free(input.data);
+ free(output.data);
+ free(filter.data);
+
+ return 0;
+}
+
+static int test_batch_conv(const int batch, const int w, const int h, const int kernel_size,
+ const int stride, const int inch, const int outch, const int padding,
+ const int conv_type, const int thread_num, const int loops)
+{
+ struct timeval start, end;
+ float total_time = 0.f;
+
+ const int dilation = 1;
+
+ const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+ convMat_t input;
+ convMat_t output;
+ convMat_t filter;
+ convParams_t params;
+
+ int pad_l, pad_r, pad_t, pad_b;
+ if (padding)
+ {
+ int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+ int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+ pad_l = pad_w / 2;
+ pad_r = pad_w - pad_l;
+ pad_t = pad_h / 2;
+ pad_b = pad_h - pad_t;
+ }
+ else
+ {
+ pad_l = pad_r = pad_t = pad_b = 0;
+ }
+
+ input.w = w;
+ input.h = h;
+ input.c = inch;
+ input.n = batch;
+ input.data = (float *)malloc(input.n * input.w * input.h * input.c * sizeof(float));
+ if (!input.data)
+ return 0;
+
+ output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+ output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+ output.c = outch;
+ output.n = batch;
+ output.data = (float *)malloc(output.n * output.w * output.h * output.c * sizeof(float));
+ if (!output.data)
+ return 0;
+
+ filter.w = kernel_size;
+ filter.h = kernel_size;
+ filter.c = inch;
+ filter.n = outch;
+ filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float));
+ if (!filter.data)
+ return 0;
+
+ for (int i = 0; i < input.w * input.h * input.c * input.n; i++)
+ {
+ input.data[i] = 0.001 + i * 0.000001;
+ }
+
+ for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+ {
+ filter.data[i] = 0.001 - i * 0.000001;
+ }
+
+ params.kernel_w = kernel_size;
+ params.kernel_h = kernel_size;
+ params.stride_w = stride;
+ params.stride_h = stride;
+ params.padding = padding;
+ params.pad_w = pad_l;
+ params.pad_h = pad_t;
+ params.dilation_w = dilation;
+ params.dilation_h = dilation;
+
+ const int m = output.c;
+ const int n = output.w * output.h;
+ const int k = params.kernel_h * params.kernel_w * input.c;
+
+ if (conv_type == 1)
+ {
+ for (int nloop = 0; nloop < loops; nloop++)
+
+ {
+ // printf("nloop = %d, thread_num = %d\n", nloop, thread_num);
+ // class srcn_sgemm my_gemm(input, filter, output, params, thread_num, col_major);
+
+ gettimeofday(&start, NULL);
+
+ srcn_batch_convolution2D(input, filter, output, params, NULL, thread_num, col_major);
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+ }
+ else if (conv_type == 2)
+ {
+ float *winograd_weight;
+
+ // trans_weight2winograd(filter, &winograd_weight);
+
+ winogradParams_t wparams = {params.kernel_w,
+ params.kernel_h,
+ params.stride_w,
+ params.stride_h,
+ params.dilation_w,
+ params.dilation_h,
+ input.n,
+ w,
+ h,
+ input.c,
+ output.c,
+ thread_num,
+ col_major,
+ filter.data};
+ winograd_weight = trans_weight2winograd(wparams);
+
+ for (int nloop = 0; nloop < loops; nloop++)
+
+ {
+ gettimeofday(&start, NULL);
+
+ srcn_batch_convolution2D(input, filter, output, params, winograd_weight, thread_num,
+ col_major);
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+ }
+ }
+
+ int div = m * n < 16 ? m * n : 16;
+ int num = m * n > 64 ? 64 : m * n;
+
+ float *c_ptr = output.data;
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ c_ptr = &output.data[m * n * batch - num];
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ long long total_size = (long long)batch * m * n * k * 2;
+ printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+ total_size, (double)total_size / (total_time / loops) / 1000000);
+
+ free(input.data);
+ free(output.data);
+ free(filter.data);
+
+ return 0;
+}
+
+static int test_depthwise_conv(const int w, const int h, const int kernel_size, const int stride,
+ const int inch, const int outch, const int padding,
+ const int conv_type, const int thread_num, const int loops)
+{
+ if (outch != inch)
+ return -1;
+ struct timeval start, end;
+ float total_time = 0.f;
+
+ const int dilation = 1;
+
+ const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+ convMat_t input;
+ convMat_t output;
+ convMat_t filter;
+ convMat_t bias;
+ convParams_t params;
+
+ int pad_l, pad_r, pad_t, pad_b;
+ if (padding)
+ {
+ int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+ int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+ pad_l = pad_w / 2;
+ pad_r = pad_w - pad_l;
+ pad_t = pad_h / 2;
+ pad_b = pad_h - pad_t;
+ }
+ else
+ {
+ pad_l = pad_r = pad_t = pad_b = 0;
+ }
+
+ input.w = w;
+ input.h = h;
+ input.c = inch;
+ input.n = 1;
+#ifdef NCNN
+ input.data =
+ (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float));
+#else
+ input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float));
+#endif
+ if (!input.data)
+ return 0;
+
+ output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+ output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+ output.c = outch;
+ output.n = 1;
+
+#ifdef NCNN
+ output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c *
+ sizeof(float));
+#else
+ output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float));
+#endif
+ const int gpu_data_off = output.w * output.h * output.c;
+ if (!output.data)
+ return 0;
+
+ for (int i = 0; i < output.w * output.h * output.c; i++)
+ {
+ output.data[i] = 1.f;
+ }
+
+ filter.w = kernel_size;
+ filter.h = kernel_size;
+ filter.c = 1;
+ filter.n = outch;
+ filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float));
+ if (!filter.data)
+ return 0;
+
+ for (int i = 0; i < input.w * input.h * input.c; i++)
+ {
+ input.data[i] = 0.001 + i * 0.000001;
+ }
+
+ for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+ {
+ filter.data[i] = 0.001 - i * 0.000001;
+ }
+
+  bias.w = outch;
+  bias.h = 1;
+  bias.c = 1;
+  bias.n = 1;
+  bias.data = (float *)malloc(bias.w * sizeof(float));
+ if (!bias.data)
+ return 0;
+ for (int i = 0; i < bias.w; i++)
+ {
+ bias.data[i] = 0.f;
+ }
+
+ params.kernel_w = kernel_size;
+ params.kernel_h = kernel_size;
+ params.stride_w = stride;
+ params.stride_h = stride;
+ params.padding = padding;
+ params.pad_w = pad_l;
+ params.pad_h = pad_t;
+ params.dilation_w = dilation;
+ params.dilation_h = dilation;
+
+ const int m = output.c;
+ const int n = output.w * output.h;
+ const int k = params.kernel_h * params.kernel_w * input.c;
+
+ // ocl_context_t context;
+ size_t local_min[2] = {4, 4};
+ /**
+ if(conv_type == 1)
+ {
+ if(init_gpu(&context) < 0) return -1;
+ depthwise_conv_3x3S1_tune(&context, &input, &filter, &output, local_min);
+ }**/
+
+ gettimeofday(&start, NULL);
+ if (conv_type == 0)
+ srcn_depthwise_conv(input, filter, output, bias, params, 4,
+ row_major); // convdw3x3s1_neon(input, output, filter, filter);
+ // else if(conv_type == 1) depthwise_conv_gpu3x3S1(&context, &input, &filter, &output, &params,
+ // local_min);
+ else if (conv_type == 2)
+ {
+ for (int i = 0; i < input.c; i++)
+ {
+ convMat_t _input;
+ convMat_t _output;
+ convMat_t _filter;
+ convParams_t _params = params;
+
+ _input.w = input.w;
+ _input.h = input.h;
+ _input.c = 1;
+ _input.n = 1;
+#ifdef NCNN
+ _input.data = input.data + i * alignSize(input.w * input.h, 16 / sizeof(float));
+#else
+ _input.data = input.data + i * input.w * input.h;
+#endif
+
+ _output.w = output.w;
+ _output.h = output.h;
+ _output.c = 1;
+ _output.n = 1;
+#ifdef NCNN
+ _output.data = output.data + i * alignSize(output.w * output.h, 16 / sizeof(float));
+#else
+ _output.data = output.data + i * output.w * output.h;
+#endif
+ _filter.w = filter.w;
+ _filter.h = filter.h;
+ _filter.c = 1; // filter.c;
+ _filter.n = 1; // filter.n;
+ _filter.data = filter.data + i * 9;
+
+ srcn_convolution2D(_input, _filter, _output, _params, NULL, 1, row_major);
+ // direct_conv_rowmajor(&_input, &_output, &_filter, &_params);
+ }
+ }
+
+ gettimeofday(&end, NULL);
+ total_time +=
+ ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+
+ int div = m * n < 16 ? m * n : 16;
+ int num = m * n > 64 ? 64 : m * n;
+
+ if (conv_type == 0)
+ printf("[CPU RESULT]\n");
+ else if (conv_type == 1)
+ printf("[GPU RESULT]\n");
+ float *c_ptr = output.data;
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ c_ptr = &output.data[m * n - num];
+ for (int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if ((i + 1) % div == 0)
+ printf("\n");
+ }
+
+ printf("\n");
+
+ long long total_size = (long long)m * n * k * 2;
+ printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+ total_size, (double)total_size / (total_time / loops) / 1000000);
+
+ free(input.data);
+ free(output.data);
+ free(filter.data);
+ free(bias.data);
+
+ return 0;
+}
+
+//#define TEST_SGEMM
+#define TEST_CONV
+//#define TEST_DECONV
+//#define TEST_BATCH_CONV
+//#define TEST_DEPTHWISE_CONV
+
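+/*
+ * Example invocations (a sketch; the binary name "conv_test" is illustrative):
+ *   TEST_CONV / TEST_DECONV / TEST_DEPTHWISE_CONV:
+ *     ./conv_test <w> <h> <kernel> <stride> <outch> <inch> <padding> <conv_type> <threads> [loops]
+ *     e.g. ./conv_test 112 112 3 1 64 32 1 1 4 10
+ *   TEST_BATCH_CONV:
+ *     ./conv_test <batch> <w> <h> <kernel> <stride> <outch> <inch> <padding> <conv_type> <threads> [loops]
+ * For test_conv, conv_type selects: 0 = direct, 1 = im2col + GEMM, 2 = Winograd, 3 = sparse.
+ */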
+int main(int argc, char **argv)
+{
+#ifdef TEST_SGEMM
+ if (argc < 6)
+ return 0;
+
+ const int m = atoi(argv[1]);
+ const int n = atoi(argv[2]);
+ const int k = atoi(argv[3]);
+ const int type = atoi(argv[4]);
+ const int loops = atoi(argv[5]);
+
+ test_sgemm(m, n, k, type, loops);
+#elif (defined TEST_CONV)
+ if (argc < 10)
+ return 0;
+ const int w = atoi(argv[1]);
+ const int h = atoi(argv[2]);
+ const int kernel_size = atoi(argv[3]);
+ const int stride = atoi(argv[4]);
+ const int outch = atoi(argv[5]);
+ const int inch = atoi(argv[6]);
+ const int padding = atoi(argv[7]);
+ const int conv_type = atoi(argv[8]);
+ const int thread_num = atoi(argv[9]);
+ int loops = 1;
+ if (argc > 10)
+ loops = atoi(argv[10]);
+ test_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops);
+#elif (defined TEST_DECONV)
+ if (argc < 10)
+ return 0;
+ const int w = atoi(argv[1]);
+ const int h = atoi(argv[2]);
+ const int kernel_size = atoi(argv[3]);
+ const int stride = atoi(argv[4]);
+ const int outch = atoi(argv[5]);
+ const int inch = atoi(argv[6]);
+ const int padding = atoi(argv[7]);
+ const int conv_type = atoi(argv[8]);
+ const int thread_num = atoi(argv[9]);
+ int loops = 1;
+ if (argc > 10)
+ loops = atoi(argv[10]);
+ test_deconv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops);
+#elif (defined TEST_BATCH_CONV)
+ if (argc < 11)
+ return 0;
+ const int batch = atoi(argv[1]);
+ const int w = atoi(argv[2]);
+ const int h = atoi(argv[3]);
+ const int kernel_size = atoi(argv[4]);
+ const int stride = atoi(argv[5]);
+ const int outch = atoi(argv[6]);
+ const int inch = atoi(argv[7]);
+ const int padding = atoi(argv[8]);
+ const int conv_type = atoi(argv[9]);
+ const int thread_num = atoi(argv[10]);
+ int loops = 1;
+ if (argc > 11)
+ loops = atoi(argv[11]);
+ test_batch_conv(batch, w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num,
+ loops);
+#elif (defined TEST_DEPTHWISE_CONV)
+ if (argc < 10)
+ return 0;
+ const int w = atoi(argv[1]);
+ const int h = atoi(argv[2]);
+ const int kernel_size = atoi(argv[3]);
+ const int stride = atoi(argv[4]);
+ const int outch = atoi(argv[5]);
+ const int inch = atoi(argv[6]);
+ const int padding = atoi(argv[7]);
+ const int conv_type = atoi(argv[8]);
+ const int thread_num = atoi(argv[9]);
+ int loops = 1;
+ if (argc > 10)
+ loops = atoi(argv[10]);
+ test_depthwise_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num,
+ loops);
+#endif
+
+ return 0;
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/srcn_conv.cc b/compute/ncnn/src/srcn/srcn_conv.cc
new file mode 100644
index 000000000..bb8e4f13e
--- /dev/null
+++ b/compute/ncnn/src/srcn/srcn_conv.cc
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+#include "sgemm_singlethread.h"
+#include "conv_sgemm_singlethread.h"
+#include "conv_sgemm_multithreads.h"
+#include "conv_winograd.h"
+#include "direct_conv_colmajor.h"
+#include "winograd.h"
+
+#include "deconv_sgemm_multithreads.h"
+#include "conv_sparse.h"
+#include "conv_winograd_batch.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+static inline void weight_transfer(float *out, float *in, int H, int W, int C, int N)
+{
+ // HWCN ---> NCHW
+ for (int h = 0; h < H; ++h)
+ {
+ for (int w = 0; w < W; ++w)
+ {
+ for (int c = 0; c < C; ++c)
+ {
+ for (int n = 0; n < N; ++n)
+ {
+ int index_in = h * W * C * N + w * C * N + c * N + n;
+ int index_out = n * C * H * W + c * H * W + h * W + w;
+ out[index_out] = in[index_in];
+ }
+ }
+ }
+ }
+}
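+
+/* Example of the mapping above (a sketch with arbitrary shapes H = W = 3, C = 4, N = 8):
+ * the element at (h = 1, w = 0, c = 2, n = 3) sits at
+ *   HWCN index : 1*3*4*8 + 0*4*8 + 2*8 + 3 = 115
+ *   NCHW index : 3*4*3*3 + 2*3*3 + 1*3 + 0 = 129
+ * i.e. out[129] = in[115]. */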
+
+int check_winograd(winogradParams_t &params)
+{
+ int winograd_flag =
+ ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) &&
+ (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) &&
+ (params.dilation_w == 1) && (params.dilation_h == 1));
+
+ int winograd_channel_cond = 64 * 64;
+ int winograd_image_cond = 10 * 10;
+
+#ifdef TIZEN
+ if (params.num_threads > 1)
+ {
+ winograd_channel_cond = 128 * 128;
+ winograd_image_cond = 20 * 20;
+ }
+#endif // TIZEN
+
+ winograd_flag &= (params.inch * params.outch >= winograd_channel_cond);
+
+ if (params.w > 0 && params.h > 0 && params.batch == 1)
+ {
+ winograd_flag &= (params.w * params.h >= winograd_image_cond);
+ }
+
+ return winograd_flag;
+}
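+
+/* check_winograd() returns nonzero only for 3x3 or 5x5 kernels with stride 1 and no
+ * dilation, and only when inch * outch (and, for batch == 1, w * h) exceed the thresholds
+ * above (raised for multi-threaded Tizen builds). Callers can use it to decide whether
+ * calling trans_weight2winograd() is worthwhile; see the sketch after winograd_release()
+ * below. */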
+
+float *trans_weight2winograd(winogradParams_t &params, unsigned int *size = NULL)
+{
+ int M, N;
+ const double *G;
+
+ float *winograd_weight;
+
+ int winograd_channel_cond = 64 * 64;
+ int winograd_image_cond = 10 * 10;
+
+#ifdef TIZEN
+ if (params.num_threads > 1)
+ {
+ winograd_channel_cond = 128 * 128;
+ // int winograd_image_cond = 20 * 20;
+ }
+#endif // TIZEN
+
+ int winograd_flag =
+ ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) &&
+ (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) &&
+ (params.dilation_w == 1) && (params.dilation_h == 1));
+ if (!winograd_flag)
+ return NULL;
+
+ winograd_flag = (params.inch * params.outch >= winograd_channel_cond);
+
+ if (!winograd_flag)
+ return NULL;
+
+ if (params.w > 0 && params.h > 0 && params.batch == 1)
+ {
+ winograd_flag &= (params.w * params.h >= winograd_image_cond);
+ if (!winograd_flag)
+ return NULL;
+ }
+
+ const int kernel_size = params.kernel_w;
+ const int inch = params.inch;
+ const int outch = params.outch;
+ float *weight_data = params.weight_data;
+
+ /*Step 1: transfer weight to winograd domain*/
+ if (kernel_size == 3)
+ {
+ if (params.w == 4 && params.batch > 1)
+ {
+ M = winograd_para_3x3s1_2::M;
+ N = winograd_para_3x3s1_2::N;
+ G = winograd_para_3x3s1_2::getG();
+ }
+ else
+ {
+ M = winograd_para_3x3s1::M;
+ N = winograd_para_3x3s1::N;
+ G = winograd_para_3x3s1::getG();
+ }
+ }
+ else
+ {
+ M = winograd_para_5x5s1::M;
+ N = winograd_para_5x5s1::N;
+ G = winograd_para_5x5s1::getG();
+ }
+
+ int tile_h_in_, tile_w_in_;
+ tile_h_in_ = tile_w_in_ = M;
+
+ if (size)
+ *size = tile_h_in_ * tile_w_in_ * inch * outch;
+
+ winograd_weight = new float[tile_h_in_ * tile_w_in_ * inch * outch];
+ if (!winograd_weight)
+ return NULL;
+
+ float *winograd_g = new float[M * M * N * N];
+ if (!winograd_g)
+ {
+ delete[] winograd_weight;
+ return NULL;
+ }
+
+ kronecker_product(winograd_g, G, G, M, N, M, N);
+
+ if (params.conv_type == col_major)
+ {
+ weight_data = new float[kernel_size * kernel_size * inch * outch];
+ if (!weight_data)
+ {
+ delete[] winograd_weight;
+ delete[] winograd_g;
+ return NULL;
+ }
+ weight_transfer(weight_data, params.weight_data, kernel_size, kernel_size, inch, outch);
+ }
+
+ class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_, inch * outch,
+ kernel_size * kernel_size, winograd_g, weight_data,
+ winograd_weight, 1);
+
+ sgemm.run();
+
+ if (params.conv_type == col_major)
+ delete[] weight_data;
+
+ delete[] winograd_g;
+
+ return winograd_weight;
+}
+
+void winograd_release(float *winograd_weight)
+{
+ if (winograd_weight)
+ delete[] winograd_weight;
+}
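+
+/* A minimal usage sketch of the Winograd path (variable names are illustrative, not part
+ * of this API), mirroring the conv_type == 2 branch of the test program in this patch,
+ * which builds the params with col_major but runs the convolution as row_major:
+ *
+ *   winogradParams_t wp = {3, 3, 1, 1, 1, 1,              // kernel, stride, dilation
+ *                          1, in.w, in.h, in.c, out.c,    // batch, image and channel sizes
+ *                          num_threads, col_major, filter.data};
+ *   float *wino_w = check_winograd(wp) ? trans_weight2winograd(wp) : NULL;
+ *   // srcn_convolution2D falls back to SGEMM when wino_w is NULL.
+ *   srcn_convolution2D(in, filter, out, params, wino_w, num_threads, row_major);
+ *   winograd_release(wino_w);
+ */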
+
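+/* Dispatch policy of srcn_convolution2D (summarizing the branches below):
+ *  - Winograd: winograd_weight != NULL, 3x3 or 5x5 kernel, stride 1, no dilation, and
+ *    outw * outh and inch * outch above the thresholds (raised for multi-threaded Tizen
+ *    builds).
+ *  - Direct col-major convolution: col_major layout with a very small output or a very
+ *    small channel product.
+ *  - Otherwise: im2col + SGEMM, single- or multi-threaded.
+ * In the multi-threaded Winograd case the work is partitioned either across output rows
+ * or across output channels, depending on the layout and the output depth. */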
+void srcn_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, const float *winograd_weight, int num_threads,
+ convType_t conv_type)
+{
+ const int outw = out_mat.w;
+ const int outh = out_mat.h;
+ const int inch = in_mat.c;
+ const int outch = out_mat.c;
+
+ int winograd_flag =
+ ((in_param.kernel_w == in_param.kernel_h) && (in_param.stride_w == in_param.stride_h) &&
+ (in_param.kernel_w == 3 || in_param.kernel_w == 5) && (in_param.stride_w == 1) &&
+ (winograd_weight) && (in_param.dilation_w == 1) && (in_param.dilation_h == 1));
+
+ int direct_flag = ((conv_type == col_major) && (in_param.stride_w == in_param.stride_h) &&
+ (in_param.dilation_w == 1) && (in_param.dilation_h == 1));
+
+ int winograd_image_cond = 10 * 10;
+ int winograd_channel_cond = 64 * 64;
+ int direct_image_cond = 4 * 4;
+ int direct_channel_cond = 16 * 16;
+
+#ifdef TIZEN
+ if (num_threads > 1)
+ {
+ winograd_image_cond = 20 * 20;
+ winograd_channel_cond = 128 * 128;
+ }
+#endif
+
+ winograd_flag &=
+ ((outw * outh >= winograd_image_cond) && (inch * outch >= winograd_channel_cond));
+ direct_flag &= ((outw * outh <= direct_image_cond) || (inch * outch <= direct_channel_cond));
+
+ if (num_threads == 1)
+ {
+ if (winograd_flag)
+ {
+ class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, num_threads,
+ in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ else if (direct_flag)
+ {
+ direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads);
+ }
+ else
+ {
+ class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type);
+ conv.run();
+ }
+ }
+ else if (num_threads > 1)
+ {
+ if (winograd_flag)
+ {
+ const int npart = num_threads > 4 ? 4 : num_threads;
+
+ omp_set_num_threads(npart);
+
+ if (conv_type == col_major)
+ {
+ if (outch < 512)
+ {
+ const int _H = (outh + npart - 1) / npart;
+
+ if (_H < in_param.pad_h)
+ {
+ class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1,
+ in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ return;
+ }
+
+ // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w;
+ // const int oh = _H;
+ const int nh = (outh + _H - 1) / _H;
+ int rh = outh % _H;
+ if (rh == 0)
+ rh = _H;
+
+#pragma omp parallel for
+ for (int i = 0; i < nh; i++)
+ {
+ int pad_h_part = 0;
+ convMat_t in_part;
+ convMat_t out_part;
+ const int oh = (i != nh - 1 || rh == 0) ? _H : rh;
+ const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w;
+
+ in_part.w = in_mat.w;
+ in_part.c = inch;
+ out_part.w = outw;
+ out_part.c = outch;
+ in_part.h = ih;
+ out_part.h = oh;
+
+ int bottom_offset = i * _H - in_param.pad_h;
+ if (bottom_offset < 0)
+ {
+ bottom_offset = 0;
+ pad_h_part = in_param.pad_h;
+ }
+ in_part.data = in_mat.data + bottom_offset * in_mat.w * inch * in_param.stride_w;
+ if (ih + bottom_offset > in_mat.h)
+ {
+ in_part.h = in_mat.h - bottom_offset;
+ }
+
+ out_part.data = out_mat.data + i * _H * outw * outch;
+
+ convParams_t params = {
+ in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1,
+ in_param.padding, in_param.pad_w, pad_h_part};
+
+ class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ else
+ {
+ const int _OUTC = (outch + npart - 1) / npart;
+
+ const int nc = (outch + _OUTC - 1) / _OUTC;
+ int rc = out_mat.c % _OUTC;
+ if (rc == 0)
+ rc = _OUTC;
+
+#pragma omp parallel for
+ for (int i = 0; i < nc; i++)
+ {
+ const float *weight_part;
+ convMat_t out_part;
+
+ const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc;
+
+ out_part.w = outw;
+ out_part.h = outh;
+ out_part.c = oc;
+ out_part.data = out_mat.data + i * _OUTC;
+ weight_part = winograd_weight + i * _OUTC * inch;
+ class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ }
+ else if (conv_type == row_major)
+ {
+#ifdef TIZEN
+ if (outch < 512)
+#else // TIZEN
+ if (outh >= 20)
+#endif // TIZEN
+ {
+ const int _H = (outh + npart - 1) / npart;
+
+ if (_H < in_param.pad_h)
+ {
+ class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1,
+ in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ return;
+ }
+
+ // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w;
+ // const int oh = _H;
+ const int nh = (outh + _H - 1) / _H;
+ int rh = outh % _H;
+ if (rh == 0)
+ rh = _H;
+
+#pragma omp parallel for
+ for (int i = 0; i < nh; i++)
+ {
+ int pad_h_part = 0;
+ convMat_t in_part;
+ convMat_t out_part;
+ const int oh = (i != nh - 1 || rh == 0) ? _H : rh;
+ const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w;
+
+ in_part.w = in_mat.w;
+ in_part.c = inch;
+ out_part.w = outw;
+ out_part.c = outch;
+ in_part.h = ih;
+ out_part.h = oh;
+
+ int bottom_offset = i * _H - in_param.pad_h;
+ if (bottom_offset < 0)
+ {
+ bottom_offset = 0;
+ pad_h_part = in_param.pad_h;
+ }
+ in_part.data = in_mat.data + bottom_offset * in_mat.w * in_param.stride_w;
+ if (ih + bottom_offset > in_mat.h)
+ {
+ in_part.h = in_mat.h - bottom_offset;
+ }
+
+ out_part.data = out_mat.data + i * _H * outw;
+
+            convParams_t params = {
+                in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1,
+                in_param.padding, in_param.pad_w, pad_h_part};
+
+ class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ else
+ {
+ const int _OUTC = (outch + npart - 1) / npart;
+
+ const int nc = (outch + _OUTC - 1) / _OUTC;
+ int rc = out_mat.c % _OUTC;
+ if (rc == 0)
+ rc = _OUTC;
+
+#pragma omp parallel for
+ for (int i = 0; i < nc; i++)
+ {
+ const float *weight_part;
+ convMat_t out_part;
+
+ const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc;
+
+ out_part.w = outw;
+ out_part.h = outh;
+ out_part.c = oc;
+ out_part.data = out_mat.data + i * _OUTC * outw * outh;
+ weight_part = winograd_weight + i * _OUTC * inch;
+ class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ }
+ }
+ else if (direct_flag)
+ {
+ direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads);
+ }
+ else
+ {
+ class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads,
+ conv_type);
+ conv.run();
+ }
+ }
+}
+
+void srcn_deconvolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, int num_threads, convType_t conv_type)
+{
+ class deconv_sgemm_multithreads deconv(in_mat, weights_mat, out_mat, in_param, num_threads,
+ conv_type);
+ deconv.run();
+}
+
+void *trans_weight2sparse(const convMat_t &weights_mat)
+{
+ const int kernel_w = weights_mat.w;
+ const int kernel_h = weights_mat.h;
+ const int inch = weights_mat.c;
+ const int outch = weights_mat.n;
+
+ const int nch = (outch + BCH - 1) / BCH;
+ const int rch = outch % BCH;
+
+ const float *data = weights_mat.data;
+ const int klength = inch * kernel_h * kernel_w;
+
+ sparse_weight_t *sparse_weight = new sparse_weight_t[nch];
+ if (!sparse_weight)
+ return NULL;
+
+ for (int i = 0; i < nch; i++)
+ {
+ int _bch = (i != nch - 1 || rch == 0) ? BCH : rch;
+ sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+ sparse_weight_n->mxk = 0;
+
+ for (int j = 0; j < _bch; j++)
+ {
+ for (int l = 0; l < klength; l++)
+ {
+ float val = *(data + (i * BCH + j) * klength + l);
+ if (val != 0)
+ {
+ sparse_weight_n->mxk++;
+ }
+ }
+ }
+ }
+
+ for (int i = 0; i < nch; i++)
+ {
+ int _bch = (i != nch - 1 || rch == 0) ? BCH : rch;
+ sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+ sparse_weight_n->wdata = new weight_data_t[sparse_weight_n->mxk];
+ int index = 0;
+
+ for (int l = 0; l < klength; l++)
+ {
+ for (int j = 0; j < _bch; j++)
+ {
+ float val = *(data + (i * BCH + j) * klength + l);
+ if (val != 0)
+ {
+ sparse_weight_n->wdata[index].m = i * BCH + j;
+ sparse_weight_n->wdata[index].k = l;
+ sparse_weight_n->wdata[index++].data = val;
+ }
+ }
+ }
+ }
+
+ return (void *)sparse_weight;
+}
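+
+/* A minimal usage sketch of the sparse path (variable names are illustrative), mirroring
+ * the conv_type == 3 branch of the test program in this patch:
+ *
+ *   void *sw = trans_weight2sparse(filter);   // filter.n == outch, filter.c == inch
+ *   srcn_sparse_convolution2D(in, out, params, sw, num_threads, row_major);
+ *   sparse_release(filter.n, sw);             // frees the per-block weight lists
+ */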
+
+void sparse_release(const int outch, void *ptr)
+{
+ sparse_weight_t *sparse_weight = (sparse_weight_t *)ptr;
+ const int nch = (outch + BCH - 1) / BCH;
+
+ if (!sparse_weight)
+ return;
+
+ for (int i = 0; i < nch; i++)
+ {
+ sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+ if (sparse_weight_n->wdata)
+ delete[] sparse_weight_n->wdata;
+ }
+
+ if (sparse_weight)
+ delete[] sparse_weight;
+}
+
+void srcn_sparse_convolution2D(const convMat_t &in_mat, convMat_t &out_mat,
+ const convParams_t &in_param, const void *sparse_weight,
+ int number_threas, convType_t conv_type)
+{
+ class conv_sparse conv(in_mat, out_mat, in_param, (const sparse_weight_t *)sparse_weight,
+ number_threas, conv_type);
+
+ for (int i = 0; i < out_mat.c * out_mat.h * out_mat.w; i++)
+ {
+ *(out_mat.data + i) = 0;
+ }
+
+ conv.run();
+}
+
+void srcn_batch_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat,
+ convMat_t &out_mat, const convParams_t &in_param,
+ const float *winograd_weight, int num_threads, convType_t conv_type)
+{
+ int winograd_flag = (winograd_weight != NULL);
+
+ if (winograd_flag)
+ {
+ if (num_threads > 1)
+ {
+ omp_set_num_threads(num_threads);
+ const int batch = in_mat.n;
+ const int npart = (batch + num_threads - 1) / num_threads;
+ const int nn = (batch + npart - 1) / npart;
+ const int rn = batch % npart;
+
+#pragma omp parallel for
+ for (int i = 0; i < nn; i++)
+ {
+ const int pn = (i != nn - 1 || rn == 0) ? npart : rn;
+ convMat_t in_mat_part = {in_mat.w, in_mat.h, in_mat.c, pn,
+ in_mat.data + i * npart * in_mat.w * in_mat.h * in_mat.c};
+ convMat_t out_mat_part = {out_mat.w, out_mat.h, out_mat.c, pn,
+ out_mat.data + i * npart * out_mat.w * out_mat.h * out_mat.c};
+
+ class conv_winograd_batch conv(in_mat_part, out_mat_part, in_param, conv_type,
+ winograd_weight, num_threads);
+ conv.run();
+ }
+ }
+ else
+ {
+ class conv_winograd_batch conv(in_mat, out_mat, in_param, conv_type, winograd_weight,
+ num_threads);
+ conv.run();
+ }
+ }
+ else
+ {
+ if (num_threads == 1)
+ {
+ class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type);
+ conv.run();
+ }
+ else
+ {
+ class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads,
+ conv_type);
+ conv.run();
+ }
+ }
+}
+
+} // namespace srcn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/winograd.h b/compute/ncnn/src/srcn/winograd.h
new file mode 100644
index 000000000..5ad8f1126
--- /dev/null
+++ b/compute/ncnn/src/srcn/winograd.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_WINOGRAD_H__
+#define __NNFW_SRCN_WINOGRAD_H__
+
+namespace nnfw
+{
+namespace srcn
+{
+
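+/* Winograd transform parameters in the usual F(m x m, r x r) formulation (Lavin & Gray):
+ *   winograd_para_3x3s1   : M = 6, N = 3  ->  F(4x4, 3x3), 6x6 input tile, 4x4 output tile
+ *   winograd_para_3x3s1_2 : M = 4, N = 3  ->  F(2x2, 3x3)
+ *   winograd_para_5x5s1   : M = 8, N = 5  ->  F(4x4, 5x5)
+ * G (M x N) transforms the weights, B (M x M) the input tiles, and A (M x (M - N + 1)) the
+ * accumulated products, roughly Y = A^T [(G g G^T) . (B^T d B)] A with "." element-wise.
+ * The weight transform itself is applied as (G (x) G) * vec(g); see kronecker_product()
+ * below and trans_weight2winograd() in srcn_conv.cc. */
+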
+struct winograd_para_3x3s1
+{
+ static const int M = 3 + 4 - 1;
+ static const int N = 3;
+
+ static const double *getG()
+ {
+ static const double G[M * N] = {
+ 1. / 4., 0, 0, -1. / 6., -1. / 6., -1. / 6., -1. / 6., 1. / 6., -1. / 6.,
+ 1. / 24., 1. / 12., 1. / 6., 1. / 24., -1. / 12., 1. / 6., 0, 0, 1,
+ };
+ return G;
+ }
+
+ static const double *getA()
+ {
+ static const double A[M * (M - N + 1)] = {
+ 1, 0, 0, 0, 1, 1, 1, 1, 1, -1, 1, -1, 1, 2, 4, 8, 1, -2, 4, -8, 0, 0, 0, 1,
+ };
+ return A;
+ }
+
+ static const double *getB()
+ {
+ static const double B[M * M] = {
+ 4, 0, 0, 0, 0, 0, 0, -4, 4, -2, 2, 4, -5, -4, -4, -1, -1, 0,
+ 0, 1, -1, 2, -2, -5, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
+ };
+ return B;
+ };
+};
+
+struct winograd_para_3x3s1_2
+{
+ static const int M = 3 + 2 - 1;
+ static const int N = 3;
+
+ static const double *getG()
+ {
+ static const double G[M * N] = {
+ 1, 0, 0, 1. / 2., 1. / 2., 1. / 2., 1. / 2., -1. / 2., 1. / 2., 0, 0, 1,
+ };
+ return G;
+ }
+
+ static const double *getA()
+ {
+ static const double A[M * (M - N + 1)] = {
+ 1, 0, 1, 1, 1, -1, 0, 1,
+ };
+ return A;
+ }
+
+ static const double *getB()
+ {
+ static const double B[M * M] = {
+ 1, 0, 0, 0, 0, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1,
+ };
+ return B;
+ };
+};
+
+struct winograd_para_5x5s1
+{
+ static const int M = 5 + 4 - 1;
+ static const int N = 5;
+
+ static const double *getG()
+ {
+ static const double G[M * N] = {
+ 1, 0, 0, 0, 0, -2. / 9., -2. / 9., -2. / 9.,
+ -2. / 9., -2. / 9., -2. / 9., 2. / 9., -2. / 9., 2. / 9., -2. / 9., 1. / 90.,
+ 1. / 45., 2. / 45., 4. / 45., 8. / 45., 1. / 90., -1. / 45., 2. / 45., -4. / 45.,
+ 8. / 45., 4. / 45., 2. / 45., 1. / 45., 1. / 90., 1. / 180., 4. / 45., -2. / 45.,
+ 1. / 45., -1. / 90., 1. / 180., 0, 0, 0, 0, 1,
+ };
+ return G;
+ }
+
+ static const double *getA()
+ {
+ static const double A[M * (M - N + 1)] = {1, 0, 0, 0, 1, 1, 1, 1, 1, -1, 1, -1, 1, 2, 4, 8,
+ 1, -2, 4, -8, 8, 4, 2, 1, 8, -4, 2, -1, 0, 0, 0, 1};
+ return A;
+ }
+
+ static const double *getB()
+ {
+ static const double B[M * M] = {
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ -1, 1. / 2, -1. / 2, 2, -2, -1, -21. / 4, 1, 1, 1. / 4,
+ 1. / 4, 4, 4, 0, 0, -17. / 4, 17. / 4, -5. / 2, 5. / 2, -5. / 2,
+ 5. / 2, 21. / 4, 21. / 4, -17. / 4, -17. / 4, -5. / 4, -5. / 4, -5, -5, 0,
+ 0, 1, -1, 2, -2, 1. / 2, -1. / 2, -21. / 4, -1, 1,
+ 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1,
+ };
+ return B;
+ }
+};
+
+static void kronecker_product(float *out, const double *in1, const double *in2, int m, int n, int p,
+ int q)
+{
+ for (int i = 0; i < m; ++i)
+ {
+ for (int j = 0; j < n; ++j)
+ {
+ for (int k = 0; k < p; ++k)
+ {
+ for (int l = 0; l < q; ++l)
+ {
+ out[(p * i + k) * n * q + q * j + l] = in1[n * i + j] * in2[k * q + l];
+          /* computed in double precision, then stored back as float for accuracy */
+ }
+ }
+ }
+ }
+}
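+
+/* A small worked example of kronecker_product() (values chosen arbitrarily):
+ * with in1 = [[1, 2], [3, 4]] (m = n = 2) and in2 = [[0, 5], [6, 7]] (p = q = 2),
+ * out is the 4x4 block matrix [[1*in2, 2*in2], [3*in2, 4*in2]], stored row-major as
+ *    0  5  0 10
+ *    6  7 12 14
+ *    0 15  0 20
+ *   18 21 24 28
+ */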
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_WINOGRAD_H__