| Field | Value | Date |
|---|---|---|
| author | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:43 +0900 |
| committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:43 +0900 |
| commit | 62529acabbafce7730601ed01d5709d7bc0d378a (patch) | |
| tree | bf6912cfa8fac4a2997292bfcb3c82055734c97e /compute | |
| parent | 6ea13af5257155ff993c205cf997b870cc627f73 (diff) | |
| download | nnfw-62529acabbafce7730601ed01d5709d7bc0d378a.tar.gz nnfw-62529acabbafce7730601ed01d5709d7bc0d378a.tar.bz2 nnfw-62529acabbafce7730601ed01d5709d7bc0d378a.zip | |
Imported Upstream version 1.12.0 (tag: upstream/1.12.0)
Diffstat (limited to 'compute')
137 files changed, 5543 insertions, 2371 deletions
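The hunks below are almost entirely whitespace and comment re-wrapping driven by the newly added `compute/.clang-format` symlink (continuation lines and Doxygen parameter blocks are re-indented and re-aligned); the public interfaces they document are unchanged. For orientation, here is a minimal usage sketch of one of those interfaces, `NEOneHot::configure()`, whose parameter list appears verbatim in the diff. The tensor shapes, data types, and the `main()` scaffold are illustrative assumptions, not code taken from the repository.

```cpp
// Minimal sketch (assumed scaffold): configuring and running NEOneHot, whose
// configure() signature is documented in the NEOneHot/NEOneHotKernel hunks below.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
  // Illustrative shapes: a rank-1 tensor of 4 indices, a scalar depth,
  // and scalar on/off values (types per the documented U32/S32 and F32 support).
  Tensor indices, depth, on_value, off_value, output;
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
  depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));
  on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

  // Parameter order as documented in the diff:
  // indices, depth, on_value, off_value, output, axis (defaults to -1).
  NEOneHot one_hot;
  one_hot.configure(&indices, &depth, &on_value, &off_value, &output, /*axis=*/-1);

  indices.allocator()->allocate();
  depth.allocator()->allocate();
  on_value.allocator()->allocate();
  off_value.allocator()->allocate();
  output.allocator()->allocate();

  // ... fill the index/depth/on/off buffers, then execute the function ...
  one_hot.run();
  return 0;
}
```

The same parameter contract is repeated on `NEOneHotKernel::configure()` further down, so the sketch transfers to the kernel-level API directly; the CL-side functions touched by this import follow the usual configure-then-run pattern as well.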
diff --git a/compute/.clang-format b/compute/.clang-format new file mode 120000 index 000000000..0ff66f331 --- /dev/null +++ b/compute/.clang-format @@ -0,0 +1 @@ +../.clang-format.8
\ No newline at end of file diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index d29886a9d..4a3717885 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -255,14 +255,14 @@ private: cl::Device _device; /**< Underlying CL device. */ std::string _kernel_path; /**< Path to the kernels folder. */ mutable std::map<std::string, const Program> - _programs_map; /**< Map with all already loaded program data. */ + _programs_map; /**< Map with all already loaded program data. */ mutable std::map<std::string, cl::Program> - _built_programs_map; /**< Map with all already built program data. */ + _built_programs_map; /**< Map with all already built program data. */ static const std::map<std::string, std::string> - _kernel_program_map; /**< Map that associates kernel names with programs. */ + _kernel_program_map; /**< Map that associates kernel names with programs. */ static const std::map<std::string, std::string> - _program_source_map; /**< Contains sources for all programs. - Used for compile-time kernel inclusion. >*/ + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. >*/ }; } #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h index a614d5259..fb689f747 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -54,8 +54,8 @@ namespace arm_compute class ICLTensor; /** -* @brief Class to perform EmbeddingLookup operation with opencl kernel -*/ + * @brief Class to perform EmbeddingLookup operation with opencl kernel + */ class CLEmbeddingLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h index 99cfa61ec..96f830898 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -55,8 +55,8 @@ namespace arm_compute class ICLTensor; /** -* @brief Class to perform HashtableLookup operation with opencl kernel -*/ + * @brief Class to perform HashtableLookup operation with opencl kernel + */ class CLHashtableLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h index 99bb351bc..963d7b821 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -68,34 +68,37 @@ public: const char *name() const override { return "NEOneHotKernel"; } /** Initialise the kernel's inputs and outputs * - * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up to - * 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor. 
Supported tensor rank: only 1. Data type supported: Same - * as @p on_value - * @param[out] output Destination tensor. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. + * Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) */ void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis = -1); /** Static function to check if given info will lead to a valid configuration of @ref - * NEOneHotKernel + * NEOneHotKernel * - * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: - * up to 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: - * Same as @p on_value - * @param[out] output Destination tensor info. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * The value must be in range [-indices.rank , indices.rank) * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h index 1e69f0912..2aaab6b3a 100644 --- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -72,10 +72,10 @@ namespace shape_calculator * @return the calculated shape */ inline TensorShape compute_transposeconv_upsampled_shape( - const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, - std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, - unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, - unsigned int &pad_top, unsigned int &pad_bottom) + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) { unsigned int sx = info.stride().first; unsigned int sy = info.stride().second; @@ -103,7 +103,7 @@ inline TensorShape compute_transposeconv_upsampled_shape( unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; unsigned int pady_all_except_invallid = - pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right; pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); @@ -135,7 +135,7 @@ compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> & const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); TensorShape out_shape{input_shape}; @@ -160,7 +160,7 @@ inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int idx_channel = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); TensorShape output_shape{input->tensor_shape()}; output_shape.set(idx_width, input->dimension(idx_width) * block); diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h index 409eaf593..026209f69 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h @@ -106,22 +106,24 @@ public: CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; /** Set the input, weights, biases and output tensors. 
* - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this - * is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[out] output Output tensor. + * The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -130,23 +132,24 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. * @param[in] bias (Optional) The biases have one dimension. * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * the @p input. 
* @param[in] info Contains padding and policies to be used in the deconvolution, - * this is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -154,24 +157,26 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLDirectTransposeConvLayer + * CLDirectTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for input - * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped + * with @ref CLWeightsReshapeKernel. 
* * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index e65a646dc..f27e9913e 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -216,7 +216,7 @@ private: CLConvertFullyConnectedWeights _convert_weights; weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed; weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged - _reshape_weights_managed_function; + _reshape_weights_managed_function; CLFlattenLayer _flatten_layer; CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; CLGEMM _mm_gemm; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h index 289ab167f..bdb168664 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) - : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, - _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h index b01ec4255..167554c9e 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -66,7 +66,7 @@ public: * @param[out] output The output tensor, Data types supported: same as @p input. * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 * @return N/A - */ + */ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); /** diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h index 5fb102e47..5b27d362a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -63,20 +63,22 @@ public: /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same - * as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this - * is described in @ref PadStrideInfo. 
- * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -85,22 +87,22 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. Data types supported: - * QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, - * this is described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -108,22 +110,24 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayer + * CLTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as - * @p input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is - * described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. 
* * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h index 18cb61bf9..e34b4dcb0 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) - : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), - _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h index b2ea6270f..1a68f801a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -66,19 +66,20 @@ public: void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis = -1); /** Static function to check if given info will lead to a valid configuration of @ref - * NEOneHotKernel + * NEOneHotKernel * - * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: - * up to 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: - * Same as @p on_value - * @param[out] output Destination tensor info. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * The value must be in range [-indices.rank , indices.rank) * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 24ff5dac9..7a08dae97 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -110,39 +110,42 @@ public: /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type - * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 - * for F16 input. - * @param[out] output Output tensor. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias Optional, ignored if NULL. The biases have one dimension. + * Data type supported: Data types supported: S32 for QASYMM8 and + * QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. * */ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom); /** Static function to check if given info will lead to a valid configuration of @ref - * NETransposeConvLayer + * NETransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types - * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. 
- * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, + * F32 for F32 input, F16 for F16 input. + * @param[in] output Output tensor info. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] innvalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. * * @return a status */ diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 81d0cb70f..1a8ff3e71 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -54,123 +54,123 @@ using namespace arm_compute; const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { - // ARMComputeEx kernels - {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, - {"binary_logical_op", "binary_logical_op.cl"}, - {"cast_bool", "cast.cl"}, - {"embedding_lookup", "embedding_lookup.cl"}, - {"gather_ex", "gather_ex.cl"}, - {"gather_ex_1d", "gather_ex.cl"}, - {"gather_ex_1d_out", "gather_ex.cl"}, - {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, - {"hashtable_lookup", "hashtable_lookup.cl"}, - {"instance_normalization_ex", "instance_normalization_ex.cl"}, - {"multiply_scale_factor", "multiply_scale_factor.cl"}, - {"neg_tensor", "neg_tensor.cl"}, - {"one_hot", "one_hot.cl"}, - {"one_hot_only_on_value", "one_hot.cl"}, - {"quantization_symm8", "quantization_symm8.cl"}, - {"reduce_min_max", "reduce_operation.cl"}, - {"reduce_sum_mean", "reduce_operation.cl"}, - {"topkv2_init", "topkv2.cl"}, - {"topkv2_find_first_negative", "topkv2.cl"}, - {"topkv2_reorder_negatives", "topkv2.cl"}, - {"topkv2_store", "topkv2.cl"}, - {"radixsort_histogram", "topkv2_radixsort.cl"}, - {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, - {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, - {"radixsort_reorder", "topkv2_radixsort.cl"}, - {"topkv2_quicksort", "topkv2_quicksort.cl"}, - {"scale_factor_symm8", "scale_factor.cl"}, + // ARMComputeEx kernels + {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast_bool", "cast.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", 
"neg_tensor.cl"}, + {"one_hot", "one_hot.cl"}, + {"one_hot_only_on_value", "one_hot.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, }; const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS - { - "arg_min_max_ex.cl", + { + "arg_min_max_ex.cl", #include "./cl_kernels/arg_min_max_ex.clembed" - }, - { - "cast.cl", + }, + { + "cast.cl", #include "./cl_kernels/cast.clembed" - }, - { - "embedding_lookup.cl", + }, + { + "embedding_lookup.cl", #include "./cl_kernels/embedding_lookup.clembed" - }, - { - "gather_ex.cl", + }, + { + "gather_ex.cl", #include "./cl_kernels/gather_ex.clembed" - }, - { - "gemmlowp_ex.cl", + }, + { + "gemmlowp_ex.cl", #include "./cl_kernels/gemmlowp_ex.clembed" - }, - { - "hashtable_lookup.cl", + }, + { + "hashtable_lookup.cl", #include "./cl_kernels/hashtable_lookup.clembed" - }, - { - "helpers.h", + }, + { + "helpers.h", #include "./cl_kernels/helpers.hembed" - }, - { - "helpers_asymm.h", + }, + { + "helpers_asymm.h", #include "./cl_kernels/helpers_asymm.hembed" - }, - { - "instance_normalization_ex.cl", + }, + { + "instance_normalization_ex.cl", #include "./cl_kernels/instance_normalization_ex.clembed" - }, - { - "binary_logical_op.cl", + }, + { + "binary_logical_op.cl", #include "./cl_kernels/binary_logical_op.clembed" - }, - { - "multiply_scale_factor.cl", + }, + { + "multiply_scale_factor.cl", #include "./cl_kernels/multiply_scale_factor.clembed" - }, - { - "neg_tensor.cl", + }, + { + "neg_tensor.cl", #include "./cl_kernels/neg_tensor.clembed" - }, - { - "one_hot.cl", + }, + { + "one_hot.cl", #include "./cl_kernels/one_hot.clembed" - }, - { - "quantization_symm8.cl", + }, + { + "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" - }, - { - "reduce_operation.cl", + }, + { + "reduce_operation.cl", #include "./cl_kernels/reduce_operation.clembed" - }, - { - "scale_factor.cl", + }, + { + "scale_factor.cl", #include "./cl_kernels/scale_factor.clembed" - }, - { - "topkv2.cl", + }, + { + "topkv2.cl", #include "./cl_kernels/topkv2.clembed" - }, - { - "topkv2_radixsort.cl", + }, + { + "topkv2_radixsort.cl", #include "./cl_kernels/topkv2_radixsort.clembed" - }, - { - "topkv2_quicksort.cl", + }, + { + "topkv2_quicksort.cl", #include "./cl_kernels/topkv2_quicksort.clembed" - }, + }, #endif /* EMBEDDED_KERNELS */ }; CLKernelLibraryEx::CLKernelLibraryEx() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the // CLKernelLibraryEx is built @@ -337,8 +337,8 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); ARM_COMPUTE_ERROR_ON_MSG( - err != 0, - "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + err != 0, + 
"clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl index 0a014d15c..135cacf59 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl @@ -119,15 +119,15 @@ inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 < in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 < in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 < in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 < in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -204,15 +204,15 @@ inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 > in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 > in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 > in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 > in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -296,22 +296,21 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), const uint x_idx = get_global_id(0); const uint y_idx = get_global_id(1); const __global DATA_TYPE *src_in_row = - (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + - y_idx * src_step_y); + (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); for (unsigned int y = 0; y < get_local_size(1); ++y) { #if defined(ARG_MAX) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_max_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); #else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) #else // defined(ARG_MIN) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_min_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_min_prev_out(src_in_row, 
(__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); #else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) @@ -334,12 +333,12 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); #if defined(ARG_MAX) condition_check3 = - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3); #else // defined(ARG_MIN) local_results[lid] = select( - local_results[lid], local_results[lid + i], - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); + local_results[lid], local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); #endif // defined(ARG_MAX) || defined(ARG_MIN) } barrier(CLK_LOCAL_MEM_FENCE); @@ -403,7 +402,7 @@ __kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output) { VEC_DATA_TYPE(DATA_TYPE, 16) in = - CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); + CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl index e249663bc..f8b5bbeb8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -111,14 +111,14 @@ __kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATI #if OP_CODE == 1 // LOGICAL AND VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) && - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); #elif OP_CODE == 2 // LOGICAL OR VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) || - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl index 92e5dfbee..5ebc78d23 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -117,15 +117,15 @@ __kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION // lookup ids for based on the tensor dimensions int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? 
*((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl index 80ba73d1d..85fc09de4 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \ - defined(COLS_A) + defined(COLS_A) #define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X) @@ -117,7 +117,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( , uint dst_cross_plane_pad #endif // REINTERPRET_OUTPUT_AS_3D - ) +) { int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; @@ -208,9 +208,9 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( - 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); + 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0; @@ -251,7 +251,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl index a4f7dbd48..3ace1fde8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -115,15 +115,15 @@ __kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? 
*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; if (lup_id[NUM_DIMS - 1] < 0) { diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index e07a25ec9..4a3bc1369 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -49,7 +49,7 @@ #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ - defined(cl_arm_integer_dot_product_accumulate_int8) + defined(cl_arm_integer_dot_product_accumulate_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && // defined(cl_arm_integer_dot_product_accumulate_int8) @@ -288,21 +288,21 @@ #define VECTOR_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ - uint name##_offset_first_element_in_bytes + uint name##_offset_first_element_in_bytes #define IMAGE_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_offset_first_element_in_bytes #define TENSOR3D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ - uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ + uint name##_offset_first_element_in_bytes #define TENSOR4D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ - uint name##_step_w, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ @@ -406,9 +406,9 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ uint stride_x, uint step_x) { Vector vector = { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, }; vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; return vector; @@ -436,7 +436,7 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el .stride_x = stride_x, .stride_y = stride_y}; img.ptr += - img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index 5f1b3f902..d7f1d0814 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -100,16 +100,16 @@ inline float 
dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(type, size) \ - quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ - VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = \ + CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -119,11 +119,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) \ - dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -134,7 +134,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ - VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ { \ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ @@ -152,32 +152,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Product of two fixed-point numbers. 
*/ -#define ASYMM_MULT_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(int, size) \ - overflow = a == b && a == INT_MIN; \ - VEC_DATA_TYPE(long, size) \ - a_64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b_64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - ab_64 = a_64 * b_64; \ - /* Revert COMPMID-907 */ \ - VEC_DATA_TYPE(long, size) \ - mask1 = 1 << 30; \ - VEC_DATA_TYPE(long, size) \ - mask2 = 1 - (1 << 30); \ - VEC_DATA_TYPE(long, size) \ - is_positive_or_zero = ab_64 >= 0; \ - VEC_DATA_TYPE(long, size) \ - nudge = select(mask2, mask1, is_positive_or_zero); \ - VEC_DATA_TYPE(long, size) \ - mask = 1ll << 31; \ - VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ - return select(ab_x2_high32, INT_MAX, overflow); \ +#define ASYMM_MULT_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(int, size) \ + overflow = a == b && a == INT_MIN; \ + VEC_DATA_TYPE(long, size) \ + a_64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ + VEC_DATA_TYPE(int, size) \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + return select(ab_x2_high32, INT_MAX, overflow); \ } /** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). @@ -186,32 +186,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ - a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = \ - ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + \ - ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ + a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = \ + ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + \ + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ } /** Each bit of the result is set to the corresponding bit of either then_val or @@ -263,15 +263,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define EXP_BARREL_SHIFTER_IMPL(size) \ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ - VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ - int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ { \ if (k_integer_bits > exponent) \ { \ const int k_shift_amount = k_integer_bits > exponent ? 
k_fractional_bits + exponent : 0; \ return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ + ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ } \ \ return result; \ @@ -285,7 +285,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ { \ const int k_fractional_bits = 31 - k_integer_bits; \ VEC_DATA_TYPE(int, size) \ @@ -298,7 +298,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ VEC_DATA_TYPE(int, size) \ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ - a_mod_quarter_minus_one_quarter_scaled, size); \ + a_mod_quarter_minus_one_quarter_scaled, size); \ VEC_DATA_TYPE(int, size) \ remainder = a_mod_quarter_minus_one_quarter - a; \ \ @@ -312,10 +312,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) remainder, size); \ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ - size); \ result = \ - EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = \ + EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ \ if (k_integer_bits > 5) \ { \ @@ -335,27 +335,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Arithmetic left or right shift. 
*/ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if (exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ } /** Calculates (a+b)/2, rounded to the nearest integer. @@ -365,21 +365,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return (a+b)/2, rounded to the nearest integer. */ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, sum >= 0); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). 
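[Editorial note, not part of the commit] The helpers_asymm.h hunks in this import are whitespace-only: clang-format re-indents the fixed-point macros without changing their arithmetic. For readers who do not want to unpick the backslash continuations, the scalar C++ sketch below restates what the visible macro bodies of QUANTIZE_IMPL, DEQUANTIZE_IMPL, ASYMM_MULT and MULTIPLY_BY_QUANTIZED_MULTIPLIER compute. All names here are illustrative, the body of rounding_divide_by_pow2 is an assumption (the full ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL body is not shown in these hunks), and none of this code is part of the diff itself.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Affine quantize/dequantize, scalar counterparts of QUANTIZE_IMPL / DEQUANTIZE_IMPL:
// q = round_to_nearest_even(x / scale + offset), saturated to the 8-bit range.
inline int8_t quantize_scalar(float x, float offset, float scale)
{
  const float q_f32 = x / scale + offset;                       // out_f32 in the macro
  const int rounded = static_cast<int>(std::lrintf(q_f32));     // CONVERT_DOWN_RTE
  return static_cast<int8_t>(std::max(-128, std::min(127, rounded))); // CONVERT_SAT
}

inline float dequantize_scalar(int8_t q, float offset, float scale)
{
  return (static_cast<float>(q) - offset) * scale;
}

// ASYMM_MULT: round((a * b) / 2^31), saturating when both inputs are INT32_MIN.
inline int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
  const bool overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int64_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
  const int32_t high = static_cast<int32_t>((ab + nudge) / (int64_t{1} << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : high;
}

// Assumed behaviour of asymm_rounding_divide_by_POW2: divide by 2^exponent,
// rounding to nearest with ties away from zero.
inline int32_t rounding_divide_by_pow2(int32_t x, int32_t exponent)
{
  const int32_t mask = (int32_t{1} << exponent) - 1;
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// multiply_by_quantized_multiplier: shift left first when shift > 0, otherwise
// multiply and then shift right with rounding, exactly as in the macro body.
inline int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int shift)
{
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  return rounding_divide_by_pow2(
    saturating_rounding_doubling_high_mul(input * (1 << left_shift), qmul), right_shift);
}

These are the fixed-point primitives the quantized CL kernels later in this diff rely on to requantize int32 accumulators back to 8-bit outputs; the reformatting below leaves all of them functionally untouched.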
@@ -390,7 +390,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ { \ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ @@ -462,14 +462,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ - right_shift, size); \ +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ } #define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ multiply_by_quantized_multiplier##size(input, qmul, shift) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl index 014842680..96a243110 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ - defined(DIM_Y) && defined(DIM_Z) + defined(DIM_Y) && defined(DIM_Z) /** This function normalizes the input 2D tensor across the first dimension with respect to mean and * standard deviation of the same dimension. 
* @@ -108,14 +108,14 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output) #endif /* IN_PLACE */ #ifdef GAMMA - , + , VECTOR_DECLARATION(gamma) #endif // GAMMA #ifdef BETA - , + , VECTOR_DECLARATION(beta) #endif // BETA - ) +) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); #ifndef IN_PLACE @@ -213,12 +213,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (int i_h = 0; i_h < DIM_Z; ++i_h) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } @@ -231,12 +231,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -251,12 +251,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x < DIM_X; ++x) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl index 3943fc4c2..abbfbd275 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl @@ -114,8 +114,8 @@ __kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION (val, 0, (__global DATA_TYPE *)output.ptr); #else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE *)(output.ptr)) = - ((DATA_TYPE)(*((__global int *)(input.ptr)))) * - *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); + ((DATA_TYPE)(*((__global int *)(input.ptr)))) * + *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl index c274aba62..784a8d6aa 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl @@ -206,16 +206,16 @@ __kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLAR #if AXIS == 0 *(__global 
DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 1 *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 2 *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 3 *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #endif // AXIS } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index 76fda9041..532000e9e 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -138,7 +138,7 @@ __kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARAT // Multiply with a multiplier smaller than 1 out_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl index 4ae9adb0b..c829f264d 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl @@ -116,7 +116,7 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc // Create scale vector const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = - *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); + *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); // Quantize VEC_DATA_TYPE(int, VEC_SIZE) @@ -127,10 +127,10 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); #else //! 
defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP( - CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / - (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), - int), - MIN_QUANT_VAL, MAX_QUANT_VAL); + CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / + (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), + int), + MIN_QUANT_VAL, MAX_QUANT_VAL); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } #endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl index 832ac1270..d0ef31b20 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -100,12 +100,14 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(o Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE value = - *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); for (int i = 1; i < dim; ++i) { indices[axis] = i; @@ -186,16 +188,18 @@ __kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION( Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE sum_value = (DATA_TYPE)0; for (int i = 0; i < dim; ++i) { indices[axis] = i; - sum_value += *( - (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + sum_value += + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); } #if OP_CODE == 3 // REDUCE_SUM diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp index 047004d5e..45307fad7 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -63,10 +63,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, + op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); @@ -101,13 +102,13 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, output_shape.set(axis, 1); DataType 
output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32; auto_init_if_empty(*output, input->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); - Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), - Steps(vector_size)); + Window win = + calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size)); bool window_changed = false; switch (axis) @@ -137,15 +138,15 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, } Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_tuple(err, win); } } // namespace CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx() - : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), - _op(ReductionOperation::ARG_IDX_MAX) + : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), + _op(ReductionOperation::ARG_IDX_MAX) { } @@ -155,11 +156,11 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, - output->info(), axis, op)); + validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, + output->info(), axis, op)); auto win_config = validate_and_configure_window( - input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, - op); + input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, + op); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); _input = input; @@ -213,7 +214,7 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor ARM_COMPUTE_ERROR("Not supported"); } _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( - "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); + "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); // Configure kernel window ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); @@ -225,8 +226,8 @@ Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITenso { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr, - output->clone().get(), axis, op))); + input->clone().get(), (prev_output != nullptr) ? 
prev_output->clone().get() : nullptr, + output->clone().get(), axis, op))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index fbc76f5e1..ffa2c5a67 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -55,7 +55,7 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); @@ -68,15 +68,15 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; } } // namespace CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) + : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -111,13 +111,13 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); const ValidRegion &valid_region = broadcast_pair.second; @@ -130,8 +130,8 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); output_access.set_valid_region(win, valid_region); @@ -151,7 +151,7 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); @@ -160,13 +160,13 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) bool has_collapsed 
= false; Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; Window slice = collapsed.first_slice_window_3D(); Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); @@ -189,9 +189,9 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) BorderSize CLBinaryLogicalOpKernel::border_size() const { const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); return BorderSize(0, border, 0, 0); } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp index 6e0bcde7f..3f2ae357d 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -103,7 +103,7 @@ void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output) // Create kernel const std::string kernel_name = "cast_bool"; _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); // Configure kernel ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index 67aaf2db6..e4c617c8d 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -61,14 +61,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) + : _input(nullptr), _output(nullptr), _lookups(nullptr) { } @@ -77,8 +77,8 @@ Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -108,8 +108,8 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 3bfe3e407..8b5885225 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -62,15 +62,15 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis); + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -86,7 +86,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); std::unique_ptr<ITensorInfo> output_info = input->clone(); output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis)); + input->tensor_shape(), indices->tensor_shape(), actual_axis)); // Output auto initialization if not yet initialized auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); @@ -100,7 +100,7 @@ std::pair<Status, Window> 
validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLGatherExKernel::CLGatherExKernel() - : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { } @@ -109,11 +109,11 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), indices->info(), output->info(), axis)); + validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window auto win_config = - validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); _input = input; @@ -133,7 +133,7 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); ICLKernel::configure_internal(win_config.second); } @@ -144,7 +144,7 @@ Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis) - .first); + .first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index 930e7c944..f0a761b97 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -61,8 +61,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace @@ -78,8 +78,8 @@ Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); @@ -102,7 +102,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -113,7 +113,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso // Make _lookup_indices tensor _lookup_indices = support::cpp14::make_unique<CLTensor>(); _lookup_indices->allocator()->init( - TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); // Set kernel build options @@ -127,8 +127,8 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -148,7 +148,7 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) // Set values of hits const int32_t *lookups_buf = - reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index 61c14d271..dab6480b2 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -94,8 +94,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() - : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), - 
_run_in_place(false) + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) { } @@ -132,7 +132,7 @@ void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info()); @@ -147,7 +147,7 @@ Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index 6b27c9917..1d4b141a7 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -99,7 +99,7 @@ std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *inpu } // namespace CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -108,7 +108,7 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -123,9 +123,9 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen Window win = calculate_max_window(*output->info()); if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -134,11 +134,11 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + multi_access_x, "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); } Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, @@ -147,7 +147,7 @@ Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, 
scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index 643c8b110..ee633d437 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -80,9 +80,9 @@ void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) std::set<std::string> build_opts; build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); // Configure window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp index 35d70d689..0b8e7cc41 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp @@ -65,7 +65,7 @@ inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo * { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( - indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); + indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } return Status{}; @@ -79,7 +79,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices, const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions())); // Output auto initialization if not yet initialized TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( - indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); + indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); auto_init_if_empty((*output), output_shape, 1, on_value->data_type()); // Create window Window win = calculate_max_window(*output, Steps()); @@ -88,8 +88,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices, } } // namespace CLOneHotKernel::CLOneHotKernel() - : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), - _is_off_value_memset(false) + : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), + _is_off_value_memset(false) { } void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value, @@ -114,10 +114,10 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor ICLTensor *output, int depth, int axis) { ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis)); + validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis)); // Configure kernel window auto win_config = - 
validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis); + validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); if (_is_off_value_memset) { @@ -131,7 +131,7 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size( - data_size_from_type(on_value->info()->data_type()))); + data_size_from_type(on_value->info()->data_type()))); build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth)); build_opts.add_option("-DOUTPUT_DIM_Z=" + @@ -139,7 +139,7 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor // Create kernel const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot"; _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); ICLKernel::configure_internal(win_config.second); } Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, @@ -153,7 +153,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), on_value->clone().get(), output->clone().get(), depth, axis) - .first); + .first); return Status{}; } Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, @@ -163,7 +163,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), on_value->clone().get(), output->clone().get(), depth, axis) - .first); + .first); return Status{}; } void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index 1a7a18cfa..b417a7103 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -87,9 +87,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } Coordinates coord; @@ -101,7 +101,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr) { } @@ -110,7 +110,7 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -132,11 +132,11 @@ void 
CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); + multi_access_x, + "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); } Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, @@ -145,7 +145,7 @@ Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), output->clone().get()).first); + validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index 3fbebf25a..3906009c2 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -145,7 +145,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu // Create kernel _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); // Configure kernel window Window win = calculate_max_window(*output_info, Steps()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index 8d8853c81..4a6374444 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -94,8 +94,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_tuple(err, win); } } // namespace @@ -115,7 +115,7 @@ void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *outp // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); + CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -128,7 +128,7 @@ Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITenso { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp index dfe5d59b0..c88bef6d7 100644 --- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -53,12 +53,12 @@ namespace using namespace arm_compute; template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> void elementwise_op_templ( - const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, - OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, - OutputScalarType *)) + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -88,26 +88,26 @@ void elementwise_op_templ( Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = - reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = - *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, - non_broadcast_input_ptr, broadcast_value, - output_ptr, !is_broadcast_input_2); - for (; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = - (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, - !is_broadcast_input_2 ? 
a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = + (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_value, output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -119,24 +119,23 @@ void elementwise_op_templ( Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = - reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = - reinterpret_cast<const InputScalarType *>(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, - input1_ptr, input2_ptr, output_ptr); - for (; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); + execute_window_loop( + win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); } } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp index 32d7d6237..a8464afce 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -103,8 +103,10 @@ template <BinaryLogicalOperation op> inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) { uint8x16x4_t out = {{ - elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), - elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), + elementwise_logic_op<op>(a.val[0], b.val[0]), + elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), + elementwise_logic_op<op>(a.val[3], b.val[3]), }}; return out; } @@ -160,8 +162,8 @@ void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, } std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( - const ITensor *input1, const ITensor *input2, ITensor *output, - std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) + const ITensor *input1, const ITensor *input2, ITensor *output, + 
std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) { std::string function_to_call("op_"); function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; @@ -184,8 +186,8 @@ std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) { static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { - {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, - {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; return configure_func(input1, input2, output, map_function); } @@ -223,7 +225,7 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); const TensorShape out_shape = - TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -232,8 +234,8 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp index 12017e543..f935596e6 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -129,125 +129,125 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) case DataType::S8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8( - texels_u8, vdupq_n_u8(true_val)))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_s8(output_ptr + x, + vreinterpretq_s8_u8(vandq_u8(texels_u8, vdupq_n_u8(true_val)))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S16: { /* Up-conversion U8 -> S16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = 
reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s16(output_ptr + x, texels.val[0]); - vst1q_s16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s16(output_ptr + x, texels.val[0]); + vst1q_s16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = 
static_cast<uint32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(output_ptr + x + 12, - vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); - *(output_ptr + x) = static_cast<float>(in); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); + *(output_ptr + x) = static_cast<float>(in); + } + }, + input, output); break; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -255,86 +255,87 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) { /* Up-conversion U8 -> F16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + int x 
= window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::U8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::U16: { /* Up-conversion U8 -> U16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), - vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; - - vst1q_u16(output_ptr + x, texels.val[0]); - vst1q_u16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), + vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; + + vst1q_u16(output_ptr + x, texels.val[0]); + vst1q_u16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } default: diff --git 
a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp index 091d38c56..e3a77c6b1 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -50,7 +50,7 @@ using namespace arm_compute; NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() - : _input(nullptr), _lookups(nullptr), _output(nullptr) + : _input(nullptr), _lookups(nullptr), _output(nullptr) { } @@ -79,8 +79,8 @@ Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); @@ -119,16 +119,17 @@ void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const int32_t lookup = *reinterpret_cast<int32_t *>( - _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const int32_t lookup = + *reinterpret_cast<int32_t *>(_lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp index 93963a504..c9f0799d4 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -71,7 +71,7 @@ template <typename U> void validate_indices(const ITensor *indices) } // namespace NEGatherKernelEx::NEGatherKernelEx() - : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} + : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} { } @@ -85,36 +85,35 @@ inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadIn Iterator output_it(_output, window); execute_window_loop( - window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); - break; - case 2: - new_index = - *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); - break; - case 3: - new_index = *( - reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(0, new_index); - - 
std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); } template <typename U> @@ -130,37 +129,36 @@ void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &inf Iterator output_it(_output, output_window); execute_window_loop( - output_window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank, _axis); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); - break; - case 2: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); - break; - case 3: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(_axis, new_index); - - std::copy_n(_input->ptr_to_element(gather_id), - _input->info()->dimension(0) * _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank, _axis); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr()); + }, + output_it); } void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, @@ -170,8 +168,8 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); _input = input; _indices = indices; @@ -217,7 +215,7 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I } // Output auto initialization if not yet initialized TensorShape 
output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); // Create window @@ -243,15 +241,15 @@ Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), axis); + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp index 30787c0a4..52b40e767 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -57,7 +57,7 @@ constexpr size_t NOT_HIT = 0xFFFFFFFF; } // namespace NEHashtableLookupKernel::NEHashtableLookupKernel() - : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} { } @@ -66,7 +66,7 @@ void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *k { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -92,8 +92,8 @@ Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); @@ -134,8 +134,8 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) const size_t lookup_dim = _output->info()->num_dimensions() - 1; const int const_0 = _output->info()->data_type() == DataType::QASYMM8 - ? _output->info()->quantization_info().uniform().offset - : 0; + ? 
_output->info()->quantization_info().uniform().offset + : 0; std::unordered_map<int32_t, size_t> key_index_map; for (size_t n = 0; n < _keys->info()->dimension(0); ++n) @@ -174,24 +174,24 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const auto lookup = lookup_indices.at(id[lookup_dim]); - if (lookup == NOT_HIT) - { - memset(output_it.ptr(), const_0, - _output->info()->dimension(0) * _output->info()->element_size()); - } - else - { - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - } - - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp index 49adf1462..4dc0f5535 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -63,7 +63,7 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma { /** NEON vector tag type. 
*/ using ExactTagType = - typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; // Clear X/Y dimensions on execution window as we handle the planes manually Window win = window; @@ -73,107 +73,107 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma constexpr int window_step_x = 16 / sizeof(T); const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); const auto channel_idx = - get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); Iterator input_it(input, win); execute_window_loop( - win, - [&](const Coordinates &id) { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast<T>(0.f); - auto sum_squares_h_w = static_cast<T>(0.f); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); - vec_sum_squares_h_w = - wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); - } - - auto vec2_sum_h_w = - wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), - wrapper::vgetlow(vec_sum_squares_h_w)); - for (int i = 0; i < window_step_x / 4; ++i) - { - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - } - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - const auto value = *(input_ptr + x); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - auto gamma_val = 1.0f; - if (gamma != nullptr) - { - gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); - } - const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); - auto beta_val = 0.0f; - if (beta != nullptr) - { - beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); - } - const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); - auto 
output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - vec_val = wrapper::vloadq(input_ptr + x); - vec_val = wrapper::vadd( - wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); - wrapper::vstore(output_ptr + x, vec_val); - } - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; - } - }, - input_plane_it, output_plane_it); - }, - input_it); + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = 
window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, @@ -199,8 +199,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - gamma->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), "Gamma's size must be the same as size of input's channel"); } @@ -208,8 +208,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - beta->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), "Beta's size must be the same as size of input's channel"); } @@ -234,8 +234,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() - : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), - _epsilon(1e-12) + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) { } @@ -251,7 +251,7 @@ void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *ou _epsilon = epsilon; ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); if (_input->info()->data_type() == DataType::F32) { @@ -282,7 +282,7 @@ Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? 
input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp index b92130cec..ad4728175 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -123,15 +123,17 @@ inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) const float32x4_t vscale = vdupq_n_f32(scale); const float32x4x4_t ret = {{ - vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), - vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), }}; return ret; } } // namespace NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -140,7 +142,7 @@ void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -180,25 +182,25 @@ template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &w Iterator output(_output, win_collapsed); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); - scale *= _multiplier; - - const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); - auto output_ptr = reinterpret_cast<T *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = input_ptr[x] * scale; - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); } void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp index 0a11eb509..0daff5c6a 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -101,8 +101,8 @@ bool isOnValue(U index, U depth) } // namespace NEOneHotKernel::NEOneHotKernel() - : 
_indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1}, - _output{nullptr}, _func{} + : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, + _off_value{nullptr}, _axis{-1}, _output{nullptr}, _func{} { } @@ -117,22 +117,22 @@ void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info) Iterator output_it(_output, output_window); const U off_value = *reinterpret_cast<U *>(_off_value->buffer()); execute_window_loop( - output_window, - [&](const Coordinates &id) { - std::fill_n(output_it.ptr(), - _output->info()->dimension(0) * _output->info()->element_size(), off_value); - Coordinates indices_id(id); - indices_id.remove(0); - const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(0, new_index); - std::copy_n(_on_value->buffer(), _output->info()->element_size(), - _output->ptr_to_element(onehot_id)); - } - }, - output_it); + output_window, + [&](const Coordinates &id) { + std::fill_n(output_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size(), + off_value); + Coordinates indices_id(id); + indices_id.remove(0); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(0, new_index); + std::copy_n(_on_value->buffer(), _output->info()->element_size(), + _output->ptr_to_element(onehot_id)); + } + }, + output_it); } template <typename U> @@ -142,22 +142,22 @@ inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo // Validate that the indices are not negative validate_depth<U>(_depth, _output, _axis); Iterator output_it(_output, window); - execute_window_loop(window, - [&](const Coordinates &id) { - Coordinates indices_id(id); - indices_id.remove(_axis); - const U new_index = - *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(_axis, new_index); - std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer() - : _off_value->buffer(), - _output->info()->element_size(), output_it.ptr()); - } - }, - output_it); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates indices_id(id); + indices_id.remove(_axis); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(_axis, new_index); + std::copy_n(static_cast<U>(id[_axis]) == new_index ? 
_on_value->buffer() + : _off_value->buffer(), + _output->info()->element_size(), output_it.ptr()); + } + }, + output_it); } void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth, @@ -215,7 +215,7 @@ Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *d const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR( - validate_arguments(indices, depth, on_value, off_value, output, axis)); + validate_arguments(indices, depth, on_value, off_value, output, axis)); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 5841f1d69..2306228d5 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -107,19 +107,15 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, const int32x4x4_t rf = {{ #ifdef __aarch64__ - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #else //__aarch64__ - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #endif //__aarch64__ }}; const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); @@ -129,7 +125,7 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, } // namespace NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() - : _input(nullptr), _output(nullptr), _scale_factor(nullptr) + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) { } @@ -138,7 +134,7 @@ void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *out { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), output->info(), scale_factor->info())); + validate_arguments(input->info(), output->info(), scale_factor->info())); _input = input; _output = output; @@ -182,40 +178,40 @@ template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window const auto dim_x = 
_input->info()->dimension(0); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - const auto start = reinterpret_cast<const T *>(input.ptr()); - const auto min_max = std::minmax_element(start, start + dim_x); - const auto int8_scale = 127; - auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); - if (range == 0) - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; - range = 1; - } - else - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; - } - const auto scale_factor_inv = int8_scale / range; - - auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], - vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); - quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); - output_ptr[x] = static_cast<int8_t>(quantized); - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); } void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp index 267228eac..b02a48ef2 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -50,8 +50,8 @@ namespace arm_compute { CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), - _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() + : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), + _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() { } @@ -60,13 +60,13 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, + op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const unsigned int num_of_stages = - calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; @@ -76,9 +76,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( - arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, - false)); + const TensorInfo expected_output_shape = + output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape( + input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -87,9 +87,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) { ti.set_data_type(data_type) - .set_tensor_shape(shape) - .set_num_channels(num_channels) - .set_quantization_info(qinfo); + .set_tensor_shape(shape) + .set_num_channels(num_channels) + .set_quantization_info(qinfo); }; initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, @@ -98,7 +98,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (num_of_stages == 1) { ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); } else { @@ -118,19 +118,19 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT // Validate ReductionOperation only on first kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); // Validate ReductionOperation on intermediate stages for (unsigned int i = 1; i < num_of_stages - 1; ++i) { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], - &sums_vector[i], axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); } // Validate ReductionOperation on the last stage const unsigned int last_stage = num_of_stages - 1; ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( - input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); + input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); } ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(¬_reshaped_output, output)); return Status{}; @@ -144,16 +144,16 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * _reduction_axis = axis; const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape( - input->info()->tensor_shape(), axis, false); + input->info()->tensor_shape(), 
axis, false); DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) - ? DataType::S32 - : output->info()->data_type(); + ? DataType::S32 + : output->info()->data_type(); auto_init_if_empty(*output->info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); // Configure reduction operation kernels _reduction_kernels_vector.resize(_num_of_stages); @@ -166,11 +166,11 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); auto_init_if_empty(*_not_reshaped_output.info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _not_reshaped_output.info()->set_tensor_shape(output_shape); _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); } @@ -182,7 +182,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * { shape.set(0, ceil(shape.x() / 128.f)); _results_vector[i].allocator()->init( - input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); + input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); } // Apply ReductionOperation only on first kernel diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 3dede0562..6359b4bcb 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -53,16 +53,10 @@ namespace arm_compute using namespace arm_compute::misc::shape_calculator; CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( - std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _scale_f(), - _conv_f(), - _flip_weights(), - _scaled_output(), - _original_weights(nullptr), - _weights_flipped(), - _flip_axis(), - _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), _flip_weights(), + _scaled_output(), _original_weights(nullptr), _weights_flipped(), _flip_axis(), + _is_prepared(false) { } @@ -74,7 +68,7 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); const DataLayout data_layout = input->data_layout(); @@ -86,8 +80,8 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), - weights->dimension(idx_h), info, invalid_right, 
invalid_bottom); + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); @@ -117,19 +111,19 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape) - .set_data_layout(data_layout)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } @@ -171,22 +165,22 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(idx_w), input->info()->dimension(idx_h), - weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, - invalid_bottom); + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized auto_init_if_empty( - *output->info(), - input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), info, + invalid_right, invalid_bottom)); _is_prepared = weights_info.retain_internal_weights(); @@ -195,8 +189,8 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order // to match output shape const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index 01989461e..79d0929a9 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I ARM_COMPUTE_UNUSED(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -80,12 +80,12 @@ Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), - _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), - _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), - _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), - _original_weights(nullptr) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), + _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), + _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), + _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), + _original_weights(nullptr) { } void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -107,8 +107,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -140,10 +140,10 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen bool is_fc_after_conv = false; if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -158,28 +158,28 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Extract scale factor _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); _memory_group.manage(&_scale_factor); _scale_factor_kernel.configure(input, &_scale_factor); // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _memory_group.manage(&_quantized_input); _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); // GEMMLowp _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); _memory_group.manage(&_gemmlowp_output); configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output, fc_info.retain_internal_weights); @@ -209,15 +209,15 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe const GPUTarget gpu_target = CLScheduler::get().target(); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); + CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); } // With the Fully Connected layer we can have 4 different cases: @@ -247,33 +247,32 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = 
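// The hybrid fully-connected path configured above chains four pieces: a per-batch scale factor,
// a symmetric int8 copy of the activations, an integer GEMM with an S32 accumulator, and a final
// multiply by the scale factors. The standalone sketch below walks one output neuron and one
// batch through that chain; the max|x|/127 mapping and the sample numbers are illustrative
// assumptions, not a quote of the kernels.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  const std::vector<float> input = {0.5f, -1.25f, 2.0f};
  const std::vector<int8_t> weights = {40, -10, 25}; // already symmetrically quantized
  const float weight_scale = 0.02f;                  // real weight = q * weight_scale

  // Per-batch scale factor: map max |x| onto the int8 range.
  float max_abs = 0.f;
  for (float v : input)
    max_abs = std::max(max_abs, std::fabs(v));
  const float input_scale = max_abs / 127.f;

  // Symmetric quantization of the activations (signed 8-bit storage, zero offset).
  std::vector<int8_t> q_input(input.size());
  for (size_t i = 0; i < input.size(); ++i)
  {
    long q = std::lround(input[i] / input_scale);
    q_input[i] = static_cast<int8_t>(std::max(-127L, std::min(127L, q)));
  }

  // Integer accumulation (the GEMMLowp stage, S32 accumulator).
  int32_t acc = 0;
  for (size_t i = 0; i < input.size(); ++i)
    acc += static_cast<int32_t>(q_input[i]) * static_cast<int32_t>(weights[i]);

  // Rescale back to float (the multiply-scale-factor stage).
  const float hybrid = acc * input_scale * weight_scale;

  float reference = 0.f;
  for (size_t i = 0; i < input.size(); ++i)
    reference += input[i] * weights[i] * weight_scale;

  std::printf("hybrid=%f reference=%f\n", hybrid, reference); // the two agree closely
  return 0;
}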
&reshaped_weights; } // Validate Scale factor kernel const ITensorInfo &scale_factor = - TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); + TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); // Validate quantization symm8 kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ON_ERROR( - CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); // Fully Connected layer after a Fully Connected Layer without batches ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate matrix multiply kernel const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); // Multiply scale ARM_COMPUTE_RETURN_ON_ERROR( - CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); + CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 2ff4b9659..13d3acbac 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -79,7 +79,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn int output_multiplier = 0; int output_shift = 0; ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( - multiplier, &output_multiplier, &output_shift)); + multiplier, &output_multiplier, &output_shift)); // Set the GEMMLowp output stage info gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; @@ -99,7 +99,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I { GEMMLowpOutputStageInfo gemmlowp_output_stage; ARM_COMPUTE_RETURN_ON_ERROR( - construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -125,14 +125,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, - gemm_info)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); } else { ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); + CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); } return 
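// construct_gemmlowp_output_stage above turns the real rescale factor
// input_scale * weights_scale / output_scale into an integer multiplier plus a right shift via
// calculate_quantized_multiplier_less_than_one. The sketch below shows the usual way such a
// multiplier in (0, 1) is decomposed into a Q31 fixed-point value and a shift; it is a generic
// illustration of the scheme, not the library function itself.
#include <cmath>
#include <cstdint>
#include <cstdio>

void quantize_multiplier_smaller_than_one(double real, int32_t *quantized, int *right_shift)
{
  int exponent = 0;
  const double q = std::frexp(real, &exponent); // real = q * 2^exponent, q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
  if (q_fixed == (1ll << 31)) // rounding may push q up to exactly 1.0
  {
    q_fixed /= 2;
    ++exponent;
  }
  *quantized = static_cast<int32_t>(q_fixed);
  *right_shift = -exponent; // exponent <= 0 for real < 1, so this is a non-negative shift
}

int main()
{
  const double real_multiplier = 0.0072; // e.g. input_scale * weights_scale / output_scale
  int32_t m = 0;
  int shift = 0;
  quantize_multiplier_smaller_than_one(real_multiplier, &m, &shift);
  std::printf("multiplier=%d right_shift=%d reconstructed=%f\n", m, shift,
              (m / 2147483648.0) / std::pow(2.0, shift));
  return 0;
}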
Status{}; @@ -154,12 +154,12 @@ Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), - _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), - _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), - _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), - _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) + : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), + _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), + _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), + _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), + _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) { } void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -190,9 +190,9 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); @@ -214,8 +214,8 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -223,11 +223,11 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init(input->info() - ->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(shape_flatten) - .set_data_layout(DataLayout::NCHW)); + ->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(shape_flatten) + .set_data_layout(DataLayout::NCHW)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -258,8 +258,8 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; @@ -285,10 +285,10 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -302,7 +302,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor { _reshape_weights_managed_function.configure(weights); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_reshape_weights_managed_function)); + _weights_manager->acquire(weights, &_reshape_weights_managed_function)); } else { @@ -320,7 +320,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(), fc_info.weights_trained_layout); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_convert_weights_managed)); + _weights_manager->acquire(weights, &_convert_weights_managed)); } else { @@ -359,16 +359,16 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_fc_after_conv = true; const ITensorInfo &flatten_input = TensorInfo(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(compute_flatten_shape(input)) - .set_data_layout(DataLayout::NCHW)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(input)) + .set_data_layout(DataLayout::NCHW)); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
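// The std::equal test above (batched branch only) decides whether a batched fully-connected
// input comes straight from a convolution: shapes are stored lowest dimension first, so a conv
// output looks like {W, H, C, batches...} while the FC output is {units, batches...}, and the
// two describe the same batching exactly when input dims from index 3 match output dims from
// index 1. A minimal standalone version of that check (real TensorShapes are fixed-length with
// trailing 1s, so the library does not need the extra length guard used here):
#include <algorithm>
#include <cstdio>
#include <vector>

bool is_fc_after_conv(const std::vector<size_t> &in_shape, const std::vector<size_t> &out_shape)
{
  if (in_shape.size() < 4 || out_shape.size() < 2 ||
      in_shape.size() - 3 != out_shape.size() - 1)
    return false;
  return std::equal(in_shape.begin() + 3, in_shape.end(), out_shape.begin() + 1);
}

int main()
{
  std::printf("%d\n", is_fc_after_conv({7, 7, 64, 8}, {1000, 8})); // 1: conv -> FC, 8 batches
  std::printf("%d\n", is_fc_after_conv({4096, 8}, {1000, 8}));     // 0: FC -> FC, 8 batches
  return 0;
}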
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -396,7 +396,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -404,7 +404,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -412,8 +412,8 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); @@ -427,7 +427,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR( - validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); return Status{}; } @@ -457,7 +457,7 @@ void CLFullyConnectedLayerEx::run() if (_weights_manager && _weights_manager->are_weights_managed(cur_weights)) { _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); } else { diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index 157b4d977..ac6982e6f 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -41,7 +41,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp // reshape auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( - _input->info()->data_layout())); + _input->info()->data_layout())); _cl_reshape.configure(_input, &_cl_buffer); input_to_use = &_cl_buffer; } @@ -57,7 +57,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { bool is_hybrid = (input->info()->data_type() == DataType::F32 || input->info()->data_type() == DataType::F16) && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) @@ -81,7 +81,6 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { throw std::runtime_error("CLFullyConnectedReshapingLayer: 
Unsupported kernel type"); } - }(); if (_needs_reshape) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index 02ee4ad8a..c246041bb 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -46,8 +46,8 @@ using namespace arm_compute; CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), - _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() { } @@ -91,13 +91,13 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * for (size_t i = 0; i < num_of_kernels; ++i, ++it) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); } if (!keep_dims) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); + CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp index a502f032e..12c0aa829 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -134,8 +134,8 @@ void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &ou // Output auto inizialitation if not yet initialized TensorInfo tmp_output_info = *output->info()->clone(); auto_init_if_empty( - tmp_output_info, - input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + tmp_output_info, + input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); // Update coordinate on axis start_coords.set(split_dim, axis_offset); @@ -153,7 +153,7 @@ void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &ou } // namespace CLSplitVEx::CLSplitVEx() - : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() + : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp index 3ac95a8e6..accd51302 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -49,14 +49,14 @@ namespace arm_compute { CLTopKV2::CLTopKV2() - : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), - _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), - _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), - _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), - _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), - _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), - _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), - _reorder_negatives_kernel(), _store_kernel()*/ + : _k(0), 
_total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), _p_out_key_buf(nullptr), + _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index 3215d01a7..0754fd813 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -53,7 +53,7 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : _memory_manager(std::move(memory_manager)), _function() { } @@ -105,20 +105,20 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); switch (CLTransposeConvLayer::get_deconvolution_method( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate direct convolution layer ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); break; } case DeconvolutionMethod::GEMM: { // Validate gemm-based convolution layer ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); break; } default: @@ -130,9 +130,9 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf } DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( - const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, - ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, - unsigned int invalid_bottom, const WeightsInfo &weights_info) + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index a123439d9..e212a03c7 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -58,7 +58,7 @@ namespace Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ON_ERROR( - 
NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -78,11 +78,11 @@ Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), - _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), - _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), - _accumulate_biases(false), _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) { } @@ -103,8 +103,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -132,10 +132,10 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor bool _is_fc_after_conv; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -150,23 +150,23 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); // GEMM _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); configure_mm(&_quantized_input, 
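// The NEON hybrid fully-connected layer configured above keeps one float scale per batch row
// (_scale_factor) next to the int8 copy of the activations (_quantized_input). The sketch below
// shows that per-batch symmetric quantization in plain C++; the max|x|/127 mapping is the usual
// convention for this kind of dynamic-range kernel and is an assumption here rather than a quote
// of NEQuantizationSymmetricKernel.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

void quantize_symmetric_per_batch(const std::vector<std::vector<float>> &input, // [batch][col]
                                  std::vector<std::vector<int8_t>> &q_input,
                                  std::vector<float> &scale_factors)
{
  q_input.assign(input.size(), {});
  scale_factors.assign(input.size(), 0.f);
  for (size_t b = 0; b < input.size(); ++b)
  {
    float max_abs = 0.f;
    for (float v : input[b])
      max_abs = std::max(max_abs, std::fabs(v));
    const float scale = (max_abs > 0.f) ? max_abs / 127.f : 1.f;
    scale_factors[b] = scale;

    q_input[b].resize(input[b].size());
    for (size_t i = 0; i < input[b].size(); ++i)
    {
      long q = std::lround(input[b][i] / scale);
      q_input[b][i] = static_cast<int8_t>(std::max(-127L, std::min(127L, q)));
    }
  }
}

int main()
{
  std::vector<std::vector<int8_t>> q;
  std::vector<float> scales;
  quantize_symmetric_per_batch({{0.5f, -2.0f}, {10.0f, 0.25f}}, q, scales);
  std::printf("scales: %f %f  q[0]: %d %d\n", scales[0], scales[1], q[0][0], q[0][1]);
  return 0;
}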
weights_to_use, &_gemmlowp_output); // Multiply scale @@ -195,8 +195,8 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) @@ -217,7 +217,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -225,20 +225,19 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate quantization kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR( - NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( - &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); + &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index cb7557a5a..a639f2979 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -69,14 +69,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( - &input, &weights, nullptr, &output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + ARM_COMPUTE_RETURN_ON_ERROR( + 
NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); } return Status{}; @@ -84,12 +84,12 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } // namespace NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), - _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), - _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), - _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), - _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), - _accumulate_biases(false), _is_quantized(false), _is_prepared(false) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), + _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), + _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false), + _is_quantized(false), _is_prepared(false) { } @@ -105,9 +105,9 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, nullptr, output); @@ -129,8 +129,8 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen ITensor *output) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -138,8 +138,7 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - shape_flatten)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -169,8 +168,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; @@ -183,8 +182,7 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei if (_is_quantized) { _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); } // Configure accumulate biases kernel for non quantized asymmetric types @@ -208,10 +206,10 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -284,16 +282,16 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const ITensorInfo &flatten_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_flatten_shape(input))); + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr && !is_quantized) @@ -330,7 +328,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -338,7 +336,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -346,8 +344,8 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); @@ -365,7 +363,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (is_quantized) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( - &gemmlowp_output, biases, output)); + &gemmlowp_output, biases, output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index dc6c78478..234c783f9 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -56,7 +56,7 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); bool is_hybrid = input->info()->data_type() == DataType::F32 && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp index 16d74e62d..451aa0997 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -46,9 +46,9 @@ namespace arm_compute { NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), - _permute_input(), _permute_output(), 
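// NEInstanceNormalizationLayerEx normalizes each (sample, channel) plane over its spatial
// values, which is why NHWC inputs are permuted to NCHW around the kernel. With the
// gamma/beta/epsilon parameters the layer takes, the formula is the standard
// y = gamma * (x - mean) / sqrt(var + epsilon) + beta; a one-channel sketch:
#include <cmath>
#include <cstdio>
#include <vector>

// One channel of one sample: "plane" holds the H*W spatial values.
void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
  float mean = 0.f;
  for (float v : plane)
    mean += v;
  mean /= plane.size();

  float var = 0.f;
  for (float v : plane)
    var += (v - mean) * (v - mean);
  var /= plane.size();

  const float inv_std = 1.f / std::sqrt(var + epsilon);
  for (float &v : plane)
    v = gamma * (v - mean) * inv_std + beta;
}

int main()
{
  std::vector<float> plane = {1.f, 2.f, 3.f, 4.f};
  instance_norm_plane(plane, /*gamma=*/1.f, /*beta=*/0.f, /*epsilon=*/1e-5f);
  for (float v : plane)
    std::printf("%f ", v); // roughly -1.34 -0.45 0.45 1.34
  std::printf("\n");
  return 0;
}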
_permuted_input(), _permuted_output() + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() { } @@ -88,8 +88,8 @@ Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const float epsilon) { return NEInstanceNormalizationLayerKernelEx::validate( - &input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); } void NEInstanceNormalizationLayerEx::run() diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp index cb1a26304..c45c335b3 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -49,8 +49,8 @@ using namespace arm_compute; NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -125,7 +125,7 @@ void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_a for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp index 26a887912..b21717e86 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -47,8 +47,8 @@ using namespace arm_compute; NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -122,7 +122,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? 
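// NEReduceOperation and NEReduceSum reduce over several axes one at a time: each step sets the
// current axis to 1 in the intermediate shape (out_shape.set(axis_local[i], 1)), the
// intermediate tensors are chained, and a final reshape drops the size-1 axes when keep_dims is
// false. A small sketch of how the shapes evolve:
#include <cstdio>
#include <vector>

std::vector<std::vector<int>> reduction_shape_chain(std::vector<int> shape,
                                                    const std::vector<int> &axes)
{
  std::vector<std::vector<int>> chain;
  for (int axis : axes)
  {
    shape[axis] = 1; // the reduced axis collapses to a single element
    chain.push_back(shape);
  }
  return chain;
}

int main()
{
  // Shape {2, 3, 4, 5}, reducing over axes 1 and 2.
  for (const auto &s : reduction_shape_chain({2, 3, 4, 5}, {1, 2}))
  {
    for (int d : s)
      std::printf("%d ", d);
    std::printf("\n"); // prints: 2 1 4 5, then 2 1 1 5
  }
  // With keep_dims == false the trailing reshape then produces {2, 5}.
  return 0;
}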
input : (&_reduced_outs[i - 1]); @@ -135,7 +135,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()) - .set_data_layout(input->info()->data_layout())); + .set_data_layout(input->info()->data_layout())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::SUM); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index aa165cc15..50311071b 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -51,17 +51,9 @@ namespace arm_compute { NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _conv_f(), - _upsample_f(), - _flip_weights(), - _scaled_output(), - _weights_flipped(), - _flip_axis(), - _original_weights(nullptr), - _input(nullptr), - _info(), - _is_prepared(false) + : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(), + _scaled_output(), _weights_flipped(), _flip_axis(), _original_weights(nullptr), _input(nullptr), + _info(), _is_prepared(false) { } @@ -76,15 +68,15 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); const unsigned int width_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), - weights->dimension(height_idx), info, invalid_right, invalid_bottom); + input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); if (bias != nullptr) @@ -117,24 +109,24 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info( - input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo 
conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const unsigned int batches_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); const unsigned int channel_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, WeightsInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo())); return Status{}; } @@ -146,21 +138,21 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( - input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, - invalid_right, invalid_bottom); + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); _input = input; _original_weights = weights; @@ -188,8 +180,8 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, DimensionRoundingType::FLOOR); diff --git a/compute/cker/include/cker/CpuBackendThreadpool.h b/compute/cker/include/cker/CpuBackendThreadpool.h new file mode 100644 index 000000000..cc6a9dbfc --- /dev/null +++ b/compute/cker/include/cker/CpuBackendThreadpool.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ +#define __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ + +#include <ruy/context.h> // from @ruy +#include <ruy/thread_pool.h> // from @ruy + +namespace nnfw +{ +namespace cker +{ +namespace cpu_backend_threadpool +{ + +using Task = ruy::Task; + +template <typename TaskType> +void Execute(int tasks_count, TaskType *tasks, ruy::Context *ruy_context) +{ + assert(tasks_count <= ruy_context->max_num_threads()); + ruy_context->mutable_thread_pool()->Execute(tasks_count, tasks); +} + +} // namespace cpu_backend_threadpool +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h index e08040632..8bf0bee03 100644 --- a/compute/cker/include/cker/NeonTensorUtils.h +++ b/compute/cker/include/cker/NeonTensorUtils.h @@ -131,7 +131,7 @@ inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, co const int kWeightsPerUint32 = 4; int8 *shuffled_vectors = reinterpret_cast<int8 *>( - aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); + aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); for (int i = 0; i < n_batch; i += 4) { @@ -145,25 +145,25 @@ inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, co while (unshuffled_vec0_ptr != end_vec0_ptr) { asm volatile( - // This code path requires that (n_cols % 16) == 0 so we can safely - // read in 16-byte chunks from each row. - "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" - "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" - "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" - "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" - - "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" - - : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr), - [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr), - [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr), - [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr), - [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr) - : - : "v0", "v1", "v2", "v3", "cc", "memory"); + // This code path requires that (n_cols % 16) == 0 so we can safely + // read in 16-byte chunks from each row. 
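// The new cker/CpuBackendThreadpool.h above is a thin wrapper over ruy's thread pool: a task is
// anything deriving from ruy::Task (aliased as Task) with a Run() override, and Execute()
// returns only after every task has run. A hedged usage sketch; the include paths and the
// row-range splitting are illustrative assumptions, not code from the patch:
#include <algorithm>
#include <vector>

#include <ruy/context.h>               // from @ruy
#include <cker/CpuBackendThreadpool.h> // the header added above

struct RowRangeTask : public nnfw::cker::cpu_backend_threadpool::Task
{
  RowRangeTask(int begin, int end) : row_begin(begin), row_end(end) {}
  void Run() override
  {
    for (int r = row_begin; r < row_end; ++r)
    {
      // ... process row r ...
    }
  }
  int row_begin;
  int row_end;
};

void run_rows_in_parallel(ruy::Context *ctx, int num_rows)
{
  // Never schedule more tasks than the context allows (Execute asserts on this).
  const int num_tasks = std::min(4, ctx->max_num_threads());
  const int rows_per_task = (num_rows + num_tasks - 1) / num_tasks;

  std::vector<RowRangeTask> tasks;
  for (int t = 0; t < num_tasks; ++t)
    tasks.emplace_back(t * rows_per_task, std::min(num_rows, (t + 1) * rows_per_task));

  nnfw::cker::cpu_backend_threadpool::Execute(static_cast<int>(tasks.size()), tasks.data(), ctx);
}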
+ "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" + "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" + "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" + "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" + + "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" + + : [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr), + [ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr), + [ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr), + [ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr), + [ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr) + : + : "v0", "v1", "v2", "v3", "cc", "memory"); } } @@ -204,104 +204,104 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols); asm volatile( - // Zero out the accumulator registers. - "dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - - "1:\n" // batch_cols_loop - - // Read 16 more bytes from a pair of matrix rows. - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - - // Prefetch two rows ahead. - "prfm pldl1strm, [%[mat_ptr2]]\n" - "prfm pldl1strm, [%[mat_ptr3]]\n" - - // Read from input vectors 4 times; 64 bytes total. - // Each 16-byte register contains parts of 4 vectors; see the - // shuffle logic above. - - // From Benoit, places to look in the future: - // - Move load instructions further from sdot - // - Switch loop use-then-reload - // - Do partial unrolling to use register space better - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - - // Update prefetch pointers. - "add %[mat_ptr2], %[mat_ptr2], #16\n" - "add %[mat_ptr3], %[mat_ptr3], #16\n" - - // Re-use those vectors for the next row as well. - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - - // If we're not done with these rows, continue. - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 1b\n" // batch_cols_loop - - // Done with the rows, sum the results. - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - // Convert the per-vector sums to floating point. - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Fetch scale factors. - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - - // Multiply scale factors times sums. - "fmul v0.4s, v4.4s, v0.4s\n" - "fmul v1.4s, v4.4s, v1.4s\n" - - // Load previous result values. - // The result position is: - // result[batch * m_rows + row] - // Here that is factored into: - // result_ptr = result + row - // *result_ptr = res[0] - // (uint8*)result_ptr += (m_rows * sizeof(float)) - // *result_ptr = res[1] - // ... 
- // Since we're reading two rows at a time, though, we read both - // result[batch * m_rows + row] - // and - // result[batch * m_rows + row + 1] - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - - // Go back to the starting position (subtract wide_rows * 4). - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - - // Add previous result values. - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - - // Store results. - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3) - : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr), - [wide_rows] "r"(wide_rows) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "cc", "memory"); + // Zero out the accumulator registers. + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + + "1:\n" // batch_cols_loop + + // Read 16 more bytes from a pair of matrix rows. + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + + // Prefetch two rows ahead. + "prfm pldl1strm, [%[mat_ptr2]]\n" + "prfm pldl1strm, [%[mat_ptr3]]\n" + + // Read from input vectors 4 times; 64 bytes total. + // Each 16-byte register contains parts of 4 vectors; see the + // shuffle logic above. + + // From Benoit, places to look in the future: + // - Move load instructions further from sdot + // - Switch loop use-then-reload + // - Do partial unrolling to use register space better + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + + // Update prefetch pointers. + "add %[mat_ptr2], %[mat_ptr2], #16\n" + "add %[mat_ptr3], %[mat_ptr3], #16\n" + + // Re-use those vectors for the next row as well. + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + + // If we're not done with these rows, continue. + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 1b\n" // batch_cols_loop + + // Done with the rows, sum the results. + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + // Convert the per-vector sums to floating point. + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Fetch scale factors. + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + + // Multiply scale factors times sums. + "fmul v0.4s, v4.4s, v0.4s\n" + "fmul v1.4s, v4.4s, v1.4s\n" + + // Load previous result values. + // The result position is: + // result[batch * m_rows + row] + // Here that is factored into: + // result_ptr = result + row + // *result_ptr = res[0] + // (uint8*)result_ptr += (m_rows * sizeof(float)) + // *result_ptr = res[1] + // ... 
+ // Since we're reading two rows at a time, though, we read both + // result[batch * m_rows + row] + // and + // result[batch * m_rows + row + 1] + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + + // Go back to the starting position (subtract wide_rows * 4). + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + + // Add previous result values. + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + + // Store results. + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "cc", "memory"); } } @@ -309,9 +309,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr } static void DotprodMatrixBatchFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { void *shuffled_vectors_free; const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); @@ -332,102 +332,102 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( const int32_t *batch_offsets_ptr = input_offset + batch; const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; - asm volatile("dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - // Load zero points. - "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - // Zero out zero point accumulators. - "dup v14.4s, wzr\n" - "dup v15.4s, wzr\n" - - // Load per channel scales if not null. 
- "cmp %w[is_channel_scale_nullptr], #0\n" - "bne 1f\n" - "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" - "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" - "fmul v16.4s, v16.4s, v4.4s\n" - "fmul v17.4s, v17.4s, v4.4s\n" - "b 2f\n" - "1:\n" - "mov v16.16b, v4.16b\n" - "mov v17.16b, v4.16b\n" - "2:\n" - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 3f\n" - // Accumulate row_sums for zero point calculations. - "saddlp v12.8h, v12.16b\n" - "saddlp v13.8h, v13.16b\n" - "sadalp v14.4s, v12.8h\n" - "sadalp v15.4s, v13.8h\n" - "3:\n" - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 2b\n" - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 4f\n" - // Calculate zero point offsets. - "addv s14, v14.4s\n" - "addv s15, v15.4s\n" - "dup v14.4s, v14.s[0]\n" - "dup v15.4s, v15.s[0]\n" - "b 5f\n" - "4:\n" - "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" - "ld1r {v15.4s}, [%[row_sums_ptr]]\n" - "5:\n" - - "mul v14.4s, v14.4s, v7.4s\n" - "mul v15.4s, v15.4s, v7.4s\n" - "sub v0.4s, v0.4s, v14.4s\n" - "sub v2.4s, v2.4s, v15.4s\n" - - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Multiply scale. - "fmul v0.4s, v16.4s, v0.4s\n" - "fmul v1.4s, v17.4s, v1.4s\n" - - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr) - : [mat_ptr0_end] "r"(mat_ptr0_end), - [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows), - [channel_scales_ptr] "r"(channel_scales_ptr), - [batch_offsets_ptr] "r"(batch_offsets_ptr), - [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr), - [is_row_sums_nullptr] "r"(is_row_sums_nullptr) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); + asm volatile( + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + // Load zero points. + "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + // Zero out zero point accumulators. + "dup v14.4s, wzr\n" + "dup v15.4s, wzr\n" + + // Load per channel scales if not null. 
+ "cmp %w[is_channel_scale_nullptr], #0\n" + "bne 1f\n" + "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" + "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "b 2f\n" + "1:\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "2:\n" + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 3f\n" + // Accumulate row_sums for zero point calculations. + "saddlp v12.8h, v12.16b\n" + "saddlp v13.8h, v13.16b\n" + "sadalp v14.4s, v12.8h\n" + "sadalp v15.4s, v13.8h\n" + "3:\n" + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 2b\n" + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 4f\n" + // Calculate zero point offsets. + "addv s14, v14.4s\n" + "addv s15, v15.4s\n" + "dup v14.4s, v14.s[0]\n" + "dup v15.4s, v15.s[0]\n" + "b 5f\n" + "4:\n" + "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" + "ld1r {v15.4s}, [%[row_sums_ptr]]\n" + "5:\n" + + "mul v14.4s, v14.4s, v7.4s\n" + "mul v15.4s, v15.4s, v7.4s\n" + "sub v0.4s, v0.4s, v14.4s\n" + "sub v2.4s, v2.4s, v15.4s\n" + + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Multiply scale. + "fmul v0.4s, v16.4s, v0.4s\n" + "fmul v1.4s, v17.4s, v1.4s\n" + + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ row_sums_ptr ] "+r"(row_sums_ptr) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows), [ channel_scales_ptr ] "r"(channel_scales_ptr), + [ batch_offsets_ptr ] "r"(batch_offsets_ptr), + [ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr), + [ is_row_sums_nullptr ] "r"(is_row_sums_nullptr) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); } } @@ -458,9 +458,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( // We don't use this kernel when n_batch = 1 because the baseline kernel // is fine for that case. 
inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { const int kWeightsPerUint32 = 4; @@ -475,14 +475,14 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_vectors_free; const int padded_vectors_size = batch_round_up * m_cols; int8_t *padded_vectors = reinterpret_cast<int8_t *>( - aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); + aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); memset(padded_vectors, 0, padded_vectors_size); void *padded_result_free; const int result_size = n_batch * m_rows * sizeof(float); const int padded_result_size = batch_round_up * m_rows * sizeof(float); float *padded_result = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); + aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); memcpy(padded_result, result, result_size); memset(reinterpret_cast<char *>(padded_result) + result_size, 0, padded_result_size - result_size); @@ -494,7 +494,7 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_scaling_factors_free; const int padded_scaling_factors_size = batch_round_up * sizeof(float); float *padded_scaling_factors = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); + aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size); assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size); memset(padded_scaling_factors, 0, batch_round_up * sizeof(float)); @@ -505,7 +505,7 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_input_offset_free; const int padded_input_offset_size = batch_round_up * sizeof(int32_t); int32_t *padded_input_offset = reinterpret_cast<int32_t *>( - aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); + aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size); assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size); memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t)); @@ -513,8 +513,8 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( // Call the main kernel. 
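The padded scratch buffers above all come from cker's aligned_alloc helper, which returns an aligned pointer plus the raw pointer the caller must later free(). A sketch of that over-allocate-and-round-up pattern; AlignedAllocSketch is a hypothetical standalone helper, not the header's exact implementation.

#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Over-allocate with malloc, round the address up to the requested power-of-two
// alignment, and hand the raw pointer back so the caller can free() it later.
inline void *AlignedAllocSketch(std::size_t alignment, std::size_t size, void **freeing_buffer)
{
  *freeing_buffer = std::malloc(size + alignment - 1);
  if (*freeing_buffer == nullptr)
    return nullptr;
  const std::uintptr_t raw = reinterpret_cast<std::uintptr_t>(*freeing_buffer);
  return reinterpret_cast<void *>((raw + alignment - 1) & ~(static_cast<std::uintptr_t>(alignment) - 1));
}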
DotprodMatrixBatchFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, - padded_result, per_channel_scale, padded_input_offset, row_sums); + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, padded_result, + per_channel_scale, padded_input_offset, row_sums); free(padded_input_offset_free); } @@ -533,13 +533,13 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( } inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result) { DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, - /*row_sums=*/nullptr); + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); } #endif // __aarch64__ @@ -736,7 +736,7 @@ inline void NeonSymmetricQuantizeFloats(const float *values, const int size, for (int i = postamble_start; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); + static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } @@ -830,7 +830,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m // Here the assumption is that each buffer is 4-byte aligned. Otherwise, // performance may suffer significantly. assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col)); const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col)); // Multiply the low bits (i.e. the lower 8 8bit numbers in the @@ -855,7 +855,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m // Here the assumption is that each buffer is 4-bytes aligned. // Otherwise, performance may suffer significantly. 
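The NEON matrix-vector paths here assume every row pointer they dereference is 4-byte aligned (kWeightsPerUint32) and assert it with a bitmask test. A standalone sketch of that check; IsAligned is an illustrative helper name.

#include <cstdint>

// True when ptr is a multiple of `alignment` (a power of two). This is the same test
// the asserts use: (address & (alignment - 1)) == 0 holds only for aligned addresses.
inline bool IsAligned(const void *ptr, std::uintptr_t alignment)
{
  return (reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0;
}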
assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col)); const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col)); const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8); @@ -952,7 +952,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1); const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0); const float32x4_t result1 = - vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); + vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); vst1q_f32(result, result0); vst1q_f32(result + 4 * result_stride, result1); } diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h index 3b3b27f72..2a58a2ec9 100644 --- a/compute/cker/include/cker/PortableTensorUtils.h +++ b/compute/cker/include/cker/PortableTensorUtils.h @@ -138,7 +138,7 @@ inline void PortableSymmetricQuantizeFloats(const float *values, const int size, for (int i = 0; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); + static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index acb6cac55..10f3ecbd3 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -389,6 +389,11 @@ struct SpaceToDepthParams int32_t block_size; }; +struct LeakyReluParams +{ + float alpha; +}; + enum class Order { kColMajor, @@ -475,9 +480,9 @@ enum class QuantizationFlavor // (only those that need perchannel quantization do). template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor = - std::is_floating_point<AccumScalar>::value - ? QuantizationFlavor::kFloatingPoint - : QuantizationFlavor::kIntegerWithUniformMultiplier> + std::is_floating_point<AccumScalar>::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> struct GemmParams { // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) @@ -504,12 +509,12 @@ struct GemmParams const AccumScalar *bias = nullptr; // min clamp bound of destination values. DstScalar clamp_min = std::is_floating_point<DstScalar>::value - ? -std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::lowest(); + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); // max clamp bound of destination values. DstScalar clamp_max = std::is_floating_point<DstScalar>::value - ? std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::max(); + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); }; // Validates self-consistency of GemmParams. diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index 2abb998d0..f73c01523 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -88,8 +88,8 @@ inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multip int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 
0 : -shift; return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), - right_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); } inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, @@ -103,7 +103,7 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int left_shift) { return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); } inline int NodeOffset(int b, int h, int w, int height, int width) @@ -162,7 +162,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, const F3 fixedpoint_input = F3::FromRaw(input >> 1); const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); const F3 fixedpoint_half_three = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); // Newton-Raphson iteration // Naive unoptimized starting guess: x = 1 F3 x = F3::One(); @@ -173,7 +173,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); } const F0 fixedpoint_half_sqrt_2 = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); x = x * fixedpoint_half_sqrt_2; *output_inv_sqrt = x.raw(); if (*output_shift < 0) @@ -429,7 +429,7 @@ template <typename T> class SequentialTensorWriter { public: SequentialTensorWriter(const T *input_data, T *output_data) - : input_data_(input_data), output_ptr_(output_data) + : input_data_(input_data), output_ptr_(output_data) { } diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h index 49c34211a..e3b10990e 100644 --- a/compute/cker/include/cker/eigen/EigenSupport.h +++ b/compute/cker/include/cker/eigen/EigenSupport.h @@ -39,17 +39,17 @@ namespace eigen_support // library. typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenMatrix; + EigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenMatrix; + ConstEigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenTensor; + EigenTensor; typedef Eigen::TensorMap<Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenTensor; + ConstEigenTensor; // Utility functions we need for the EigenTensor API. 
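The MultiplyByQuantizedMultiplier helpers shown above apply a real-valued scale encoded as a Q0.31 multiplier plus a power-of-two shift, using gemmlowp's saturating doubling high multiply and rounding shift. A plain floating-point reference of the same semantics, for illustration only; tie-breaking on exact halves may differ slightly from the bit-exact fixed-point routine.

#include <cmath>
#include <cstdint>

// Reference: the int32 multiplier encodes a real scale of quantized_multiplier * 2^(shift - 31).
inline int32_t MultiplyByQuantizedMultiplierRef(int32_t x, int32_t quantized_multiplier, int shift)
{
  const double real_scale = static_cast<double>(quantized_multiplier) * std::ldexp(1.0, shift - 31);
  return static_cast<int32_t>(std::lround(static_cast<double>(x) * real_scale));
}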
template <typename Device, typename T> struct MatMulConvFunctor diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h index f9c706370..40cb85432 100644 --- a/compute/cker/include/cker/eigen/Utils.h +++ b/compute/cker/include/cker/eigen/Utils.h @@ -36,9 +36,9 @@ namespace cker // Eigen::Map<Eigen::Matrix<const float, ...>> template <typename Scalar> using VectorMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Shape &shape) { @@ -51,10 +51,10 @@ template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Sha // above also applies here. template <typename Scalar> using MatrixMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, - Eigen::Dynamic>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + std::is_const<Scalar>::value, + Eigen::Map< + const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; template <typename Scalar> MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h index dc3e2552d..9d4fd2eaf 100644 --- a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h +++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h @@ -49,20 +49,19 @@ class TensorEvaluatorHasPartialPacket public: template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae( - typename std::enable_if< - unpacket_traits<PacketT>::masked_load_available && - std::is_same< - PacketT, - decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( - std::declval<IndexT>(), - std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) - -> std::true_type; + typename std::enable_if< + unpacket_traits<PacketT>::masked_load_available && + std::is_same<PacketT, + decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( + std::declval<IndexT>(), + std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) + -> std::true_type; template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae(...) -> std::false_type; typedef decltype( - functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; + functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; static constexpr bool value = status::value; }; @@ -71,9 +70,9 @@ public: // [from, to) range. If the mask bit is 1, element will be loaded/stored. 
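TensorEvaluatorHasPartialPacket above relies on the classic decltype/SFINAE detector idiom: the std::true_type overload is viable only when the expression inside decltype compiles. A stripped-down version of the same idiom for an arbitrary member function; HasRunMethod and the probe names are illustrative, not from the upstream code.

#include <type_traits>
#include <utility>

// probe(int) is preferred by overload resolution, but only participates when the
// expression inside decltype is well-formed for U; otherwise probe(...) is chosen.
template <typename T> class HasRunMethod
{
  template <typename U>
  static auto probe(int) -> decltype(std::declval<U &>().Run(0), std::true_type{});
  template <typename U> static std::false_type probe(...);

public:
  static constexpr bool value = decltype(probe<T>(0))::value;
};

struct WithRun
{
  void Run(int) {}
};
struct WithoutRun
{
};

static_assert(HasRunMethod<WithRun>::value, "Run(int) is detected");
static_assert(!HasRunMethod<WithoutRun>::value, "no Run(int) member");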
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename std::enable_if<unpacket_traits<Packet>::masked_load_available, - typename unpacket_traits<Packet>::mask_t>::type - mask(int from, int to) + typename std::enable_if<unpacket_traits<Packet>::masked_load_available, + typename unpacket_traits<Packet>::mask_t>::type + mask(int from, int to) { const Index packet_size = internal::unpacket_traits<Packet>::size; eigen_assert(0 <= from && to <= (packet_size + 1) && from < to); diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h index 92e1614d1..c931ac518 100644 --- a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h +++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h @@ -62,30 +62,27 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionInputMapper< - Scalar_, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar_, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef Scalar_ Scalar; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper VectorMapper; typedef SubMapper LinearMapper; @@ -95,11 +92,11 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper( - const TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device> &tensor, - const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) - : m_impl(tensor.impl().impl()) + const TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device> + &tensor, + const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) + : m_impl(tensor.impl().impl()) { Index patch_rows; Index patch_depth; @@ -167,7 +164,7 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper) - : m_impl(base_mapper.m_impl) + : 
m_impl(base_mapper.m_impl) { m_patch_cols = base_mapper.m_patch_cols; m_num_patches = base_mapper.m_num_patches; @@ -280,11 +277,10 @@ public: private: friend class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; // Load coefficient from a patch specified by the "within patch offset" // (patchId) and the precomputed indices of the first element of the patch. @@ -298,14 +294,14 @@ private: const Index colOffset = patchOffset / m_fastColStride; const Index inputCol = colIndex + colOffset * m_in_col_strides; const Index origInputCol = (m_patch_col_inflate_strides == 1) - ? inputCol - : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); const Index rowOffset = patchOffset - colOffset * m_colStride; const Index inputRow = rowIndex + rowOffset * m_in_row_strides; const Index origInputRow = (m_patch_row_inflate_strides == 1) - ? inputRow - : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols || origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) || (inputRow != origInputRow * m_patch_row_inflate_strides)) @@ -314,7 +310,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -338,7 +334,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -390,7 +386,7 @@ private: // span[0] all the way upto (and including) span[1]. const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template partialPacket<Packet>(inputIndex - span[0], mask<Packet>(span[0], span[1] + 1)); } @@ -445,10 +441,10 @@ private: // Load partial packets and do bit-wise OR to generate required packet return internal::por<Packet>( - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], - patchOffsets2Cols[0], colOffsets[0]), - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], - patchOffsets2Cols[1], colOffsets[1])); + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], + patchOffsets2Cols[0], colOffsets[0]), + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], + patchOffsets2Cols[1], colOffsets[1])); } // Helper function to load a packet that is present in a single columns. 
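The loadCoeff paths above reduce to one pattern: decompose the within-patch offset into depth, row, and column, map that to the source image with the input strides, and return zero when the location lands in the padded border. A compact scalar version of that pattern; PatchGeometry and PatchCoeff are illustrative names, not the Eigen mapper's real members.

#include <cstddef>

// Geometry of the source image seen by the patch extractor; strides are in elements.
struct PatchGeometry
{
  std::ptrdiff_t row_stride;
  std::ptrdiff_t col_stride;
  std::ptrdiff_t input_rows;
  std::ptrdiff_t input_cols;
};

// Returns the coefficient at (depth, input_row, input_col), or 0 for padded positions,
// mirroring the bounds check before the strided index computation in loadCoeff.
inline float PatchCoeff(const float *input, const PatchGeometry &g, std::ptrdiff_t depth,
                        std::ptrdiff_t input_row, std::ptrdiff_t input_col)
{
  if (input_row < 0 || input_row >= g.input_rows || input_col < 0 || input_col >= g.input_cols)
    return 0.0f;
  return input[depth + input_row * g.row_stride + input_col * g.col_stride];
}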
@@ -477,7 +473,7 @@ private: // no padding const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); @@ -490,7 +486,7 @@ private: // load. template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<Packet>::size; @@ -538,7 +534,7 @@ private: // packets. template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<PacketT>::size; @@ -604,7 +600,7 @@ private: // no padding const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } @@ -627,10 +623,10 @@ private: computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const { const size_t NumInputDims = - array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; const Index patch2DIndex = - (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + (NumInputDims == 3) ? 
patchIndex : (patchIndex - otherIndex * m_num_patches); otherIndex *= m_patchInputStride; colIndex = patch2DIndex / m_fastOutputRows; rowIndex = patch2DIndex - colIndex * m_outputRows; @@ -689,31 +685,28 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef typename packet_traits<Scalar>::type Packet; typedef typename packet_traits<Scalar>::half HalfPacket; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - ParentMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + ParentMapper; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef Self LinearMapper; @@ -722,16 +715,16 @@ public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) + : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset + base_mapper.m_depth_offset), - m_col_offset(horiz_offset + base_mapper.m_col_offset), - m_base_mapper(base_mapper.m_base_mapper) + : m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset), + m_base_mapper(base_mapper.m_base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } @@ -766,7 +759,7 @@ public: { typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT; return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>( - i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); } template <typename Packet> EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; } @@ -781,7 +774,7 @@ public: 
EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { const Index max_col = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); return std::min<Index>(1 + max_col, patchCols()); } @@ -789,8 +782,8 @@ public: EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const { const Index max_row = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / - fastPatchRowStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / + fastPatchRowStride(); return std::min<Index>(1 + max_row, patchRows()); } @@ -862,7 +855,7 @@ public: } template <typename PacketT = Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const { const Index inputIndex = depth + baseIndex; @@ -913,8 +906,8 @@ public: const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides; *orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1) - ? input_row - : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); + ? input_row + : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) || (input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides); @@ -932,8 +925,8 @@ public: const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides; *orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1) - ? input_col - : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0); + ? input_col + : ((input_col >= 0) ? 
(input_col / m_base_mapper.m_fastInputColStride) : 0); return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) || (input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides); @@ -1033,23 +1026,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, - Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1159,7 +1149,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1248,22 +1238,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1378,7 +1366,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1472,22 +1460,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1582,27 +1568,25 @@ struct gemm_pack_rhs< */ template <typename Input, typename Kernel, typename OutputKernel = const NoOpOutputKernel> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< - internal::traits<Input>::Layout == ColMajor, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const OutputKernel>>, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const OutputKernel>>>::type + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const OutputKernel>>, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const 
TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const OutputKernel>>>::type SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1, const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME, const Index row_in_stride = 1, const Index col_in_stride = 1, @@ -1612,11 +1596,11 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str typedef typename internal::traits<Input>::Index TensorIndex; TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex>> - in(input); + in(input); TensorRef< - Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, - internal::traits<Kernel>::Layout, TensorIndex>> - kern(kernel); + Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, + internal::traits<Kernel>::Layout, TensorIndex>> + kern(kernel); EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1735,46 +1719,46 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str } if (padding_explicit) { - return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, - padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - .extract_image_patches( - kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, + col_stride, row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, + padding_bottom, padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, padding_bottom, + padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } else { return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, padding_type) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - 
.extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, - col_in_stride, padding_type) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, + col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } } diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h index 6149cafa7..a70e39cc9 100644 --- a/compute/cker/include/cker/operation/AveragePool.h +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -73,10 +73,10 @@ void AveragePool<float>(const PoolParams ¶ms, const Shape &input_shape, cons int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -146,11 +146,11 @@ inline void AveragePool16(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); @@ -283,11 +283,11 @@ inline void AveragePool32(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git 
a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h index e33b2fba5..980ad48dd 100644 --- a/compute/cker/include/cker/operation/BatchToSpaceND.h +++ b/compute/cker/include/cker/operation/BatchToSpaceND.h @@ -43,7 +43,7 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_ // Similarly, (*end_index) * block_shape_dim is rounded up too (note that // end_index is exclusive). *end_index = - std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); + std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); } template <typename T> @@ -116,7 +116,7 @@ inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1 for (int in_w = in_w_start; in_w < in_w_end; ++in_w) { const int out_w = - in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; + in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; assert(out_w >= 0); assert(out_w < output_width); T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h index d9917a9da..fe5f87746 100644 --- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -139,7 +139,7 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, // From this point it is assumed contractually that corresponding dimensions // in shape0 and shape1 are either (a) equal or (b) one or other equals 1. const bool swap_inputs = - params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; + params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0; const Shape *shape_b = swap_inputs ? 
&extended_shape0 : &extended_shape1; @@ -281,8 +281,8 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S break; case nnfw::cker::BinaryArithmeticOpType::MUL: optimized::BroadcastMulDispatchQuant8( - params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, + const_cast<uint8_t *>(input2_data), output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: case nnfw::cker::BinaryArithmeticOpType::POW: @@ -320,8 +320,8 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S break; case nnfw::cker::BinaryArithmeticOpType::POW: reference::BroadcastBinaryArithmeticOpSlow<float>( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - GetBinaryArtithmeticFn<op_type, float>()); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + GetBinaryArtithmeticFn<op_type, float>()); break; default: assert(false); diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h index d69b38aca..24d4cc4c7 100644 --- a/compute/cker/include/cker/operation/Common.h +++ b/compute/cker/include/cker/operation/Common.h @@ -82,7 +82,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (; i < bias_size; i++) { array_ptr[i] = - ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); + ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); } } #else // not NEON @@ -91,7 +91,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (int i = 0; i < bias_size; i++) { array_data[array_offset + i] = ActivationFunctionWithMinMax( - array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); + array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); } } #endif diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h index 47eb6034c..ac6af8487 100644 --- a/compute/cker/include/cker/operation/Comparison.h +++ b/compute/cker/include/cker/operation/Comparison.h @@ -42,7 +42,7 @@ inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data, const Shape &output_shape, bool *output_data) { const int64_t flatsize = // number of data.... 
- MatchingFlatSize(input1_shape, input2_shape, output_shape); + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = F(input1_data[i], input2_data[i]); @@ -79,9 +79,9 @@ inline void ComparisonWithScaling(ComparisonParams ¶ms, const Shape &input1_ const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[i] = F(scaled_input1_val, scaled_input2_val); } } @@ -111,8 +111,8 @@ BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *inp for (int c = 0; c < output_shape.Dims(3); ++c) { output_data[Offset(output_shape, b, y, x, c)] = - F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]); + F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]); } } } @@ -159,15 +159,15 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, for (int c = 0; c < output_shape.Dims(3); ++c) { const int32_t input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; + input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32_t input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val); } } @@ -175,55 +175,53 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, } } -#define TFLITE_COMPARISON_OP(name) \ - template <typename T> \ - inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, bool *output_data) \ - { \ - Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ - output_data); \ - } \ - template <typename T> \ - inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void name##WithScaling(ComparisonParams ¶ms, const Shape &input1_shape, \ - const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, \ - bool *output_data) \ - { \ - ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, 
output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##WithScaling(ComparisonParams ¶ms, \ - const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ +#define TFLITE_COMPARISON_OP(name) \ + template <typename T> \ + inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ + const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ + output_data); \ + } \ + template <typename T> \ + inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ } TFLITE_COMPARISON_OP(Equal); diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h index 394123e30..9aaca00b7 
100644 --- a/compute/cker/include/cker/operation/Concatenation.h +++ b/compute/cker/include/cker/operation/Concatenation.h @@ -142,7 +142,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams ¶ms, for (int j = 0; j < copy_size; ++j) { const int32_t value = - static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; + static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0)); } } diff --git a/compute/cker/include/cker/operation/DepthToSpace.h b/compute/cker/include/cker/operation/DepthToSpace.h new file mode 100644 index 000000000..e57fef01d --- /dev/null +++ b/compute/cker/include/cker/operation/DepthToSpace.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_DEPTH_TO_SPACE_H__ +#define __NNFW_CKER_DEPTH_TO_SPACE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void DepthToSpace(const Shape &unextended_input_shape, const T *input_data, + const Shape &unextended_output_shape, T *output_data, int32_t block_size) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + + const int output_depth = output_shape.Dims(3); + const int batch_size = output_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. 
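The memcpy loop that follows realizes the usual NHWC depth-to-space (DCR) index mapping. As a cross-check, a minimal standalone sketch of that mapping with plain arrays instead of cker::Shape (the helper and its names are illustrative, not part of the patch):

// Naive NHWC depth-to-space, assuming in_depth == block * block * out_depth.
void DepthToSpaceNaive(const float *in, float *out, int batch, int in_h, int in_w,
                       int in_depth, int block)
{
  const int out_depth = in_depth / (block * block);
  const int out_h = in_h * block;
  const int out_w = in_w * block;
  for (int b = 0; b < batch; ++b)
    for (int h = 0; h < out_h; ++h)
      for (int w = 0; w < out_w; ++w)
        for (int c = 0; c < out_depth; ++c)
        {
          const int ih = h / block, iw = w / block;
          const int ic = ((h % block) * block + (w % block)) * out_depth + c;
          out[((b * out_h + h) * out_w + w) * out_depth + c] =
            in[((b * in_h + ih) * in_w + iw) * in_depth + ic];
        }
}

The memcpy-based version in DepthToSpace.h produces the same result while copying block_size * output_depth contiguous values per call instead of iterating channel by channel.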
+ const int stride = block_size * output_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int in_h = 0; in_h < input_height; ++in_h) + { + const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0); + for (int offset_h = 0; offset_h < block_size; ++offset_h) + { + const T *src = input_ptr; + for (int in_w = 0; in_w < input_width; ++in_w) + { + memcpy(output_data, src, stride * sizeof(T)); + output_data += stride; + src += input_depth; + } + input_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h index 814a9e019..436ddd8c9 100644 --- a/compute/cker/include/cker/operation/DepthwiseConv.h +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -22,143 +22,159 @@ #include "cker/Types.h" #include "cker/Utils.h" #include "cker/neon/neon_check.h" +#include "cker/operation/optimized/DepthwiseConvFloat.h" #include "cker/operation/optimized/DepthwiseConvUint8.h" +#include "cker/CpuBackendThreadpool.h" namespace nnfw { namespace cker { -inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &filter_shape, - const uint8_t *filter_data, const Shape &bias_shape, - const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) +// TODO(luwa): add multithread to per-channel depthwise_conv +// DepthwiseConv can run with multi threads on the dim specified by thread_dim. +// Each thread processes output elements on dim, thread_dim, in the range of +// [thread_start, thread_end). +// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it +// means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :]. +template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { - const int depth_multiplier = params.depth_multiplier; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - assert(dilation_width_factor >= 1); - assert(dilation_height_factor >= 1); - UNUSED_RELEASE(dilation_width_factor); - UNUSED_RELEASE(dilation_height_factor); - assert(input_shape.DimensionsCount() == 4); - assert(filter_shape.DimensionsCount() == 4); - assert(output_shape.DimensionsCount() == 4); - assert(output_activation_min <= output_activation_max); - UNUSED_RELEASE(output_activation_min); - UNUSED_RELEASE(output_activation_max); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_depth = input_shape.Dims(3); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(input_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(depth_multiplier); - -// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on -// Jetson TX-2. This compiler does not support the offsetof() macro. 
-#if defined(__aarch64__) -// TODO Use below codes - -// const int stride_width = params.stride_width; -// const int stride_height = params.stride_height; -// const int pad_width = params.padding_values.width; -// const int pad_height = params.padding_values.height; -// const int output_shift = params.output_shift; -// -// // Call kernel optimized for depthwise convolutions using 3x3 filters if -// // parameters are supported. -// if (Fast3x3FilterKernelSupported( -// input_shape, filter_shape, stride_width, stride_height, -// dilation_width_factor, dilation_height_factor, pad_width, pad_height, -// depth_multiplier, output_shape, output_shift)) { -// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, -// filter_data, bias_shape, bias_data, output_shape, -// output_data); -// return; -// } -#endif - - optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data); + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), input_shape_(input_shape), input_data_(input_data), + filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape), + bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data), + thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_, + bias_shape_, bias_data_, output_shape_, output_data_, + thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + // const CpuFlags& cpu_flags_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape) +{ + // How many scalar multiplications are needed to make it worth using one + // more thread + static constexpr int kMinMulPerThread = 1 << 13; // 8k + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_muls = output_shape.FlatSize() * filter_height * filter_width; + // Try to avoid real runtime divisions if possible by dividing by a + // compile-time constant. + int thread_count = std::max(1, num_muls / kMinMulPerThread); + return thread_count; +} + +inline bool MultithreadAlongBatches(int thread_count, int batches) +{ + assert(thread_count >= 2); + // If there are fewer batch entries than the number of threads we want to use, + // then better do intra-batch-entry multithreading. 
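For scale, a worked instance of the multiply-count heuristic in HowManyConvThreads above (illustrative numbers only; the code further down additionally clamps the result to the backend's max_num_threads and, for the float path, to 2). MultithreadAlongBatches then decides whether those threads split the work across batches or across output rows:

// kMinMulPerThread = 1 << 13 (8k), as in HowManyConvThreads.
constexpr long long kMinMulPerThread = 1 << 13;
constexpr long long num_muls = 1LL * 112 * 112 * 32 * 3 * 3; // 1x112x112x32 output, 3x3 filter
constexpr long long thread_count = num_muls / kMinMulPerThread;
static_assert(thread_count == 441, "about 3.6M multiplies -> 441 candidate threads");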
+ if (batches < thread_count) + { + return false; + } + // If there are at least 2 batch entries to be handed to each thread, then + // it's safe to proceed with batch-wise multithreading: each thread will have + // approximately equal number of batch entries to handle, so the load + // balancing will be reasonable, and the amount to which the load is not + // perfectly balanced will be offset by the inherent advantages of + // batch-wise multithreading (each thread is more efficient thanks to working + // on larger buffers with less boundary-handling overhead). + if (batches >= 2 * thread_count) + { + return true; + } + // In the limit case were there are at least 1 but not much more than 1 + // batch entries per thread, it may be a good idea to do per-batch + // multithreading if the number of batch entries is a multiple of the number + // of threads, so that each thread will have the same number of batch entries + // to process. + return ((batches % thread_count) == 0); } +template <typename T, typename TS> inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const float *input_data, const Shape &filter_shape, - const float *filter_data, const Shape &bias_shape, const float *bias_data, - const Shape &output_shape, float *output_data) + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, ruy::Context *ruy_context) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int depth_multiplier = params.depth_multiplier; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; assert(input_shape.DimensionsCount() == 4); assert(filter_shape.DimensionsCount() == 4); assert(output_shape.DimensionsCount() == 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int input_depth = input_shape.Dims(3); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); + int thread_count = HowManyConvThreads(output_shape, filter_shape); + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads(); + + thread_count = std::max(1, std::min(thread_count, max_threads)); + // Cap the number of threads to 2 for float path to avoid regression in + // performance (b/132294857). 
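Once a dimension is chosen, the dispatcher further down splits it across tasks as evenly as possible with an integer-only formula. A standalone sketch of that split (an assumed free function, not the cker API):

#include <utility>
#include <vector>

// Mirrors: thread_end = thread_start + (dim_size - thread_start) / (threads left)
std::vector<std::pair<int, int>> SplitRanges(int dim_size, int thread_count)
{
  std::vector<std::pair<int, int>> ranges;
  int start = 0;
  for (int i = 0; i < thread_count; ++i)
  {
    const int end = start + (dim_size - start) / (thread_count - i);
    ranges.emplace_back(start, end); // task i covers [start, end) of the chosen dim
    start = end;
  }
  return ranges;
}

For example, SplitRanges(10, 4) yields [0,2), [2,4), [4,7), [7,10): every task gets within one row of the others and the whole range is covered exactly once.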
+ if (std::is_floating_point<T>::value) + { + thread_count = std::min(thread_count, 2); + } + + const int output_batches = output_shape.Dims(0); const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(bias_shape); - for (int b = 0; b < batches; ++b) + if (thread_count == 1) + { + optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, 0, output_height, + 1); + return; + } + + int thread_dim, thread_dim_size; + if (MultithreadAlongBatches(thread_count, output_batches)) + { + thread_dim = 0; + thread_dim_size = output_batches; + } + else + { + thread_dim = 1; + thread_dim_size = output_height; + } + + std::vector<DepthwiseConvWorkerTask<T, TS>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). + tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int ic = 0; ic < input_depth; ++ic) - { - for (int m = 0; m < depth_multiplier; m++) - { - const int oc = m + ic * depth_multiplier; - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - const int in_y = in_y_origin + dilation_height_factor * filter_y; - // If the location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) - { - float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)]; - float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)]; - total += (input_value * filter_value); - } - } - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[oc]; - } - output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax( - total + bias_value, output_activation_min, output_activation_max); - } - } - } - } + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); } } // namespace cker diff --git a/compute/cker/include/cker/operation/ELU.h b/compute/cker/include/cker/operation/ELU.h new file mode 100644 index 000000000..6bdd7c62e --- /dev/null +++ b/compute/cker/include/cker/operation/ELU.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ELU_H__ +#define __NNFW_CKER_ELU_H__ + +#include "cker/Shape.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void ELU(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + const float val = input_data[i]; + output_data[i] = val < 0.0 ? std::exp(val) - 1 : val; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ELU_H__ diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h index 3d1837f47..13fccfd15 100644 --- a/compute/cker/include/cker/operation/Einsum.h +++ b/compute/cker/include/cker/operation/Einsum.h @@ -394,8 +394,8 @@ private: for (int label = 0; label < num_labels; ++label) { bool removed = (_output_label_counts[label] == 0); - bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 || - _input_label_counts[1][label] == 0; + bool unique = + num_inputs == 1 || _input_label_counts[0][label] == 0 || _input_label_counts[1][label] == 0; _label_types[label] = getDimensionType(removed, unique); } } @@ -483,8 +483,8 @@ private: if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size()) { throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " + - std::to_string(labels->size() - 1) + " but got: " + - std::to_string(inputs[i].shape.DimensionsCount())}; + std::to_string(labels->size() - 1) + + " but got: " + std::to_string(inputs[i].shape.DimensionsCount())}; } int ellipsis_axis = -1; const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1; @@ -511,7 +511,7 @@ private: } std::vector<bool>::iterator it_input = - std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); + std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis) { return; @@ -645,11 +645,11 @@ private: // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor. const int32_t output_size = - reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; + reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; functor::ReduceFunctor<Eigen::ThreadPoolDevice, Reducer>::Reduce( - device, output->shaped<T, 1>({output_size}), - input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), - Reducer()); + device, output->shaped<T, 1>({output_size}), + input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), + Reducer()); } bool shouldSwapFreeAndContract(const Labels &labels, @@ -779,7 +779,7 @@ private: { const int32_t count = label_counts[label]; const int current_axis = - should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size(); + should_inflate ? 
strided_shape_dims.size() : inflated_shape_dims.size(); const int32_t dim = input.shape.Dims(current_axis); strided_shape_dims.push_back(dim); inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim); @@ -879,7 +879,7 @@ private: for (size_t i = 0; i < inputs.size(); ++i) { const int32_t free_axis = - inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); + inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis)); } bool adj_x = swap_free_and_contract[0]; diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h index 9d080d89b..0e980f18e 100644 --- a/compute/cker/include/cker/operation/Elementwise.h +++ b/compute/cker/include/cker/operation/Elementwise.h @@ -98,6 +98,28 @@ inline void Floor(const Shape &input_shape, const float *input_data, const Shape } } +inline void Sqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::sqrt(input_data[i]); + } +} + +inline void Square(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = input_data[i] * input_data[i]; + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h index 14daf9839..d657acc12 100644 --- a/compute/cker/include/cker/operation/Fill.h +++ b/compute/cker/include/cker/operation/Fill.h @@ -24,27 +24,12 @@ namespace nnfw { namespace cker { -template <typename T> -inline void Fill(const Shape &input_shape, int *input_data, const T value_data, - const Shape &output_shape, T output_data) +template <typename T> inline void Fill(const T value_data, const Shape &output_shape, T output_data) { - int input_size = input_shape.FlatSize(); - int output_size = 1; - for (int i = 0; i < input_size; i++) + int output_size = output_shape.FlatSize(); + for (int i = 0; i < output_size; i++) { - output_size *= input_data[i]; - } - - if (output_size == output_shape.FlatSize()) - { - for (int i = 0; i < output_size; i++) - { - output_data[i] = *value_data; - } - } - else - { - throw std::runtime_error("Cker Fill.h: output's size is not matched inferred size of output"); + output_data[i] = *value_data; } } diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 958532402..b7d27e85d 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -117,7 +117,7 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu const int filter_dim_count = filter_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); const int output_depth = - MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); + MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { @@ -229,7 +229,7 @@ inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, const int 
weights_dims_count = weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); @@ -249,7 +249,7 @@ inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, { int idx_1 = w1_indices[pw1]; output_data[b * output_depth + idx_0] += - weights_data[pw1] * input_data[b * accum_depth + idx_1]; + weights_data[pw1] * input_data[b * accum_depth + idx_1]; } } } diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h index 28ae7a3bc..df397f73e 100644 --- a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h +++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h @@ -70,7 +70,7 @@ inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams ¶ms, const int weights_dims_count = weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h index d17a5796b..8a97d8421 100644 --- a/compute/cker/include/cker/operation/FusedBatchNorm.h +++ b/compute/cker/include/cker/operation/FusedBatchNorm.h @@ -105,7 +105,7 @@ public: float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size)); // This adjustment is for Bessel's correction float rest_size_adjust = - static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); + static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth); @@ -117,12 +117,12 @@ public: batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv; auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale) - .eval() - .reshape(one_by_depth) - .broadcast(bcast_spec); + .eval() + .reshape(one_by_depth) + .broadcast(bcast_spec); auto x_scaled = x_centered * scaling_factor; auto x_shifted = - (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); + (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); UNUSED_RELEASE(rest_size_adjust); diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h index a0abf2935..211db98ce 100644 --- a/compute/cker/include/cker/operation/Helper/BCast.h +++ b/compute/cker/include/cker/operation/Helper/BCast.h @@ -22,7 +22,7 @@ * ToDo : This file will be moved into upper folder when integrate with other * custom operations. * And It should merged with EinsumHelper's BCast. 
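The Eigen expressions reindented in FusedBatchNorm.h above amount to the standard training-mode batch-norm arithmetic: per channel, subtract the batch mean, scale by the reciprocal standard deviation, then apply scale and offset (the rest_size_adjust noted there is Bessel's correction). A per-element scalar sketch of the same formula (an illustrative helper, not the cker API):

#include <cmath>

// y = (x - mean) * rsqrt(var + epsilon) * scale + offset
inline float BatchNormOne(float x, float mean, float variance, float scale, float offset,
                          float epsilon)
{
  const float inv_std = 1.0f / std::sqrt(variance + epsilon);
  return (x - mean) * inv_std * scale + offset;
}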
-**/ + **/ #include "cker/Shape.h" #include "cker/eigen/EigenSupport.h" @@ -393,7 +393,7 @@ public: BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true, const bool return_flattened_batch_indices = false) - : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) + : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) { } diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h index baeafd7c9..cbebff142 100644 --- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -168,7 +168,7 @@ public: // Must have lo < hi UniformDistribution(int32_t lo, int32_t hi) - : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) + : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) { } @@ -207,7 +207,7 @@ public: // Must have lo < hi UniformDistribution(int64_t lo, int64_t hi) - : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) + : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) { } @@ -291,22 +291,22 @@ public: template <typename Generator> class UniformFullIntDistribution<Generator, int32_t> - : public UniformFullIntDistribution32<Generator, int32_t> + : public UniformFullIntDistribution32<Generator, int32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint32_t> - : public UniformFullIntDistribution32<Generator, uint32_t> + : public UniformFullIntDistribution32<Generator, uint32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, int64_t> - : public UniformFullIntDistribution64<Generator, int64_t> + : public UniformFullIntDistribution64<Generator, int64_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint64_t> - : public UniformFullIntDistribution64<Generator, uint64_t> + : public UniformFullIntDistribution64<Generator, uint64_t> { }; @@ -324,7 +324,7 @@ public: PHILOX_DEVICE_INLINE explicit SingleSampleAdapter(Generator *gen) - : generator_(gen), used_result_index_(Generator::kResultElementCount) + : generator_(gen), used_result_index_(Generator::kResultElementCount) { } @@ -615,8 +615,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, double> public: // The number of elements that will be returned. static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) - ? SingleSampleGenerator::kNativeElementCount / 2 - : 1; + ? SingleSampleGenerator::kNativeElementCount / 2 + : 1; // Cost of generation of a single element (in cycles). 
static constexpr int kElementCost = 90; // Indicate that this distribution may take variable number of samples diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h index 85d267723..6e9ffbdfd 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -109,7 +109,7 @@ template <class Distribution> struct FillPhiloxRandomTask<Distribution, true> { const int kGroupSize = Distribution::kResultElementCount; static const int kGeneratorSkipPerOutputGroup = - kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; + kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; int64_t offset = 0; diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h index e6ac008a5..ec29a15c3 100644 --- a/compute/cker/include/cker/operation/Helper/Tensor.h +++ b/compute/cker/include/cker/operation/Helper/Tensor.h @@ -29,58 +29,58 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> str { // Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Tensor; + Tensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstTensor; + ConstTensor; // Unaligned Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>> UnalignedTensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>> - UnalignedConstTensor; + UnalignedConstTensor; typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned> - Tensor32Bit; + Tensor32Bit; // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Scalar; + Scalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstScalar; // Unaligned Scalar tensor of scalar type T. typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedScalar; + UnalignedScalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> + UnalignedConstScalar; // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstFlat; + ConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstVec; + ConstVec; // Unaligned Rank-1 tensor (vector) of scalar type T. 
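The TTypes struct being realigned here only bundles Eigen::TensorMap typedefs; the same non-owning views can be spelled out directly. A standalone Eigen usage sketch (not part of the patch):

#include <unsupported/Eigen/CXX11/Tensor>

void MapExample()
{
  float buffer[6] = {1, 2, 3, 4, 5, 6};
  // A non-owning 2x3 row-major view over buffer, equivalent to
  // TTypes<float, 2>::UnalignedMatrix.
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> mat(buffer, 2, 3);
  mat(1, 2) = 42.0f; // writes through to buffer[5]
}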
typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedFlat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> - UnalignedConstFlat; + UnalignedConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedVec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> UnalignedConstVec; // Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstMatrix; + ConstMatrix; // Unaligned Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>> UnalignedMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>> - UnalignedConstMatrix; + UnalignedConstMatrix; }; typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32; diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h index 6445e8a2b..8fa8b03bc 100644 --- a/compute/cker/include/cker/operation/InstanceNorm.h +++ b/compute/cker/include/cker/operation/InstanceNorm.h @@ -78,8 +78,8 @@ inline void InstanceNorm(const InstanceNormParams ¶ms, const Shape &input_sh double input_value = input_data[Offset(output_shape, batch, height, width, channel)]; double output_value = input_value * a + b; output_data[Offset(output_shape, batch, height, width, channel)] = - ActivationFunctionWithMinMax((float)output_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax((float)output_value, output_activation_min, + output_activation_max); } } } diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h index a0075c3d0..c1fca91cc 100644 --- a/compute/cker/include/cker/operation/L2Normalize.h +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -77,7 +77,7 @@ void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uin { int32_t diff = *input_data - input_zero_point; int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); int32_t unclamped_output_val = 128 + rescaled_diff; int32_t output_val = std::min(static_cast<int32_t>(255), std::max(static_cast<int32_t>(0), unclamped_output_val)); diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h index 27beaaead..a8f1f8ca3 100644 --- a/compute/cker/include/cker/operation/LSTM.h +++ b/compute/cker/include/cker/operation/LSTM.h @@ -283,23 +283,23 @@ void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, const float // contiguous, and we manually loop over the batched outputs. 
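Every gate reindented in this LstmStepFloat hunk follows the same affine-plus-activation pattern. A scalar sketch of one gate for a single batch entry, ignoring the optional auxiliary input, peephole and layer-norm terms (the names, the row-major weight layout and the helper itself are assumptions for illustration, not the cker API):

#include <cmath>

// gate[j] = sigmoid( sum_i W_x[j][i] * x[i] + sum_k W_h[j][k] * h_prev[k] + bias[j] )
void LstmGateSketch(const float *x, const float *h_prev, const float *W_x, const float *W_h,
                    const float *bias, int n_input, int n_output, int n_cell, float *gate)
{
  for (int j = 0; j < n_cell; ++j)
  {
    float acc = bias ? bias[j] : 0.0f;
    for (int i = 0; i < n_input; ++i)
      acc += W_x[j * n_input + i] * x[i];
    for (int k = 0; k < n_output; ++k)
      acc += W_h[j * n_output + k] * h_prev[k];
    gate[j] = 1.0f / (1.0f + std::exp(-acc)); // the cell update gate uses tanh instead
  }
}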
// LINT.IfChange inline void LstmStepFloat( - const float *input_ptr, const float *input_to_input_weights_ptr, - const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, - const float *input_to_output_weights_ptr, const float *aux_input_ptr, - const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, - const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, - const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, - const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, - const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, - const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, - const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, - const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, - const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, - const float *output_gate_bias_ptr, const float *projection_weights_ptr, - const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, - int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, - float *output_state_ptr, float *cell_state_ptr, float *scratch0, float *scratch1, - float *scratch2, float *scratch3, float *output_ptr) + const float *input_ptr, const float *input_to_input_weights_ptr, + const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, + const float *input_to_output_weights_ptr, const float *aux_input_ptr, + const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, + const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, + const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, + const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, + const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, + const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, + const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, + const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, + const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, + const float *output_gate_bias_ptr, const float *projection_weights_ptr, + const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input, + int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr, + float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3, + float *output_ptr) { // Since we have already checked that weights are all there or none, we can // check the existence of only one to the get the condition. @@ -314,7 +314,7 @@ inline void LstmStepFloat( // Check if inputs are all zeros so we can skip some computations. const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input); const bool is_aux_input_all_zeros = - (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); if (!use_cifg) { // Calculate the input gate. (If not CIFG.) 
@@ -336,11 +336,11 @@ inline void LstmStepFloat( forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); // Calculate the cell update gate. CalculateLstmGateFloat( - input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, - output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, - /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, - n_batch, n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, - is_input_all_zeros, is_aux_input_all_zeros); + input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, + output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, + /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, + is_input_all_zeros, is_aux_input_all_zeros); // Update the cell state. UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch, cell_gate_scratch, use_cifg, params->cell_clip); diff --git a/compute/cker/include/cker/operation/LeakyReLU.h b/compute/cker/include/cker/operation/LeakyReLU.h new file mode 100644 index 000000000..e12d01bba --- /dev/null +++ b/compute/cker/include/cker/operation/LeakyReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_LEKAY_RELU_H__ +#define __NNFW_CKER_LEKAY_RELU_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void LeakyReLU(const LeakyReluParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &output_shape, float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + const float val = input_data[i]; + // Note that alpha might be > 1 or < 0, so we don't use std::max here. + output_data[i] = val > 0 ? 
val : val * params.alpha; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU_H__ diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h index 326a44f0c..eb7bdd900 100644 --- a/compute/cker/include/cker/operation/LogSoftMax.h +++ b/compute/cker/include/cker/operation/LogSoftMax.h @@ -71,7 +71,7 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, const Shape &input_shape, for (int c = 0; c < depth; ++c) { output_data[(i * depth + c) * inner_size + j] = - (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; + (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; } } } @@ -124,10 +124,10 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, float input_scale, const Sha for (int c = 0; c < depth; ++c) { const float log_prob = - scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; + scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; const int32_t prob_quantized = std::rint(log_prob) + params.zero_point; output_data[(i * depth + c) * inner_size] = - static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); } } } diff --git a/compute/cker/include/cker/operation/LogicalAnd.h b/compute/cker/include/cker/operation/LogicalAnd.h new file mode 100644 index 000000000..e877f5f47 --- /dev/null +++ b/compute/cker/include/cker/operation/LogicalAnd.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
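For the float path of LogSoftMax.h touched above, the reindented output line is the tail of the usual numerically stable formulation: shift by the slice maximum, scale by beta, subtract the log of the summed exponentials. A self-contained sketch over one slice of depth values (an assumed standalone helper, not the cker entry point):

#include <algorithm>
#include <cmath>

// Assumes depth >= 1.
void LogSoftmax1D(const float *in, float *out, int depth, float beta)
{
  float max_val = in[0];
  for (int c = 1; c < depth; ++c)
    max_val = std::max(max_val, in[c]);
  float sum = 0.0f;
  for (int c = 0; c < depth; ++c)
    sum += std::exp((in[c] - max_val) * beta);
  const float log_sum = std::log(sum);
  for (int c = 0; c < depth; ++c)
    out[c] = (in[c] - max_val) * beta - log_sum; // matches the reformatted output line
}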
+ */ + +#ifndef __NNFW_CKER_LOGICAL_AND_H__ +#define __NNFW_CKER_LOGICAL_AND_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void LogicalAndBroadcast(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = in1_val && in2_val; + } + } + } + } +} + +template <typename T> +inline void LogicalAndElementwise(const Shape &shape, const T *input1_data, const T *input2_data, + T *output_data) +{ + + int num_elements = shape.FlatSize(); + + for (int t = 0; t < num_elements; t++) + { + output_data[t] = input1_data[t] && input2_data[t]; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_LOGICAL_AND_H__ diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h index 5674ff3ef..ef2868455 100644 --- a/compute/cker/include/cker/operation/MatrixBandPart.h +++ b/compute/cker/include/cker/operation/MatrixBandPart.h @@ -43,11 +43,11 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap if (!(num_lower_diags <= row_num)) throw std::runtime_error( - "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); + "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); if (!(num_upper_diags <= col_num)) throw std::runtime_error( - "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); + "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init @@ -60,9 +60,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap auto input = input_data + (batch * row_num * col_num + row * col_num); const T band_start = - num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); - const T band_end = num_upper_diags < 0 ? col_num : std::min(static_cast<T>(col_num), - row + num_upper_diags + 1); + num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); + const T band_end = num_upper_diags < 0 + ? 
col_num + : std::min(static_cast<T>(col_num), row + num_upper_diags + 1); for (T band_idx = band_start; band_idx < band_end; band_idx++) { diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h index ea3fcaca6..5dc84d368 100644 --- a/compute/cker/include/cker/operation/MaxPool.h +++ b/compute/cker/include/cker/operation/MaxPool.h @@ -67,10 +67,10 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -79,8 +79,8 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl { int out_offset = NodeOffset(b, ph, pw, output_height, output_width); out_mat.col(out_offset) = - out_mat.col(out_offset) - .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); + out_mat.col(out_offset) + .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); } } } @@ -139,8 +139,8 @@ void MaxPool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, const const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h index c0dbc6df5..ddc27b4c2 100644 --- a/compute/cker/include/cker/operation/OneHot.h +++ b/compute/cker/include/cker/operation/OneHot.h @@ -55,7 +55,7 @@ void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t ax for (int k = 0; k < suffix_dim_size; ++k, ++output_data) { *output_data = - static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; + static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; } } } diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h index 5c3a773a2..d6ccc68c8 100644 --- a/compute/cker/include/cker/operation/Range.h +++ b/compute/cker/include/cker/operation/Range.h @@ -35,8 +35,8 @@ template <typename T> inline int GetSize(T start, T limit, T delta) } int size = (std::is_integral<T>::value - ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) - : std::ceil(std::abs((limit - start) / delta))); + ? 
((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) + : std::ceil(std::abs((limit - start) / delta))); return size; } diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index 2b2e8d338..dbf938147 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -50,7 +50,7 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape { int r_idx = 0; float tmp_data[4] = { - 0, + 0, }; float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data); for (; r_idx <= reduce_size - 32; r_idx += 32) @@ -143,7 +143,7 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; @@ -319,7 +319,7 @@ public: for (size_t idx = 0; idx < num_outputs; ++idx) { const U value = - static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; + static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; output_data[idx] = static_cast<T>(value); } } @@ -329,7 +329,7 @@ public: for (size_t idx = 0; idx < num_outputs; ++idx) { float float_mean = - static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); + static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); float result = std::min(std::round(float_mean * scale + bias) + output_zero_point, static_cast<float>(std::numeric_limits<T>::max())); result = std::max(result, static_cast<float>(std::numeric_limits<T>::min())); diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h index 2e4fc6274..924e85037 100644 --- a/compute/cker/include/cker/operation/ReduceMean.h +++ b/compute/cker/include/cker/operation/ReduceMean.h @@ -72,9 +72,9 @@ inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = - reducer(output_data[output_offset], input_data[input_offset], normalizer); + reducer(output_data[output_offset], input_data[input_offset], normalizer); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; } @@ -102,7 +102,7 @@ inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape, { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return normalizer; @@ -185,8 +185,8 @@ public: } size_t normalizer = - ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, - temp_index_data(), reducer, 
_temp_sum.data()); + ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, + temp_index_data(), reducer, _temp_sum.data()); if (num_outputs > 0) { float scale = input_scale / output_scale; @@ -231,6 +231,37 @@ void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_sca sum_reducer); } +template <typename In, typename Out> +void MeanAxis1And2(const Shape &input_shape, const In *input_data, const Shape &output_shape, + Out *output_data) +{ + UNUSED_RELEASE(output_shape); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batch = output_shape.Dims(0); + const int output_depth = output_shape.Dims(3); + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + float value = 0; + for (int in_h = 0; in_h < input_height; ++in_h) + { + for (int in_w = 0; in_w < input_width; ++in_w) + { + value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; + } + } + output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height); + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h index 7fc1e9123..8d9a7495f 100644 --- a/compute/cker/include/cker/operation/ResizeBilinear.h +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -62,7 +62,7 @@ inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t // Bottom right corner. output_data[output_offset + output_x_offset + output_y_offset] = - (output + ((x1y0 + x1y1) / 2)) / 2; + (output + ((x1y0 + x1y1) / 2)) / 2; } } @@ -192,8 +192,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei &x1); int32_t input_offset[4] = { - Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), - Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; + Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), (1 - (input_y - y0)) * (input_x - x0), (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)}; @@ -202,8 +202,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei { const T *input_ptr = &input_data[d]; *output_ptr++ = static_cast<T>( - input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + - input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); + input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); } } } @@ -253,16 +253,16 @@ void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); float height_scale = (params.align_corners && params.output_height > 1) - ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) - : (static_cast<float>(input_height) / params.output_height); + ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) + : (static_cast<float>(input_height) / params.output_height); float width_scale = (params.align_corners && params.output_width > 1) - ? 
(static_cast<float>(input_width - 1) / (params.output_width - 1)) - : (static_cast<float>(input_width) / params.output_width); + ? (static_cast<float>(input_width - 1) / (params.output_width - 1)) + : (static_cast<float>(input_width) / params.output_width); ResizeBilinearGenericSmallChannel<uint8_t>( - batches, input_height, input_width, depth, params.output_height, params.output_width, - height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); + batches, input_height, input_width, depth, params.output_height, params.output_width, + height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h index ab2de94cc..644fe0a0e 100644 --- a/compute/cker/include/cker/operation/Select.h +++ b/compute/cker/include/cker/operation/Select.h @@ -34,7 +34,7 @@ void Select(const Shape &input_condition_shape, const D *input_condition_data, const T *input_y_data, const Shape &output_shape, T *output_data) { const int64_t flatsize = - MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); + MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i]; @@ -101,7 +101,7 @@ void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_co const int x_index = SubscriptToIndex(desc_x, b, y, x, c); const int y_index = SubscriptToIndex(desc_y, b, y, x, c); output_data[Offset(extended_output_shape, b, y, x, c)] = - input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; + input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; } } } diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h index a072cff8e..ef97fd5d8 100644 --- a/compute/cker/include/cker/operation/Slice.h +++ b/compute/cker/include/cker/operation/Slice.h @@ -43,16 +43,16 @@ inline void Slice(const SliceParams &op_params, const Shape &input_shape, : start_b + op_params.size[0]; const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) - ? input_shape.Dims(1) - : start_h + op_params.size[size_count - 3]; + ? input_shape.Dims(1) + : start_h + op_params.size[size_count - 3]; const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) - ? input_shape.Dims(2) - : start_w + op_params.size[size_count - 2]; + ? input_shape.Dims(2) + : start_w + op_params.size[size_count - 2]; const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) - ? input_shape.Dims(3) - : start_d + op_params.size[size_count - 1]; + ? 
input_shape.Dims(3) + : start_d + op_params.size[size_count - 1]; for (int in_b = start_b; in_b < stop_b; ++in_b) { diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h index 0e0f364ba..620c1f968 100644 --- a/compute/cker/include/cker/operation/SoftMax.h +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -65,7 +65,7 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const for (int c = 0; c < depth; ++c) { output_data[i * depth + c] = - std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum; + std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum; } } } @@ -163,11 +163,11 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, if (input_diff >= diff_min) { const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); + input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); + FixedPointScaledDiff::FromRaw(input_diff_rescaled); sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); + exp_on_negative_values(scaled_diff_f8)); } } @@ -178,11 +178,11 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, // no later adjustment will be needed. int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; int32_t shifted_sum_minus_one = - static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32_t>(1) << 31)); + static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32_t>(1) << 31)); FixedPoint0 shifted_scale = - one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); + one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); for (int c = 0; c < depth; ++c) { @@ -190,16 +190,16 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, if (input_diff >= diff_min) { const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); + input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); + FixedPointScaledDiff::FromRaw(input_diff_rescaled); FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); output_data[i * depth + c] = static_cast<uint8_t>( - std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); + std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); } else { diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h index feeb358c9..aff36e2f3 100644 --- a/compute/cker/include/cker/operation/SpaceToBatchND.h +++ b/compute/cker/include/cker/operation/SpaceToBatchND.h @@ -79,9 +79,9 @@ inline void SpaceToBatchND(const SpaceToBatchParams ¶ms, const Shape &unexte else { const T *in = - input_data + Offset(input_shape, input_batch, - (out_h * block_shape_height + shift_h) - padding_top, - (out_w * block_shape_width + shift_w) - padding_left, 0); + input_data + Offset(input_shape, input_batch, + 
(out_h * block_shape_height + shift_h) - padding_top, + (out_w * block_shape_width + shift_w) - padding_left, 0); memcpy(out, in, depth * sizeof(T)); } } diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h index d5952ae23..cdd812a08 100644 --- a/compute/cker/include/cker/operation/StatelessRandomUniform.h +++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -95,7 +95,7 @@ inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_da GenerateKey(seed_t, &key, &counter); Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>( - random::PhiloxRandom(counter, key), &output_t); + random::PhiloxRandom(counter, key), &output_t); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h index 1dcdd9b79..42433468a 100644 --- a/compute/cker/include/cker/operation/Tile.h +++ b/compute/cker/include/cker/operation/Tile.h @@ -55,7 +55,7 @@ std::pair<int, int> TileOneDimension(const Shape &in_dimensions, const T *in_dat { int stride_size = 0, tiled_stride_size = 0; std::tie(stride_size, tiled_stride_size) = - TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); + TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); copy_from_data += stride_size; copy_to_data += tiled_stride_size; total_stride_size += stride_size; diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h index 9d8cd340d..62eb432ae 100644 --- a/compute/cker/include/cker/operation/Transpose.h +++ b/compute/cker/include/cker/operation/Transpose.h @@ -555,9 +555,9 @@ void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_inp const int total_size = shrunk_input_shape.FlatSize(); const int non_flatten_size = - Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, + Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, - &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); + &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); assert(non_flatten_params.perm[0] != 0); for (int i = 0; i < total_size; i += non_flatten_size) diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h index 7db3a1179..d41f86047 100644 --- a/compute/cker/include/cker/operation/TransposeConv.h +++ b/compute/cker/include/cker/operation/TransposeConv.h @@ -90,11 +90,11 @@ inline void TransposeConv(const TransposeConvParams ¶ms, const Shape &input_ (out_y < output_height)) { float input_value = - input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y, - filter_x, in_channel)]; + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + float filter_value = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] += - input_value * filter_value; + input_value * filter_value; } } } diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h index 912b01a64..8c1d31b56 100644 --- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h +++ 
b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -130,12 +130,12 @@ inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t i const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -192,9 +192,9 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int16x4_t s1_narrowed = vmovn_s32(s1); const int16x4_t s2_narrowed = vmovn_s32(s2); const int16x8_t s = - vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); - const uint8x8_t clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(s))); + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const uint8x8_t clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -205,12 +205,12 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -387,7 +387,7 @@ inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, auto a2 = vld1q_f32(input2_data + i); auto x = OPERATOR::calculate(a1, a2); // vaddq auto x_clamped = - ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); vst1q_f32(output_data + i, x_clamped); } #endif // USE_NEON @@ -395,7 +395,7 @@ inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, { auto x = OPERATOR::calculate(input1_data[i], input2_data[i]); 
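// (Editorial aside, not part of the patch.) In real-number terms, the uint8 Add path
// above (quant8_sum / AddElementwiseQuant8) computes
//   out_q = clamp(round((s1 * (q1 - z1) + s2 * (q2 - z2)) / s_out) + z_out)
// where the callers conventionally store the negated input zero points in
// input1_offset/input2_offset and the output zero point in output_offset; left_shift
// plus the per-operand multiplier/shift pairs are a fixed-point encoding of
// s1/s_out and s2/s_out. Worked example with hypothetical values:
//   s1 = s2 = 0.5, s_out = 1.0, all zero points 0, q1 = 10, q2 = 6
//   -> real sum 0.5*10 + 0.5*6 = 8.0 -> out_q = 8, finally clamped to
//      [quantized_activation_min, quantized_activation_max].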
output_data[i] = ACTIVATION::applyCeiling( - ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } @@ -441,7 +441,7 @@ inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &par auto a2 = vld1q_f32(input2_data + i); auto x = OPERATOR::calculate(broadcast_value_dup, a2); auto x_clamped = - ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); vst1q_f32(output_data + i, x_clamped); } #endif // USE_NEON @@ -449,13 +449,13 @@ inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &par { auto x = OPERATOR::calculate(broadcast_value, input2_data[i]); output_data[i] = ACTIVATION::applyCeiling( - ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } using BinaryOpImplFloatFuncs = - std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), - void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>; + std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), + void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>; template <class FUNC> inline BinaryOpImplFloatFuncs @@ -514,23 +514,22 @@ inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { + fn = + [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { return static_cast<uint8_t>(quant8_sum(params, a, b)); }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + reference::BroadcastBinaryArithmeticOpSlowQuant8( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } else { BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(AddElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(AddScalarBroadcastQuant8)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, + uint8_t *)>(AddElementwiseQuant8), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, + uint8_t *)>(AddScalarBroadcastQuant8)); } } @@ -542,7 +541,7 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a + b; }; + 
[](const float &a, const float &b) -> float { return a + b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } @@ -550,10 +549,10 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); - BinaryBroadcastFiveFold(params, params.broadcast_category == - BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, - output_data, implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold( + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } } @@ -580,14 +579,14 @@ inline void BroadcastSubDispatch(const BinaryArithmeticOpParam ¶ms, const Sh else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { auto implFuncs = - getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params); + getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params); BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, implFuncs.first, implFuncs.second); } else { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a - b; }; + [](const float &a, const float &b) -> float { return a - b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } @@ -599,11 +598,11 @@ inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t i const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32_t clamped_output = std::min( - params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); return clamped_output; } @@ -652,8 +651,8 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + const auto clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(p))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -663,12 +662,11 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t input1_val = params.input1_offset + input1_data[i]; const int32_t input2_val = params.input2_offset + input2_data[i]; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); - const int32_t clamped_output = - 
std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[i] = static_cast<uint8_t>(clamped_output); } } @@ -711,22 +709,21 @@ inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam ¶ms, if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { + fn = + [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { return static_cast<uint8_t>(quant8_mul(params, a, b)); }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + reference::BroadcastBinaryArithmeticOpSlowQuant8( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(MulElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(MulSimpleBroadcastQuant8)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, + uint8_t *)>(MulElementwiseQuant8), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, + uint8_t *)>(MulSimpleBroadcastQuant8)); } inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -738,16 +735,16 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { // TODO: Use GetBinaryArithmeticFn const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a * b; }; + [](const float &a, const float &b) -> float { return a * b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); - BinaryBroadcastFiveFold(params, params.broadcast_category == - BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, - output_data, implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold( + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -760,7 +757,7 @@ inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); 
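// (Editorial aside, not part of the patch.) On aarch64 the call above reuses the NEON
// elementwise float path, so Div gets the same activation clamping as Add and Mul; on
// other targets the #else branch below falls back to the reference lambda a / b.
// A caller that wants an unclamped division would typically pass
//   params.float_activation_min = std::numeric_limits<float>::lowest();
//   params.float_activation_max = std::numeric_limits<float>::max();
// so that applyFloor/applyCeiling become no-ops.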
#else const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a / b; }; + [](const float &a, const float &b) -> float { return a / b; }; reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); #endif // __aarch64__ @@ -781,7 +778,7 @@ inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Sh else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { auto implFuncs = - getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params); + getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params); BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, implFuncs.first, implFuncs.second); } @@ -789,7 +786,7 @@ inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Sh #endif // __aarch64__ { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a / b; }; + [](const float &a, const float &b) -> float { return a / b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h index 0f620146c..26fc443db 100644 --- a/compute/cker/include/cker/operation/optimized/Conv.h +++ b/compute/cker/include/cker/operation/optimized/Conv.h @@ -48,7 +48,7 @@ struct GemmlowpOutputPipeline typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; + Pipeline; static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset, int32_t output_multiplier, int output_left_shift, int32_t output_activation_min, int32_t output_activation_max) @@ -106,7 +106,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 const int filter_height = filter_shape.Dims(1); const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = - stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; + stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; if (need_dilated_im2col) { assert(im2col_data); @@ -141,7 +141,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 // the other calls commented out. This is a partial rollback of cl/196819423. // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3); const int gemm_input_cols = - gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); + gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); const int filter_rows = filter_shape.Dims(0); // See b/79927784. 
// const int filter_cols = FlatSizeSkipDim(filter_shape, 0); @@ -156,17 +156,17 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 assert(bias_shape.FlatSize() == output_rows); UNUSED_RELEASE(bias_shape); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, filter_rows, filter_cols); + filter_data, filter_rows, filter_cols); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix( - gemm_input_data, gemm_input_rows, gemm_input_cols); + gemm_input_data, gemm_input_rows, gemm_input_cols); gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows, output_cols); const auto &output_pipeline = - GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max); + GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max); gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, - output_pipeline); + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, + output_pipeline); } } // namespace optimized @@ -202,10 +202,10 @@ public: T *output_data, int output_height, int output_width) { const bool is_1x1_kernel = - (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); + (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); const bool is_same_height_width = - (filter_height == input_height && filter_width == input_width && pad_width == 0 && - pad_height == 0); + (filter_height == input_height && filter_width == input_width && pad_width == 0 && + pad_height == 0); if (is_1x1_kernel || is_same_height_width) { // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication. diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h new file mode 100644 index 000000000..d4397933a --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
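A shape aside for the quantized Conv path above (sizes are hypothetical): with a 3x3 filter, 16 input channels, 32 output channels and a 1x56x56 output, im2col yields a 144 x 3136 input matrix (3*3*16 rows, 1*56*56 columns), the filter matrix is 32 x 144, and a single gemmlowp GEMM produces the 32 x 3136 result that the output pipeline (bias add, requantization, clamp, cast) turns into uint8. The same observation is behind the is_1x1_kernel check above: with a 1x1 kernel, unit stride and no padding the input already has that matrix layout, so the im2col step can be skipped entirely.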
+ */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +// Implementation of float DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct FloatDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON + +template <> struct FloatDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); + acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); + acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); + acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + + const float32x2_t filters = vld1_f32(filter_ptr); + const float32x4_t filters_dup2 = vcombine_f32(filters, filters); + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. 
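// (Editorial aside, not part of the patch.) The template parameters encode what each
// specialization may assume: kAllowStrided says whether consecutive output pixels are
// input_ptr_increment floats apart (strided cases) or simply contiguous,
// kFixedInputDepth pins the input depth at compile time (0 means "any depth", taken
// from the runtime argument), and kFixedDepthMultiplier is the per-channel output
// fan-out. A hypothetical invocation of the first specialization above, accumulating
// 16 output pixels of an 8-channel row into a bias-initialized accumulator buffer,
// would look like:
//   FloatDepthwiseConvKernel<false, 8, 1>::Run(/*num_output_pixels=*/16,
//                                              /*input_depth=*/8,
//                                              /*depth_multiplier=*/1, input_row,
//                                              /*input_ptr_increment=*/0,
//                                              filter_row, acc_buffer);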
+ for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + const float32x4_t input = vld1q_f32(input_ptr); + input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filters_dup2); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + const float32x2_t input = vld1_f32(input_ptr); + input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmla_f32(acc, input, filters); + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters + float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3); + local_filter_ptr += 16; + // Load the inputs + float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0); + float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1); + float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2); + float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3); + local_input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + // Multiply-accumulate + acc_0 = vmlaq_f32(acc_0, input_0, filter_0); + acc_1 = vmlaq_f32(acc_1, input_1, filter_1); + acc_2 = vmlaq_f32(acc_2, input_2, filter_2); + acc_3 = vmlaq_f32(acc_3, input_3, filter_3); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. 
+ for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x4_t filter; + filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + float32x4_t input; + input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc; + acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const float input_val = *local_input_ptr++; + const float filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); + acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + input_ptr += input_ptr_increment; + } + } +}; + +// Note this implementation is very slow for input_depths < 8 +// (e.g. comparable to reference implementation) see, specializations for +// input_depth=3 below. +template <> struct FloatDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. 
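// (Editorial aside, not part of the patch.) With depth_multiplier == 2 the two outputs
// of an input channel sit next to each other, so the 8-channel block below duplicates
// every input lane with vzipq_f32(input, input), e.g. {a, b, c, d} becomes
// {a, a, b, b} and {c, c, d, d}, which lines the inputs up against the interleaved
// filter taps before the multiply-accumulate.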
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4x2_t input_dup2[2]; + for (int i = 0; i < 2; i++) + { + const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); + input_dup2[i] = vzipq_f32(input, input); + } + local_input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); + acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); + acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); + acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x2_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1_f32(local_filter_ptr + 2 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float32x4_t input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x2_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); + acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + const float32x4_t filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); + acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. 
+ for (; ic < input_depth; ic++) + { + // Load the inputs + const float input_val = *local_input_ptr++; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; + } + local_filter_ptr += 2; + acc_buffer_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x2_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1_f32(filter_ptr + 2 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x2_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate for each input channel there 2 outputs + acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 6; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 4> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // NOTE: we only want 3 values, so we read it as two ops where + // the second op just duplicates the lane + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x4_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate all outputs. + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 12; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5); + float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6); + float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5); + float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6); + float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val); + acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val); + acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5); + vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6); + vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7); + acc_buffer_ptr += 32; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 20> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + + // Handle one output pixel at a 
time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 16> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + for (int ic = 0; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x2_t filter = vld1_f32(filter_ptr); + float32x4_t filter_x4 = vcombine_f32(filter, filter); + int outp = 0; + + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x2_t input_1 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x2_t input_2 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x4_t input = vcombine_f32(input_1, input_2); + + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter_x4); + + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x2_t input = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmla_f32(acc, input, filter); + + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x4_t filter = vld1q_f32(filter_ptr); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input = vld1q_f32(input_ptr); + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + input_ptr += input_ptr_increment; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width, + const float *input_data, int pad_width, int depth_multiplier, + int filter_width, const float *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, float *acc_buffer) +{ + // Sanity check parameters. 
This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const float *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const float *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment, + filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. 
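Both the templated routine above and the generic fallback that follows clamp the output-x loop to the positions whose corresponding input column lies inside the unpadded input row. A small worked example of that bound computation, using illustrative values that are not taken from this commit:

// Illustrative values: stride = 2, dilation_factor = 1, pad_width = 1,
// input_width = 5, filter_x = 0 (so in_x = out_x * 2 - 1).
//   out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2 = (1 - 0 + 1) / 2 = 1
//     (out_x = 0 would read in_x = -1, i.e. left padding)
//   out_x_loop_end_unclamped   = (pad_width + input_width - dilation_factor * filter_x + 1) / 2 = (1 + 5 - 0 + 1) / 2 = 3
//     (out_x = 3 would read in_x = 5, i.e. past the end of the row)
// The kernel therefore runs only for out_x in [1, 3), reading in_x = 1 and 3;
// out_x = 0 gets no contribution from this filter tap, which is exactly the
// zero-padding behaviour.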
+inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+                                              int input_width, const float *input_data,
+                                              int pad_width, int depth_multiplier, int filter_width,
+                                              const float *filter_data, int out_x_buffer_start,
+                                              int out_x_buffer_end, int output_depth,
+                                              float *acc_buffer)
+{
+  const float *filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+  {
+    const int out_x_loop_start =
+      std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+    const int out_x_loop_end =
+      std::min(out_x_buffer_end,
+               (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+    float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const float *input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+    {
+      const float *filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic)
+      {
+        const float input_val = *input_ptr++;
+        for (int m = 0; m < depth_multiplier; m++)
+        {
+          const float filter_val = *filter_ptr++;
+          *acc_buffer_ptr++ += filter_val * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const float *bias_data, float *acc_buffer)
+{
+  // TODO(benoitjacob): This might need optimized specializations
+  // for small output_depth values, if that ever becomes an important
+  // case (like it was for some quantized DepthwiseConv cases).
+  for (int i = 0; i < num_output_pixels; i++)
+  {
+    memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+// DepthwiseConv can run with multiple threads on the dimension specified by thread_dim.
+// Each thread processes output elements along that dimension in the half-open
+// range [thread_start, thread_end).
+// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, the
+// thread calculates DepthwiseConv for output rows 2 through 5, i.e.
+// output_data[:, 2:6, :, :].
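A minimal sketch of how a caller might split output rows into such [thread_start, thread_end) ranges for thread_dim = 1; the helper name and the even-split policy are illustrative assumptions, not part of this commit:

#include <algorithm>
#include <utility>
#include <vector>

// Hypothetical helper: divide output_height rows into half-open ranges,
// one per worker, each later passed to DepthwiseConvImpl as
// (thread_start, thread_end) with thread_dim = 1.
inline std::vector<std::pair<int, int>> SplitOutputRows(int output_height, int num_workers)
{
  std::vector<std::pair<int, int>> ranges;
  const int rows_per_worker = (output_height + num_workers - 1) / num_workers; // ceiling division
  for (int start = 0; start < output_height; start += rows_per_worker)
  {
    ranges.emplace_back(start, std::min(output_height, start + rows_per_worker));
  }
  return ranges;
}

With output_height = 7 and num_workers = 2 this yields [0, 4) and [4, 7), so the two calls together cover every output row exactly once.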
+inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &filter_shape, + const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data, + int thread_start, int thread_end, int thread_dim) +{ + UNUSED_RELEASE(bias_shape); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(thread_dim == 0 || thread_dim == 1); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + static const int kAccBufferMaxSize = 4832; + float acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + + UNUSED_RELEASE(kAccBufferActualSize); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric); + row_accum_func_t row_accum_func = nullptr; + +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \ + if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ + (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ + { \ + row_accum_func = \ + FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + // We go over our list of kernels by decreasing order of preference + // for the cases where multiple kernels could apply. + + // Start with the fastest kernels: AllowStrided=false, fixed input depth. + + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + + // Next come the strided kernels: AllowStrided=true, fixed input depth. + // They are a bit less efficient, but allow stride!=1. 
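  // For example, a layer with stride_width = 2, input_depth = 3 and depth_multiplier = 2
  // fails the two non-strided entries above (stride != 1) and is matched by the
  // (true, 3, 2) entry below, so row_accum_func ends up pointing at
  // FloatDepthwiseConvAccumRow<true, 3, 2>.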
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) + +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = FloatDepthwiseConvAccumRowGeneric; + } + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + float *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. 
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating. Now store to destination. + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +// TODO(benoitjacob) optimized code goes here +#ifdef USE_NEON + // Handle 16 values at a time + for (; i <= num_output_values - 16; i += 16) + { + float32x4_t acc[4]; + for (int k = 0; k < 4; k++) + { + acc[k] = vld1q_f32(acc_buffer + i + 4 * k); + } + for (int k = 0; k < 4; k++) + { + acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } + for (int k = 0; k < 4; k++) + { + vst1q_f32(output_ptr + 4 * k, acc[k]); + } + output_ptr += 16; + } + // Handle 4 values at a time + for (; i <= num_output_values - 4; i += 4) + { + float32x4_t acc = vld1q_f32(acc_buffer + i); + + acc = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc)); + + vst1q_f32(output_ptr, acc); + output_ptr += 4; + } +#endif + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) + { + float acc = acc_buffer[i]; + acc = std::max(output_activation_min, std::min(output_activation_max, acc)); + + *output_ptr++ = acc; + } + } + } + output_ptr += batch_step; + } +} + +} // nnfw +} // cker +} // optimized + +#endif diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h index d383b126d..5ca56fd09 100644 --- a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h @@ -32,6 +32,8 @@ namespace cker { namespace optimized { +namespace depthwise_conv +{ // Implementation of quantized DepthwiseConv @@ -44,8 +46,8 @@ struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -57,7 +59,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { filter[i] = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); } // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) @@ -80,9 +82,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { acc[0].val[i] = - vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); acc[1].val[i] = - vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -98,8 +100,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -174,8 +176,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -206,9 +208,9 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> for (int i = 0; i < 2; i++) { acc[2 * i + 0] = - vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -253,8 +255,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -338,8 +340,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -409,8 +411,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> 
template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -534,8 +536,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -600,8 +602,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -703,8 +705,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -778,8 +780,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -864,8 +866,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -873,7 +875,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // We will do that by register-level table-look-up using VTBL instructions. 
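    // Read together, the three 8-byte index vectors below are
    // 0,0,0,1,1,1,2,2 | 2,3,3,3,4,4,4,5 | 5,5,6,6,6,7,7,7, i.e. each of the
    // 8 input channels repeated three times, so one table lookup per index
    // vector lines the duplicated inputs up with the three filter values per
    // channel (depth_multiplier = 3).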
// Here we prepare the registers containing the table-lookup indices. static const uint8_t dup3_indices_array[3][8] = { - {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; uint8x8_t dup3_indices[3]; for (int i = 0; i < 3; i++) { @@ -928,9 +930,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> for (int j = 0; j < 3; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -944,10 +946,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 3; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 3; @@ -960,8 +962,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1002,9 +1004,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (int j = 0; j < 2; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); } // Store the accumulators back to acc_buffer. for (int i = 0; i < 2; i++) @@ -1018,10 +1020,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (; ic < input_depth; ic++) { // Load the inputs. 
- const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 2; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 2; @@ -1034,8 +1036,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1112,8 +1114,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; - const uint16_t filter_val = *local_filter_ptr++ + filter_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++ + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } input_ptr += input_ptr_increment; @@ -1124,8 +1126,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1174,7 +1176,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -1189,8 +1191,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1228,8 +1230,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, 
int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1253,7 +1255,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[4]; for (int i = 0; i < 4; i++) @@ -1279,8 +1281,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1302,7 +1304,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1338,8 +1340,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1363,7 +1365,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1390,21 +1392,21 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; // Load the filters, add filter_offset. const uint8x8_t filter_u8 = vld1_u8(filter_ptr); const int16x8_t filter = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[2]; for (int i = 0; i < 2; i++) @@ -1427,8 +1429,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1455,7 +1457,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1); input_ptr += input_ptr_increment; const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); // Multiply-accumulate. @@ -1490,8 +1492,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1555,8 +1557,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1652,9 +1654,9 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d else { out_x_loop_start_unclampled = - (pad_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width - dilation_factor * filter_x + stride - 1) / stride; out_x_loop_end_unclampled = - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; } } else @@ -1672,8 +1674,8 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d const uint8_t *input_ptr = input_data + in_x_origin * input_depth; const int num_output_pixels = out_x_loop_end - out_x_loop_start; QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( - num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, - input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); + num_output_pixels, input_depth, depth_multiplier, 
input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); filter_base_ptr += output_depth; } } @@ -1690,11 +1692,11 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_facto const uint8_t *filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; @@ -1813,7 +1815,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, - uint8_t *output_data) + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) { (void)bias_shape; const int stride_width = params.stride_width; @@ -1852,6 +1855,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); assert(kAccBufferActualSize <= kAccBufferMaxSize); assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + UNUSED_RELEASE(kAccBufferActualSize); // row_accum_func will point to the core accumulation function to be used @@ -1865,7 +1870,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ { \ row_accum_func = \ - QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ } #ifdef USE_NEON @@ -1919,22 +1924,49 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); // Now that we have determined row_accum_func, we can start work. 
- uint8_t *output_ptr = output_data; - for (int b = 0; b < batches; ++b) + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + uint8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) { - for (int out_y = 0; out_y < output_height; ++out_y) + for (int out_y = row_start; out_y < row_end; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; const int filter_y_start = - std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); const int filter_y_end = - std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / - dilation_height_factor); + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = - std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); // We call a 'pixel' a group of activation that share all but the // 'depth'/'channel' coordinate. num_output_pixels is the number of // output pixels that we will accumulate in this loop iteration. @@ -1952,7 +1984,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape filter_data + filter_y * filter_height_stride, filter_offset, out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); } - // Finished accumulating int32 values. Now need to convert them to + // Finished accumulating int32_t values. Now need to convert them to // the final 8bit form and store them. 
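The accumulators being converted here hold sums of (input + input_offset) * (filter + filter_offset) products, which is presumably why this commit makes the offsets signed int16_t: they are the negated zero points and are usually negative (for example -128), a value an unsigned uint16_t parameter cannot carry into the scalar paths without wrapping. The conversion to uint8 itself is not shown in this hunk; purely for orientation, a scalar sketch of the usual rescale, offset and clamp step follows, where output_multiplier, output_shift, output_offset and the activation bounds are the conventional DepthwiseConvParams fields assumed here rather than taken from this diff:

#include <algorithm>
#include <cstdint>

// Scalar sketch only; the real code in this file uses NEON and the
// saturating rounding-doubling-high-multiply helpers.
inline uint8_t RequantizeOne(int32_t acc, int32_t output_multiplier, int output_shift,
                             int32_t output_offset, int32_t activation_min, int32_t activation_max)
{
  // Rescale by output_multiplier * 2^(output_shift - 31), rounding to nearest.
  const int total_shift = 31 - output_shift; // assumed > 0 for this sketch
  int64_t scaled = static_cast<int64_t>(acc) * output_multiplier;
  scaled += int64_t{1} << (total_shift - 1);
  int32_t result = static_cast<int32_t>(scaled >> total_shift);
  result += output_offset; // re-centre on the output zero point
  result = std::max(activation_min, std::min(activation_max, result));
  return static_cast<uint8_t>(result);
}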
const int num_output_values = output_depth * num_output_pixels; int i = 0; @@ -2113,9 +2145,111 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape } } } + output_ptr += batch_step; } } +} // namespace depthwise_conv + +// template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(output_activation_min <= output_activation_max); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(output_activation_min); + UNUSED_RELEASE(output_activation_max); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +// TODO Use below codes +// // Dispatch to dot-product 3x3 kernels when supported. +// +// ruy::Context *ruy_context = cpu_backend_context->ruy_context(); +// const bool has_dot_product_instructions = +// ruy_context != nullptr && +// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone; +// if (has_dot_product_instructions) +// { +// using optimized_ops::depthwise_conv::DotProduct3x3KernelType; +// DotProduct3x3KernelType kernel_type = +// optimized_ops::depthwise_conv::CategorizeDotProductKernel( +// input_shape, filter_shape, params); +// if (kernel_type != DotProduct3x3KernelType::kNone) +// { +// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3< +// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data, +// filter_shape, filter_data, +// bias_shape, +// bias_data, output_shape, +// output_data); +// return; +// } +// } +// +// // Dispatch to non-dot-product 3x3 kernels when supported. +// +// const int stride_width = params.stride_width; +// const int stride_height = params.stride_height; +// const int pad_width = params.padding_values.width; +// const int pad_height = params.padding_values.height; +// const int output_shift = params.output_shift; +// +// // Call kernel optimized for depthwise convolutions using 3x3 filters if +// // parameters are supported. 
+// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width, +// stride_height, dilation_width_factor, +// dilation_height_factor, pad_width, pad_height, +// depth_multiplier, output_shape, output_shift)) +// { +// depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>( +// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, +// output_shape, output_data, thread_start, thread_end, thread_dim); +// return; +// } +#endif + + depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, + thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, + thread_end, thread_dim); +} + } // namespace optimized } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h index ae1f9e78e..f5edc94ab 100644 --- a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h +++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h @@ -111,7 +111,7 @@ inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, { const int bottom_row_elements = (bottom_padding * kwidth * in_depth); const int bottom_start = - output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); } } @@ -159,7 +159,7 @@ void DilatedIm2col(const ConvParams ¶ms, const Shape &input_shape, const T * for (int batch = 0; batch < batches; ++batch) { const T zero_byte = - zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]); + zero_bytes_len > 1 ? 
static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]); for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h index e8ffd4014..1b3020de2 100644 --- a/compute/cker/include/cker/operation/reference/BatchMatMul.h +++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h @@ -87,9 +87,8 @@ inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Sha { const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; - float *out_ptr = - output_data + - ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; + float *out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * + lhs_rows * rhs_cols; for (int j = 0; j < rhs_cols; ++j) { for (int i = 0; i < lhs_rows; ++i) diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h index f7e39248c..93cb21e0b 100644 --- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -56,17 +56,16 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < size; i++) { - output_data[i] = - ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), - params.float_activation_min, params.float_activation_max); + output_data[i] = ActivationFunctionWithMinMax( + fn(input1_data[i], input2_data[i]), params.float_activation_min, params.float_activation_max); } } template <typename T> inline void BroadcastBinaryArithmeticOpSlowQuant8( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, - const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, - const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, + const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -100,10 +99,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax<uint8_t>( - fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + ActivationFunctionWithMinMax<uint8_t>( + fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -143,9 +142,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + 
fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -154,9 +153,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m template <> inline void BroadcastBinaryArithmeticOpSlow( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, - const Shape &input2_shape, const float *input2_data, const Shape &output_shape, - float *output_data, const std::function<float(const float &, const float &)> &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, + const Shape &input2_shape, const float *input2_data, const Shape &output_shape, + float *output_data, const std::function<float(const float &, const float &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -171,10 +170,10 @@ inline void BroadcastBinaryArithmeticOpSlow( { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.float_activation_min, params.float_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax(fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.float_activation_min, params.float_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h index 86e8b5143..43a5bf256 100644 --- a/compute/cker/include/cker/operation/reference/Conv.h +++ b/compute/cker/include/cker/operation/reference/Conv.h @@ -98,8 +98,8 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const float bias_value = bias_data[out_channel]; } output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - ActivationFunctionWithMinMax(total + bias_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(total + bias_value, output_activation_min, + output_activation_max); } } } @@ -183,7 +183,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - static_cast<uint8_t>(acc); + static_cast<uint8_t>(acc); } } } diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 7b4ff2040..62eeaf6bd 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -52,7 +52,7 @@ void MakeRuyMatrix(const MatrixParams<Scalar> ¶ms, DataPointer data_ptr, ruy::Matrix<Scalar> *dst, bool use_caching = false) { ruy::Order ruy_order = - params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor; + params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor; ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. // It does care whether we assign to it a Scalar* or a const Scalar*. 
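Both RuySupport.h variants touched by this patch — the cker copy reflowed just above and the compute/ruy copy added below — expose the same MakeRuyMatrix/MakeRuyMulParams helpers around ::ruy::Mul. A minimal float GEMM sketch of that call pattern follows; the wrapper name GemmFloat and all dimensions are illustrative assumptions, not part of the patch.

// Hypothetical sketch, not part of the patch: float GEMM through the
// nnfw::ruy ruy_support helpers. Matrix contents and sizes are assumed.
#include "ruy/Types.h"
#include "ruy/RuySupport.h"
#include <ruy/ruy.h>
#include <ruy/context.h>

void GemmFloat(const float *lhs, const float *rhs, const float *bias, float *dst, int m, int n,
               int k, ::ruy::Context *ruy_context)
{
  using namespace nnfw::ruy;

  // Describe lhs (m x k, row-major), rhs (k x n, col-major) and dst (m x n, col-major).
  MatrixParams<float> lhs_params;
  lhs_params.order = Order::kRowMajor;
  lhs_params.rows = m;
  lhs_params.cols = k;
  MatrixParams<float> rhs_params;
  rhs_params.order = Order::kColMajor;
  rhs_params.rows = k;
  rhs_params.cols = n;
  MatrixParams<float> dst_params;
  dst_params.order = Order::kColMajor;
  dst_params.rows = m;
  dst_params.cols = n;

  // Bias and clamp bounds travel in GemmParams; the defaults leave the output unclamped.
  GemmParams<float, float> gemm_params;
  gemm_params.bias = bias; // may be nullptr

  ::ruy::Matrix<float> ruy_lhs;
  ::ruy::Matrix<float> ruy_rhs;
  ::ruy::Matrix<float> ruy_dst;
  ruy_support::MakeRuyMatrix(lhs_params, lhs, &ruy_lhs);
  ruy_support::MakeRuyMatrix(rhs_params, rhs, &ruy_rhs);
  ruy_support::MakeRuyMatrix(dst_params, dst, &ruy_dst);

  ::ruy::BasicSpec<float, float> ruy_mul_params;
  ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);

  ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
}

The kernels added in this patch pass use_caching = true for the constant weight and input operands so ruy can reuse packed data; the sketch keeps the default, which never caches.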
diff --git a/compute/ruy/CMakeLists.txt b/compute/ruy/CMakeLists.txt new file mode 100644 index 000000000..d98ee1cd6 --- /dev/null +++ b/compute/ruy/CMakeLists.txt @@ -0,0 +1,11 @@ +nnfw_find_package(Ruy REQUIRED) + +add_library(nnfw_lib_ruy INTERFACE) +target_link_libraries(nnfw_lib_ruy INTERFACE ruy) +target_link_libraries(nnfw_lib_ruy INTERFACE ruy_instrumentation) +target_compile_definitions(nnfw_lib_ruy INTERFACE USE_RUY_GEMV) +if(PROFILE_RUY) + target_link_libraries(nnfw_lib_ruy INTERFACE ruy_profiler) +endif(PROFILE_RUY) + +target_include_directories(nnfw_lib_ruy INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) diff --git a/compute/ruy/include/ruy/NeonTensorUtils.h b/compute/ruy/include/ruy/NeonTensorUtils.h new file mode 100644 index 000000000..fb8b0a363 --- /dev/null +++ b/compute/ruy/include/ruy/NeonTensorUtils.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_NEON_TENSOR_UTILS_H__ +#define __NNFW_RUY_NEON_TENSOR_UTILS_H__ + +#include "ruy/neon/neon_check.h" + +#ifdef USE_NEON + +#define kFloatWeightsPerNeonLane 4 + +namespace nnfw +{ +namespace ruy +{ + +inline bool NeonIsZeroVector(const float *vector, int v_size) +{ + // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot + // use the main vectorized loop, and we need to process sequentially. + // postamble_start shows the start index where this should happen. + const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + const float32x4_t zero_x4_float = vmovq_n_f32(0.0f); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) + { + const float32x4_t i_x4_float = vld1q_f32(vector + v); + uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float); + if (vgetq_lane_u32(cmp_result, 0) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 1) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 2) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 3) == 0) + return false; + } + + // Postamble loop + for (int v = postamble_start; v < v_size; ++v) + { + if (vector[v] != 0.0) + return false; + } + return true; +} + +} // namespace ruy +} // namespace nnfw + +#endif // USE_NEON + +#endif // __NNFW_RUY_NEON_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/PortableTensorUtils.h b/compute/ruy/include/ruy/PortableTensorUtils.h new file mode 100644 index 000000000..2d2c36cb2 --- /dev/null +++ b/compute/ruy/include/ruy/PortableTensorUtils.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ +#define __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ + +namespace nnfw +{ +namespace ruy +{ + +inline bool PortableIsZeroVector(const float *vector, int v_size) +{ + for (int i = 0; i < v_size; ++i) + { + if (*vector++ != 0.0f) + return false; + } + return true; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/RuySupport.h b/compute/ruy/include/ruy/RuySupport.h new file mode 100644 index 000000000..7086a96c4 --- /dev/null +++ b/compute/ruy/include/ruy/RuySupport.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_RUY_SUPPORT_H__ +#define __NNFW_RUY_RUY_SUPPORT_H__ + +#include <util/ConfigSource.h> +#include <ruy/matrix.h> +#include <ruy/ruy.h> +#include <cassert> +#include "Types.h" + +namespace nnfw +{ +namespace ruy +{ +namespace ruy_support +{ + +inline ::ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy) +{ + switch (cache_policy) + { + case CachePolicy::kNeverCache: + return ::ruy::CachePolicy::kNeverCache; + case CachePolicy::kCacheIfLargeSpeedup: + return ::ruy::CachePolicy::kCacheIfLargeSpeedup; + case CachePolicy::kAlwaysCache: + return ::ruy::CachePolicy::kAlwaysCache; + default: + assert(false); + return ::ruy::CachePolicy::kNeverCache; + } +} + +template <typename Scalar, typename DataPointer> +void MakeRuyMatrix(const MatrixParams<Scalar> ¶ms, DataPointer data_ptr, + ::ruy::Matrix<Scalar> *dst, bool use_caching = false) +{ + ::ruy::Order ruy_order = + params.order == Order::kColMajor ? ::ruy::Order::kColMajor : ::ruy::Order::kRowMajor; + ::ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); + // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. + // It does care whether we assign to it a Scalar* or a const Scalar*. + dst->set_data(data_ptr); + dst->set_zero_point(params.zero_point); + if (use_caching) + { + dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy)); + } +} + +template <typename GemmParamsType, typename RuySpecType> +void MakeRuyMulParams(const GemmParamsType ¶ms, RuySpecType *ruy_mul_params) +{ + // This validation has already been performed by the Gemm API entry point, + // but it doesn't hurt to test specifically this again here, where it's + // being used. 
+ ValidateGemmParams(params); + + ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint); + ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent); + ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel); + ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel); + ruy_mul_params->set_bias(params.bias); + ruy_mul_params->set_clamp_min(params.clamp_min); + ruy_mul_params->set_clamp_max(params.clamp_max); +} + +} // namespace ruy_support +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_RUY_SUPPORT_H__ diff --git a/compute/ruy/include/ruy/Shape.h b/compute/ruy/include/ruy/Shape.h new file mode 100644 index 000000000..981c5b4de --- /dev/null +++ b/compute/ruy/include/ruy/Shape.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_SHAPE_H__ +#define __NNFW_RUY_SHAPE_H__ + +#include <algorithm> +#include <cstring> +#include <cassert> +#include <vector> + +#define UNUSED_RELEASE(a) (void)(a) + +namespace nnfw +{ +namespace ruy +{ + +class Shape +{ +public: + // Shapes with dimensions up to 5 are stored directly in the structure, while + // larger shapes are separately allocated. + static constexpr int kMaxSmallSize = 5; + + Shape &operator=(Shape const &) = delete; + + Shape() : _size(0) {} + + explicit Shape(int dimensions_count) : _size(dimensions_count) + { + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + Shape(int shape_size, int32_t value) : _size(0) + { + Resize(shape_size); + for (int i = 0; i < shape_size; ++i) + { + SetDim(i, value); + } + } + + Shape(int dimensions_count, const int32_t *dims_data) : _size(0) + { + ReplaceWith(dimensions_count, dims_data); + } + + Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); } + + // Avoid using this constructor. We should be able to delete it when C++17 + // rolls out. + Shape(Shape const &other) : _size(other.DimensionsCount()) + { + if (_size > kMaxSmallSize) + { + _dims_pointer = new int32_t[_size]; + } + std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size); + } + + bool operator==(const Shape &comp) const + { + return this->_size == comp._size && + std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0; + } + + ~Shape() + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + } + + inline int32_t DimensionsCount() const { return _size; } + inline int32_t Dims(int i) const + { + assert(i >= 0); + assert(i < _size); + return _size > kMaxSmallSize ? 
_dims_pointer[i] : _dims[i]; + } + inline void SetDim(int i, int32_t val) + { + assert(i >= 0); + assert(i < _size); + if (_size > kMaxSmallSize) + { + _dims_pointer[i] = val; + } + else + { + _dims[i] = val; + } + } + + inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + // The caller must ensure that the shape is no bigger than 4-D. + inline const int32_t *DimsDataUpTo4D() const { return _dims; } + + inline void Resize(int dimensions_count) + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + _size = dimensions_count; + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + inline void ReplaceWith(int dimensions_count, const int32_t *dims_data) + { + Resize(dimensions_count); + int32_t *dst_dims = DimsData(); + std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); + } + + inline void ReplaceWith(const Shape &other) + { + ReplaceWith(other.DimensionsCount(), other.DimsData()); + } + + inline void ReplaceWith(Shape &&other) + { + Resize(0); + std::swap(_size, other._size); + if (_size <= kMaxSmallSize) + std::copy(other._dims, other._dims + kMaxSmallSize, _dims); + else + _dims_pointer = other._dims_pointer; + } + + template <typename T> inline void BuildFrom(const T &src_iterable) + { + const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); + Resize(dimensions_count); + int32_t *data = DimsData(); + for (auto it : src_iterable) + { + *data = it; + ++data; + } + } + + // This will probably be factored out. Old code made substantial use of 4-D + // shapes, and so this function is used to extend smaller shapes. Note that + // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be + // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their + // inputs should already be 4-D, so this function should not be needed. + inline static Shape ExtendedShape(int new_shape_size, const Shape &shape) + { + return Shape(new_shape_size, shape, 1); + } + + inline void BuildFrom(const std::initializer_list<int> init_list) + { + BuildFrom<const std::initializer_list<int>>(init_list); + } + + // Returns the total count of elements, that is the size when flattened into a + // vector. + inline int FlatSize() const + { + int buffer_size = 1; + const int *dims_data = DimsData(); + for (int i = 0; i < _size; i++) + { + const int dim = dims_data[i]; + assert(dim >= 1); + buffer_size *= dim; + } + return buffer_size; + } + + bool operator!=(const Shape &comp) const { return !((*this) == comp); } + +private: + // For use only by ExtendedShape(), written to guarantee (return-value) copy + // elision in C++17. + // This creates a shape padded to the desired size with the specified value. 
+ Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0) + { + assert(new_shape_size >= shape.DimensionsCount()); + assert(new_shape_size <= kMaxSmallSize); + Resize(new_shape_size); + const int size_increase = new_shape_size - shape.DimensionsCount(); + for (int i = 0; i < size_increase; ++i) + { + SetDim(i, pad_value); + } + std::memcpy(DimsData() + size_increase, shape.DimsData(), + sizeof(int32_t) * shape.DimensionsCount()); + } + + int32_t _size; + union { + int32_t _dims[kMaxSmallSize]; + int32_t *_dims_pointer{nullptr}; + }; +}; + +inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2) +{ + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + assert(shape1.Dims(index1) == shape2.Dims(index2)); + return shape1.Dims(index1); +} + +template <typename... Args> +int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2, Args... args) +{ + assert(shape1.Dims(index1) == shape2.Dims(index2)); + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + return MatchingDim(shape1, index1, args...); +} + +inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); } + +inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3) +{ + assert(shape.DimensionsCount() == 4); + const int *dims_data = shape.DimsDataUpTo4D(); + assert(i0 >= 0 && i0 < dims_data[0]); + assert(i1 >= 0 && i1 < dims_data[1]); + assert(i2 >= 0 && i2 < dims_data[2]); + assert(i3 >= 0 && i3 < dims_data[3]); + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +inline int Offset(const Shape &shape, int *index) +{ + return Offset(shape, index[0], index[1], index[2], index[3]); +} + +inline int FlatSizeSkipDim(const Shape &shape, int skip_dim) +{ + const int dims_count = shape.DimensionsCount(); + assert(skip_dim >= 0 && skip_dim < dims_count); + const auto *dims_data = shape.DimsData(); + int flat_size = 1; + for (int i = 0; i < dims_count; ++i) + { + flat_size *= (i == skip_dim) ? 1 : dims_data[i]; + } + return flat_size; +} + +// Flat size calculation, checking that dimensions match with one or more other +// arrays. +template <typename... Ts> inline bool checkMatching(const Shape &shape, Ts... check_shapes) +{ + const Shape check_shapes_array[sizeof...(Ts)] = {std::forward<Ts>(check_shapes)...}; + for (const auto &check_shape : check_shapes_array) + { + // Check matching of shapes except the case of that two shapes can be scalar + if (shape.DimensionsCount() > 1 || check_shape.DimensionsCount() > 1 || shape.FlatSize() != 1 || + check_shape.FlatSize() != 1) + { + if (shape.DimensionsCount() != check_shape.DimensionsCount()) + { + return false; + } + for (int i = 0; i < shape.DimensionsCount(); ++i) + { + if (shape.Dims(i) != check_shape.Dims(i)) + { + return false; + } + } + } + } + return true; +} + +struct UNUSED_ALL +{ + template <typename... Args> UNUSED_ALL(Args const &...) {} +}; +template <typename... Ts> inline int MatchingFlatSize(const Shape &shape, Ts... 
check_shapes) +{ + UNUSED_ALL{check_shapes...}; + assert(checkMatching(shape, std::forward<Ts>(check_shapes)...)); + return shape.FlatSize(); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return FlatSizeSkipDim(shape, skip_dim); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1); +} + +inline int MatchingElementsSize(const Shape &shape, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + const int size_1 = shape.FlatSize(); + const int size_2 = check_shape_0.FlatSize(); + const int size_3 = check_shape_1.FlatSize(); + assert(size_1 == size_2); + assert(size_2 == size_3); + UNUSED_RELEASE(size_2); + UNUSED_RELEASE(size_3); + return size_1; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_SHAPE_H__ diff --git a/compute/ruy/include/ruy/TensorUtils.h b/compute/ruy/include/ruy/TensorUtils.h new file mode 100644 index 000000000..149037cc9 --- /dev/null +++ b/compute/ruy/include/ruy/TensorUtils.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_TENSOR_UTILS_H__ +#define __NNFW_RUY_TENSOR_UTILS_H__ + +#include "ruy/PortableTensorUtils.h" +#include "ruy/NeonTensorUtils.h" + +namespace nnfw +{ +namespace ruy +{ + +inline bool IsZeroVector(const float *vector, int v_size) +{ + return NEON_OR_PORTABLE(IsZeroVector, vector, v_size); +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/Types.h b/compute/ruy/include/ruy/Types.h new file mode 100644 index 000000000..b19b59735 --- /dev/null +++ b/compute/ruy/include/ruy/Types.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_TYPES_H__ +#define __NNFW_RUY_TYPES_H__ + +#include <cassert> +#include <cstdint> +#include <type_traits> +#include <limits> +#include <string> +#include "Shape.h" + +namespace nnfw +{ +namespace ruy +{ + +enum class FusedActivationFunctionType +{ + kNone = 0, + kRelu6 = 1, + kRelu1 = 2, + kRelu = 3, + kTanh = 4, + kSigmoid = 6, +}; + +enum class PaddingType +{ + kNone = 0, + kSame = 1, + kValid = 2, +}; + +struct PaddingValues +{ + int16_t width; + int16_t height; +}; + +struct ConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + bool is_replaced_weights{false}; +}; + +struct FullyConnectedParams +{ + FusedActivationFunctionType activation{FusedActivationFunctionType::kNone}; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + float weights_scale; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params - no one use this params, but ruy might use them later. + float float_activation_min; + float float_activation_max; + // Mark the operands as cacheable if they are unchanging, e.g. weights. + bool lhs_cacheable; + bool rhs_cacheable; + // FullyConnectedWeightsFormat weights_format; +}; + +enum class Order +{ + kColMajor, + kRowMajor +}; + +enum class CachePolicy : std::uint8_t +{ + kNeverCache, + kCacheIfLargeSpeedup, + kAlwaysCache, +}; + +// MatrixParams encapsulates the parameters that Gemm needs about each +// matrix, besides the buffer data pointer. +// Compare to ruy::Matrix, which also encapsulates the data pointer. +// Rationale for leaving the data pointer out of here: doing so +// requires complicated const-correctness mechanics. See +// ruy::ConstCheckingPtr. +template <typename Scalar> struct MatrixParams +{ + // Storage layout order. For now we only do plain linear non-strided + // layout. It would be easy to support a stride if needed. + Order order = Order::kColMajor; + // Number of rows of the matrix. + int rows = 0; + // Number of columns of the matrix. + int cols = 0; + // The zero_point, i.e. which Scalar value is to be interpreted as zero. + // When Scalar is floating-point, this must be 0. + Scalar zero_point = 0; + // When the data pointed to by this matrix is constant data, so that it is + // valid to assume that equality of pointers implies equality of data, + // a CachePolicy may be used instead of the default kNeverCache, + // which will enable ruy to take advantage of this constancy of the data to + // cache the packing work, which can be a large speedup in matrix*vector + // and other narrow shapes. 
+ CachePolicy cache_policy = CachePolicy::kNeverCache; +}; + +// Enumeration of broad categories of Gemm. +// +// The primary reason for this to exist is to allow Gemm to compile +// only uniform-quantized or only per-channel-quantized code paths. +// This is unneeded with ruy as the back-end, as this is only a runtime +// difference in ruy, but with gemmlowp these really are separate code +// paths and templatizing in a QuantizationFlavor is necessary to avoid +// compiling unused gemmlowp code. Indeed, TFLite currently uses +// uint8 with uniform quantization and int8 with per-channel quantization, +// and does not use uint8 with per-channel. We want to avoid compiling +// the gemmlowp uint8 per-channel path when gemmlowp is the back-end. +// +// It's possible to drop this in the future if gemmlowp goes away and no +// other then-relevant backend library handles quantized paths in a way that +// requires knowing this at compile-time. +enum class QuantizationFlavor +{ + // Floating-point Gemm: the accumulators are not multiplied by any + // 'multiplier'. + kFloatingPoint, + // Quantized Gemm using a single multiplier for all accumulators. + kIntegerWithUniformMultiplier, + // Quantized Gemm using a separate multipliers for accumulators of each + // row of the destination matrix. This is what is called 'per-channel' + // in GemmParams. Here we use the more specific 'per-row' terminology + // to allow for the possibility of 'per-column' in the future, and to + // allow for that to be a separate code path in some back-end such as + // gemmlowp. + kIntegerWithPerRowMultiplier +}; + +// Additional parameters that Gemm needs, beyond what falls into +// the MatrixParams that it takes. Compare to ruy::Spec. +// +// Decoupling AccumScalar from DstScalar (rather than deducing it from that) +// is useful future-proofing. Think of a float16 path using float32 accum. +// +// QuantizationFlavor is passed here even though it's technically not used +// in this class. This is so that we retain the ability in the future to +// specialize this class for quantization flavor, and this allows for +// Gemm to be templatized in quantization_flavor via the GemmParams that it +// takes, allowing for automatic template parameter deduction to take place, +// so that most call sites don't need to specify a QuantizationFlavor +// (only those that need perchannel quantization do). +template <typename AccumScalar, typename DstScalar, + QuantizationFlavor quantization_flavor = + std::is_floating_point<AccumScalar>::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> +struct GemmParams +{ + // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) + // of the multiplier by which accumulators are multiplied before being casted + // to the destination type. + AccumScalar multiplier_fixedpoint = 0; + // Only for non-floating-point cases. The exponent part of the aforementioned + // multiplier. + int multiplier_exponent = 0; + // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_fixedpoint. + const AccumScalar *multiplier_fixedpoint_perchannel = nullptr; + // Per-channel variant of multiplier_exponent. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. 
Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_exponent. + // + // Either none or both of multiplier_exponent_perchannel and + // multiplier_fixedpoint_perchannel must be nullptr. + const int *multiplier_exponent_perchannel = nullptr; + // The bias vector data, if not null. + const AccumScalar *bias = nullptr; + // min clamp bound of destination values. + DstScalar clamp_min = std::is_floating_point<DstScalar>::value + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); + // max clamp bound of destination values. + DstScalar clamp_max = std::is_floating_point<DstScalar>::value + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); +}; + +// Validates self-consistency of GemmParams. +template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor> +void ValidateGemmParams(const GemmParams<AccumScalar, DstScalar, quantization_flavor> ¶ms) +{ + // Guard consistency of the quantized multiplier fields. + if (quantization_flavor == QuantizationFlavor::kFloatingPoint) + { + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier && + !std::is_same<DstScalar, int32_t>::value) + { + assert(params.multiplier_fixedpoint); + // Nothing to check about multiplier_exponent + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier && + !std::is_same<DstScalar, int32_t>::value) + { + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(params.multiplier_fixedpoint_perchannel); + assert(params.multiplier_exponent_perchannel); + } + else + { + // For the get raw accumulator case, we should make sure none of the + // quantization params are set. + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + UNUSED_RELEASE(params); +} + +inline CachePolicy DefaultCachePolicy(bool is_constant_data) +{ + return is_constant_data ? CachePolicy::kCacheIfLargeSpeedup : CachePolicy::kNeverCache; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_TYPES_H__ diff --git a/compute/ruy/include/ruy/Utils.h b/compute/ruy/include/ruy/Utils.h new file mode 100644 index 000000000..50205abe5 --- /dev/null +++ b/compute/ruy/include/ruy/Utils.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RUY_UTILS_H__ +#define __NNFW_RUY_UTILS_H__ + +#include "Types.h" +#include "Shape.h" + +#include <stdexcept> + +namespace nnfw +{ +namespace ruy +{ +template <typename T> +inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, int b, int kheight, + int kwidth, int stride_width, int stride_height, + int pad_width, int pad_height, int in_width, int in_height, + int in_depth, int single_buffer_length, int buffer_id, + const T *in_data, T *conv_buffer_data, uint8_t zero_byte) +{ + assert(input_shape.DimensionsCount() == 4); + // This chunk of code reshapes all the inputs corresponding to + // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). + const int kwidth_times_indepth = kwidth * in_depth; + const int inwidth_times_indepth = in_width * in_depth; + const int ih_ungated_start = h * stride_height - pad_height; + const int ih_ungated_end = (ih_ungated_start + kheight); + const int ih_end = std::min(ih_ungated_end, in_height); + const int iw_ungated_start = w * stride_width - pad_width; + const int iw_ungated_end = (iw_ungated_start + kwidth); + const int iw_end = std::min(iw_ungated_end, in_width); + // If the patch is off the edge of the input image, skip writing those rows + // and columns from the patch into the output array. + const int h_offset = std::max(0, -ih_ungated_start); + const int w_offset = std::max(0, -iw_ungated_start); + const int ih_start = std::max(0, ih_ungated_start); + const int iw_start = std::max(0, iw_ungated_start); + const int single_row_num = std::min(kwidth - w_offset, in_width - iw_start) * in_depth; + const int output_row_offset = (buffer_id * single_buffer_length); + int out_offset = output_row_offset + (h_offset * kwidth + w_offset) * in_depth; + int in_offset = Offset(input_shape, b, ih_start, iw_start, 0); + + // Express all of the calculations as padding around the input patch. + const int top_padding = h_offset; + const int bottom_padding = (ih_ungated_end - ih_end); + const int left_padding = w_offset; + const int right_padding = (iw_ungated_end - iw_end); + assert(single_row_num == ((kwidth - (left_padding + right_padding)) * in_depth)); + + // Write out zeroes to the elements representing the top rows of the input + // patch that are off the edge of the input image. + if (top_padding > 0) + { + const int top_row_elements = (top_padding * kwidth * in_depth); + memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T))); + } + + // If the patch is on the interior of the input image horizontally, just copy + // over the rows sequentially, otherwise add zero padding at the start or end. 
+ if ((left_padding == 0) && (right_padding == 0)) + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + else + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + if (left_padding > 0) + { + const int left_start = (out_offset - (left_padding * in_depth)); + memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T))); + } + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + if (right_padding > 0) + { + const int right_start = (out_offset + single_row_num); + memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T))); + } + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + + // If the bottom of the patch falls off the input image, pad the values + // representing those input rows with zeroes. + if (bottom_padding > 0) + { + const int bottom_row_elements = (bottom_padding * kwidth * in_depth); + const int bottom_start = + output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); + } +} + +// Supports per-batch zero_byte for per-batch asymmetric quantized inputs. +template <typename T> +void DilatedIm2col(const ConvParams ¶ms, const Shape &input_shape, const T *input_data, + const Shape &filter_shape, const Shape &output_shape, T *im2col_data, + const int32_t *zero_bytes, const int zero_bytes_len) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + // For dilated convolution, the input pixels are not contiguous therefore we + // can't use the same optimizations as Im2Col(). Though note this code would + // work fine for the non-dilated case too (though likely a bit slower). + assert(dilation_width_factor != 1 || dilation_height_factor != 1); + assert(im2col_data); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + MatchingDim(output_shape, 3, filter_shape, 0); + + // Construct the MxN sized im2col matrix. + // The rows M, are sub-ordered B x H x W + const Shape row_shape({1, batches, output_height, output_width}); + // The columns, N, are sub-ordered Kh x Kw x Din + const Shape col_shape({1, filter_height, filter_width, input_depth}); + // Use dimensions M and N to construct dims for indexing directly into im2col + const Shape im2col_shape({1, 1, row_shape.FlatSize(), col_shape.FlatSize()}); + + // Loop through the output rows (B x H x W) + for (int batch = 0; batch < batches; ++batch) + { + const T zero_byte = + zero_bytes_len > 1 ? 
static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + // Each im2col row is an output pixel. Arrange the input data in this + // row in an order we can conveniently multiply with the filter data. + int row_offset = Offset(row_shape, 0, batch, out_y, out_x); + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + // Loop through all the pixels of the filter (Kh x Kw) + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + if ((in_y >= 0) && (in_y < input_height)) + { + // Filter row is within the input data. + // Loop through all the filter pixels in this row. + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0); + T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset); + if ((in_x >= 0) && (in_x < input_width)) + { + // Filter pixel is within the input, copy the input data. + T const *src = input_data + Offset(input_shape, batch, in_y, in_x, 0); + memcpy(dst, src, input_depth * sizeof(T)); + } + else + { + // Filter pixel is outside the input, zero it out. + memset(dst, zero_byte, input_depth * sizeof(T)); + } + } + } + else + { + // Filter row is outside the input, zero out the entire filter row. + int col_offset = Offset(col_shape, 0, filter_y, 0, 0); + T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset); + memset(dst, zero_byte, filter_width * input_depth * sizeof(T)); + } + } + } + } + } +} + +template <typename T> +void DilatedIm2col(const ConvParams ¶ms, uint8_t zero_byte, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const Shape &output_shape, + T *im2col_data) +{ + const int32_t zero_point = static_cast<int32_t>(zero_byte); + DilatedIm2col<T>(params, input_shape, input_data, filter_shape, output_shape, im2col_data, + &zero_point, 1); +} + +template <typename T> +void Im2col(const ConvParams ¶ms, int kheight, int kwidth, uint8_t zero_byte, + const Shape &input_shape, const T *input_data, const Shape &output_shape, + T *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + + int buffer_id = 0; + // Loop over the output nodes. 
+ for (int b = 0; b < batches; ++b) + { + for (int h = 0; h < output_height; ++h) + { + for (int w = 0; w < output_width; ++w) + { + ExtractPatchIntoBufferColumn(input_shape, w, h, b, kheight, kwidth, stride_width, + stride_height, pad_width, pad_height, input_width, + input_height, input_depth, output_depth, buffer_id, input_data, + output_data, zero_byte); + ++buffer_id; + } + } + } +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_UTILS_H__ diff --git a/compute/ruy/include/ruy/neon/neon_check.h b/compute/ruy/include/ruy/neon/neon_check.h new file mode 100644 index 000000000..08394f26f --- /dev/null +++ b/compute/ruy/include/ruy/neon/neon_check.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_NEON_CHECK_H__ +#define __NNFW_RUY_NEON_CHECK_H__ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#include <arm_neon.h> +#endif + +// Disable X86_NEON +// #if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON +#if 0 +#define USE_NEON +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wsequence-point" +#include "NEON_2_SSE.h" +#pragma GCC diagnostic pop +#endif + +// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is +// defined, PortableSomeFunc(args) otherwise. +#ifdef USE_NEON +// Always use Neon code +#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__) + +#else +// No NEON available: Use Portable code +#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__) + +#endif // defined(USE_NEON) + +#endif // __NNFW_RUY_NEON_CHECK_H__ diff --git a/compute/ruy/include/ruy/operation/Conv.h b/compute/ruy/include/ruy/operation/Conv.h new file mode 100644 index 000000000..2b9c8c390 --- /dev/null +++ b/compute/ruy/include/ruy/operation/Conv.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RUY_CONV_H__ +#define __NNFW_RUY_CONV_H__ + +#include "ruy/Types.h" +#include "ruy/Shape.h" +#include "ruy/Utils.h" +#include "ruy/RuySupport.h" + +#include <ruy/ruy.h> +#include <ruy/context.h> +#include <iostream> +#include <vector> + +namespace nnfw +{ +namespace ruy +{ + +class Conv +{ +public: + Conv() : _im2col_shape(4), _need_im2col(false), _prepared(false) {} + + void prepare(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape, + uint32_t stride_width, uint32_t stride_height, uint32_t dilation_width_factor, + uint32_t dilation_height_factor) + { + if (!_prepared) + { + IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height, + dilation_width_factor, dilation_height_factor); + _prepared = true; + } + } + + void operator()(const ConvParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data, + ::ruy::Context *ruy_context) + { + if (!_prepared) + { + // This means that input or output are dynamic or filter is not constant + IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width, + params.stride_height, params.dilation_width_factor, + params.dilation_height_factor); + _prepared = true; + } + + int im2col_size = _need_im2col ? _im2col_shape.FlatSize() : 0; + + // Use heap if size is larger than 8MB + if (im2col_size > 2 * 1024 * 1024) + { + std::unique_ptr<float[]> im2col_data = std::make_unique<float[]>(im2col_size); + ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, _im2col_shape, im2col_data.get(), ruy_context); + } + else if (im2col_size > 0) + { + float im2col_data[im2col_size]; + ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, _im2col_shape, im2col_data, ruy_context); + } + else + { + ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, _im2col_shape, nullptr, ruy_context); + } + } + +private: + void ConvFloat(const ConvParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data, + const Shape &im2col_shape, float *im2col_data, ::ruy::Context *ruy_context) + { + UNUSED_RELEASE(bias_shape); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + // NB: the float 0.0f value is represented by all zero bytes. 
+ const uint8_t float_zero_byte = 0x00; + const float *gemm_input_data = nullptr; + const Shape *gemm_input_shape = nullptr; + const int filter_width = filter_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; + const bool need_im2col = + stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; + if (need_dilated_im2col) + { + DilatedIm2col(params, float_zero_byte, input_shape, input_data, filter_shape, output_shape, + im2col_data); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; + } + else if (need_im2col) + { + assert(im2col_data); + Im2col(params, filter_height, filter_width, float_zero_byte, input_shape, input_data, + im2col_shape, im2col_data); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; + } + else + { + // TODO(aselle): We need to make sure to not send im2col if it is not + // needed. + assert(!im2col_data); + gemm_input_data = input_data; + gemm_input_shape = &input_shape; + } + + const int gemm_input_dims = gemm_input_shape->DimensionsCount(); + int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1); + int n = output_shape.Dims(3); + int k = gemm_input_shape->Dims(gemm_input_dims - 1); + + // When an optimized CBLAS implementation is not available, fall back + // to using cpu_backend_gemm. + MatrixParams<float> lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.rows = n; + lhs_params.cols = k; + MatrixParams<float> rhs_params; + rhs_params.order = Order::kColMajor; + rhs_params.rows = k; + rhs_params.cols = m; + MatrixParams<float> dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = n; + dst_params.cols = m; + GemmParams<float, float> gemm_params; + gemm_params.bias = bias_data; + gemm_params.clamp_min = output_activation_min; + gemm_params.clamp_max = output_activation_max; + + // Below code is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy + ::ruy::Matrix<float> ruy_lhs; + ::ruy::Matrix<float> ruy_rhs; + ::ruy::Matrix<float> ruy_dst; + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, filter_data, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, gemm_input_data, &ruy_rhs, true); + ruy_support::MakeRuyMatrix(dst_params, output_data, &ruy_dst); + + ::ruy::BasicSpec<float, float> ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); + + ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); + } + + void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape, + const Shape &output_shape, uint32_t stride_width, uint32_t stride_height, + uint32_t dilation_width_factor, uint32_t dilation_height_factor) + { + const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; + const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 || + kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1; + + _need_im2col = need_dilated_im2col || need_non_dilated_im2col; + + if (_need_im2col) + { + _im2col_shape.SetDim(0, output_shape.Dims(0)); + _im2col_shape.SetDim(1, output_shape.Dims(1)); + _im2col_shape.SetDim(2, output_shape.Dims(2)); + _im2col_shape.SetDim(3, input_shape.Dims(3) * kernel_shape.Dims(1) * kernel_shape.Dims(2)); + } + } + +private: + Shape _im2col_shape; + bool _need_im2col; + bool _prepared; +}; +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_CONV_H_ diff --git 
a/compute/ruy/include/ruy/operation/FullyConnected.h b/compute/ruy/include/ruy/operation/FullyConnected.h new file mode 100644 index 000000000..59facdb22 --- /dev/null +++ b/compute/ruy/include/ruy/operation/FullyConnected.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_FULLY_CONNECTED_H__ +#define __NNFW_RUY_FULLY_CONNECTED_H__ + +#include "ruy/Shape.h" +#include "ruy/Types.h" +#include "ruy/Utils.h" +#include "ruy/RuySupport.h" + +#include <ruy/ruy.h> +#include <ruy/context.h> + +namespace nnfw +{ +namespace ruy +{ + +inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &, + const float *optional_bias_data, const Shape &output_shape, + float *output_data, ::ruy::Context *ruy_context) +{ + const int dims_count = weights_shape.DimensionsCount(); + const int input_rows = weights_shape.Dims(dims_count - 1); + MatrixParams<float> rhs_params; + rhs_params.order = Order::kColMajor; + rhs_params.rows = input_rows; + rhs_params.cols = input_shape.FlatSize() / input_rows; + rhs_params.cache_policy = DefaultCachePolicy(params.rhs_cacheable); + assert(input_shape.FlatSize() == (rhs_params.rows * rhs_params.cols)); + MatrixParams<float> lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.cols = weights_shape.Dims(dims_count - 1); + lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1); + lhs_params.cache_policy = DefaultCachePolicy(params.lhs_cacheable); + MatrixParams<float> dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1); + dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + GemmParams<float, float> gemm_params; + gemm_params.bias = optional_bias_data; + gemm_params.clamp_min = params.float_activation_min; + gemm_params.clamp_max = params.float_activation_max; + + // Below code was copied from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy + ::ruy::Matrix<float> ruy_lhs; + ::ruy::Matrix<float> ruy_rhs; + ::ruy::Matrix<float> ruy_dst; + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, weights_data, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, input_data, &ruy_rhs, true); + ruy_support::MakeRuyMatrix(dst_params, output_data, &ruy_dst); + + ::ruy::BasicSpec<float, float> ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); + + ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_FULLY_CONNECTED_H__ diff --git a/compute/test/cker/Range.cc b/compute/test/cker/Range.cc index 55f4fcf20..e5fe4801f 100644 --- a/compute/test/cker/Range.cc +++ b/compute/test/cker/Range.cc @@ 
-48,9 +48,7 @@ TEST(CKer_Operation, Range)
   const float start = 3;
   const float limit = 1;
   const float delta = -0.5;
-  std::vector<float> expected = {
-      3, 2.5, 2, 1.5,
-  };
+  std::vector<float> expected = {3, 2.5, 2, 1.5};
   std::vector<float> actual(expected.size());
   nnfw::cker::Range<float>(&start, &limit, &delta, actual.data());
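The new compute/ruy Conv kernel introduced above is driven in two steps: prepare() decides once whether an im2col buffer is needed, and operator() runs im2col (if any) plus the ruy GEMM. A hypothetical single-call sketch follows; the function name RunConvOnce and every tensor size are made up for illustration and are not part of the patch.

// Hypothetical usage sketch, not part of the patch. NHWC float convolution,
// 3x3 filter, VALID padding, stride 1; all sizes below are assumed.
#include "ruy/operation/Conv.h"
#include <ruy/context.h>
#include <limits>

void RunConvOnce(const float *input, const float *filter, const float *bias, float *output)
{
  using namespace nnfw::ruy;

  const Shape input_shape{1, 32, 32, 8};   // N, H, W, Cin
  const Shape filter_shape{16, 3, 3, 8};   // Cout, Kh, Kw, Cin
  const Shape bias_shape{16};
  const Shape output_shape{1, 30, 30, 16}; // (32 - 3 + 1) = 30 with VALID padding, stride 1

  ConvParams params{};
  params.padding_type = PaddingType::kValid;
  params.padding_values = {0, 0};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  ::ruy::Context ruy_context;
  Conv conv;
  // prepare() sizes the im2col buffer up front; operator() would also do this
  // lazily when shapes are dynamic.
  conv.prepare(input_shape, filter_shape, output_shape, params.stride_width,
               params.stride_height, params.dilation_width_factor,
               params.dilation_height_factor);
  conv(params, input_shape, input, filter_shape, filter, bias_shape, bias, output_shape, output,
       &ruy_context);
}

Note the buffer-placement choice in operator(): the im2col scratch stays on the stack for small shapes and moves to a heap allocation once it exceeds 2M elements (8 MB of floats).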