author     Chunseok Lee <chunseok.lee@samsung.com>   2020-12-14 14:43:43 +0900
committer  Chunseok Lee <chunseok.lee@samsung.com>   2020-12-14 14:43:43 +0900
commit     62529acabbafce7730601ed01d5709d7bc0d378a (patch)
tree       bf6912cfa8fac4a2997292bfcb3c82055734c97e /compute/ARMComputeEx
parent     6ea13af5257155ff993c205cf997b870cc627f73 (diff)
Imported Upstream version 1.12.0 (tag: upstream/1.12.0)
Diffstat (limited to 'compute/ARMComputeEx')
66 files changed, 1374 insertions, 1374 deletions
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
index d29886a9d..4a3717885 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -255,14 +255,14 @@ private:
   cl::Device _device;       /**< Underlying CL device. */
   std::string _kernel_path; /**< Path to the kernels folder. */
   mutable std::map<std::string, const Program>
-      _programs_map; /**< Map with all already loaded program data. */
+    _programs_map; /**< Map with all already loaded program data. */
   mutable std::map<std::string, cl::Program>
-      _built_programs_map; /**< Map with all already built program data. */
+    _built_programs_map; /**< Map with all already built program data. */
   static const std::map<std::string, std::string>
-      _kernel_program_map; /**< Map that associates kernel names with programs. */
+    _kernel_program_map; /**< Map that associates kernel names with programs. */
   static const std::map<std::string, std::string>
-      _program_source_map; /**< Contains sources for all programs.
-                                Used for compile-time kernel inclusion. >*/
+    _program_source_map; /**< Contains sources for all programs.
+                              Used for compile-time kernel inclusion. >*/
 };
 }
 #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
index a614d5259..fb689f747 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -54,8 +54,8 @@ namespace arm_compute
 class ICLTensor;
 
 /**
-* @brief Class to perform EmbeddingLookup operation with opencl kernel
-*/
+ * @brief Class to perform EmbeddingLookup operation with opencl kernel
+ */
 class CLEmbeddingLookupKernel : public ICLKernel
 {
 public:
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
index 99cfa61ec..96f830898 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -55,8 +55,8 @@ namespace arm_compute
 class ICLTensor;
 
 /**
-* @brief Class to perform HashtableLookup operation with opencl kernel
-*/
+ * @brief Class to perform HashtableLookup operation with opencl kernel
+ */
 class CLHashtableLookupKernel : public ICLKernel
 {
 public:
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
index 99bb351bc..963d7b821 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
@@ -68,34 +68,37 @@ public:
   const char *name() const override { return "NEOneHotKernel"; }
   /** Initialise the kernel's inputs and outputs
    *
-   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  depth     The tensor for depth of the one hot dimension. Supported tensor rank: up to
-   * 3. Must be one of the following types: U32/S32
-   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported: Same
-   * as @p on_value
-   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * The value must be in range [-indices.rank , indices.rank)
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3.
+   *                       Must be one of the following types: U32/S32
+   * @param[in]  depth     The tensor for depth of the one hot dimension.
+   *                       Supported tensor rank: up to 3.
+   *                       Must be one of the following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1.
+   *                       Data type supported: U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1.
+   *                       Data type supported: Same as @p on_value
+   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around.
+   *                       Defaults to -1.
+   *                       The value must be in range [-indices.rank , indices.rank)
    */
   void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
                  const ITensor *off_value, ITensor *output, int axis = -1);
   /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEOneHotKernel
+   *  NEOneHotKernel
    *
-   * @param[in]  indices   Indices tensor info. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  depth     The tensor info for depth of the one hot dimension. Supported tensor rank:
-   * up to 3. Must be one of the following types: U32/S32
-   * @param[in]  on_value  On value tensor info. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor info. Supported tensor rank: only 1. Data type supported:
-   * Same as @p on_value
-   * @param[out] output    Destination tensor info. Data type supported: Same as @p on_value
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * The value must be in range [-indices.rank , indices.rank)
+   * @param[in]  indices   Indices tensor info. Supported tensor rank: up to 3.
+   *                       Must be one of the following types: U32/S32
+   * @param[in]  depth     The tensor info for depth of the one hot dimension.
+   *                       Supported tensor rank: up to 3.
+   *                       Must be one of the following types: U32/S32
+   * @param[in]  on_value  On value tensor info. Supported tensor rank: only 1.
+   *                       Data type supported: U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor info. Supported tensor rank: only 1.
+   *                       Data type supported: Same as @p on_value
+   * @param[out] output    Destination tensor info. Data type supported: Same as @p on_value
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   *                       The value must be in range [-indices.rank , indices.rank)
    *
    * @return a status
    */
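Note: the hunks above only re-wrap Doxygen text; the documented OneHot interface is unchanged. For orientation, a minimal usage sketch of the configure() call documented here — the tensor shapes, variable names, and allocation boilerplate are illustrative assumptions, not part of this commit:

#include <arm_compute/runtime/NEON/functions/NEOneHot.h>
#include <arm_compute/runtime/Tensor.h>

using namespace arm_compute;

void one_hot_example()
{
  Tensor indices, depth, on_value, off_value, output;
  // 4 indices (S32, as the doc requires); depth/on/off are rank-1 "scalars".
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
  depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));
  on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  // One-hot adds one axis whose extent is the depth value (3 here, assumed).
  output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

  NEOneHot one_hot;
  one_hot.configure(&indices, &depth, &on_value, &off_value, &output, /*axis=*/-1);

  for (Tensor *t : {&indices, &depth, &on_value, &off_value, &output})
    t->allocator()->allocate();
  // ... fill indices/depth/on/off buffers, then:
  one_hot.run();
}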
diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
index 1e69f0912..2aaab6b3a 100644
--- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
@@ -72,10 +72,10 @@ namespace shape_calculator
  * @return the calculated shape
  */
 inline TensorShape compute_transposeconv_upsampled_shape(
-    const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
-    std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
-    unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
-    unsigned int &pad_top, unsigned int &pad_bottom)
+  const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
+  std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
+  unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
+  unsigned int &pad_top, unsigned int &pad_bottom)
 {
   unsigned int sx = info.stride().first;
   unsigned int sy = info.stride().second;
@@ -103,7 +103,7 @@ inline TensorShape compute_transposeconv_upsampled_shape(
   unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right;
   unsigned int pady_all_except_invallid =
-      pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
+    pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
   pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left();
   pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right;
   pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top();
@@ -135,7 +135,7 @@ compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &
   const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
   const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
   const int channel_idx =
-      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
   const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
 
   TensorShape out_shape{input_shape};
@@ -160,7 +160,7 @@ inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int
   const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
   const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
   const int idx_channel =
-      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
   TensorShape output_shape{input->tensor_shape()};
   output_shape.set(idx_width, input->dimension(idx_width) * block);
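The shape helpers above are only re-indented. As a concrete check of the depth-to-space rule that compute_depth_to_space_shape_ex implements (width and height grow by the block size, channels shrink by its square) — a tiny illustrative C++ mirror under assumed example values, not the library function itself:

#include <cassert>

void depth_to_space_shape_example()
{
  const int block = 2;
  int channel = 8, width = 4, height = 2; // assumed input extents
  width *= block;                         // 4 * 2 = 8
  height *= block;                        // 2 * 2 = 4
  channel /= block * block;               // 8 / 4 = 2
  assert(width == 8 && height == 4 && channel == 2);
}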
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
index 409eaf593..026209f69 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
@@ -106,22 +106,24 @@ public:
   CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
   /** Set the input, weights, biases and output tensors.
    *
-   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs.
-   *                               Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM]. Data type
-   * supported: Same as @p input.
-   * @param[in]     bias           (Optional) The biases have one dimension.
-   *                               Data type supported: Should match @p input data type, except for
-   * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
-   * @param[out]    output         Output tensor. The output has the same number of dimensions as the
-   * @p input.
-   * @param[in]     info           Contains padding and policies to be used in the deconvolution, this
-   * is decribed in @ref PadStrideInfo.
-   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
-   * @param[in]     weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
-   * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input,
+   *                               and an optional 4th dimension for batch of inputs.
+   *                               Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM].
+   *                               Data type supported: Same as @p input.
+   * @param[in]     bias           (Optional) The biases have one dimension.
+   *                               Data type supported: Should match @p input data type,
+   *                               except for input of QASYMM8 and QASYMM8_SIGNED type
+   *                               where biases should be of S32 type
+   * @param[out]    output         Output tensor.
+   *                               The output has the same number of dimensions as the @p input.
+   * @param[in]     info           Contains padding and policies to be used in the deconvolution,
+   *                               this is decribed in @ref PadStrideInfo.
+   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
+   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in]     weights_info   (Optional) Weights information needed for
+   *                               @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with
+   *                               @ref CLWeightsReshapeKernel.
    *
    */
   void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
@@ -130,23 +132,24 @@ public:
   /** Set the input, weights, biases and output tensors.
    *
    * @param[in]     compile_context The compile context to be used.
-   * @param[in,out] input           Input tensor. 3 lower dimensions represent a single input, and
-   * an optional 4th dimension for batch of inputs.
+   * @param[in,out] input           Input tensor. 3 lower dimensions represent a single input,
+   *                                and an optional 4th dimension for batch of inputs.
    *                                Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights         The 4d weights with dimensions [width, height, IFM, OFM]. Data
-   * type supported: Same as @p input.
+   * @param[in]     weights         The 4d weights with dimensions [width, height, IFM, OFM].
+   *                                Data type supported: Same as @p input.
    * @param[in]     bias            (Optional) The biases have one dimension.
    *                                Data type supported: Should match @p input data type, except for
-   * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+   *                                input of QASYMM8 and QASYMM8_SIGNED type
+   *                                where biases should be of S32 type
    * @param[out]    output          Output tensor. The output has the same number of dimensions as
-   * the @p input.
+   *                                the @p input.
    * @param[in]     info            Contains padding and policies to be used in the deconvolution,
-   * this is decribed in @ref PadStrideInfo.
-   * @param[in]     invalid_right   The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom  The number of zeros added to bottom edge of the output.
-   * @param[in]     weights_info    (Optional) Weights information needed for @ref
-   * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
-   * CLWeightsReshapeKernel.
+   *                                this is decribed in @ref PadStrideInfo.
+   * @param[in]     invalid_right   The number of zeros added to right edge of the output.
+   * @param[in]     invalid_bottom  The number of zeros added to bottom edge of the output.
+   * @param[in]     weights_info    (Optional) Weights information needed for
+   *                                @ref CLConvolutionLayer, specifies if the weights tensor has
+   *                                been reshaped with @ref CLWeightsReshapeKernel.
    *
    */
   void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
@@ -154,24 +157,26 @@ public:
                  unsigned int invalid_right, unsigned int invalid_bottom,
                  const WeightsInfo &weights_info = WeightsInfo());
   /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLDirectTransposeConvLayer
+   *  CLDirectTransposeConvLayer
    *
-   * @param[in] input          Input tensor info. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs.
-   *                           Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in] weights        The 4d weights info with dimensions [width, height, IFM, OFM]. Data
-   * type supported: Same as @p input.
-   * @param[in] bias           (Optional) The biases have one dimension.
-   *                           Data type supported: Should match @p input data type, except for input
-   * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
-   * @param[in] output         Output tensor info. The output has the same number of dimensions as the
-   * @p input.
-   * @param[in] info           Contains padding and policies to be used in the deconvolution, this is
-   * decribed in @ref PadStrideInfo.
-   * @param[in] invalid_right  The number of zeros added to right edge of the output.
-   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
-   * @param[in] weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
-   * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+   * @param[in] input          Input tensor info. 3 lower dimensions represent a single input,
+   *                           and an optional 4th dimension for batch of inputs.
+   *                           Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in] weights        The 4d weights info with dimensions [width, height, IFM, OFM].
+   *                           Data type supported: Same as @p input.
+   * @param[in] bias           (Optional) The biases have one dimension.
+   *                           Data type supported: Should match @p input data type,
+   *                           except for input of QASYMM8 and QASYMM8_SIGNED type
+   *                           where biases should be of S32 type
+   * @param[in] output         Output tensor info. The output has the same number of dimensions
+   *                           as the @p input.
+   * @param[in] info           Contains padding and policies to be used in the deconvolution,
+   *                           this is decribed in @ref PadStrideInfo.
+   * @param[in] invalid_right  The number of zeros added to right edge of the output.
+   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in] weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
+   *                           specifies if the weights tensor has been reshaped
+   *                           with @ref CLWeightsReshapeKernel.
    *
    * @return a status
    */
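Again a comment-only change. A minimal configure() sketch for the transpose convolution documented above — shapes, names, and the OpenCL setup are illustrative assumptions (a 2x2 kernel with stride 2 doubles an 8x8 input to 16x16):

#include <arm_compute/runtime/CL/CLScheduler.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include <arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h>

using namespace arm_compute;

void transpose_conv_example()
{
  CLScheduler::get().default_init(); // create context/queue once per process

  CLTensor input, weights, bias, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
  // Weights are [width, height, IFM, OFM] as documented.
  weights.allocator()->init(TensorInfo(TensorShape(2U, 2U, 16U, 32U), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 32U), 1, DataType::F32));

  CLDirectTransposeConvLayer deconv;
  // Stride 2, no padding; invalid_right/invalid_bottom = 0.
  deconv.configure(&input, &weights, &bias, &output, PadStrideInfo(2, 2, 0, 0), 0, 0);

  for (CLTensor *t : {&input, &weights, &bias, &output})
    t->allocator()->allocate();
  deconv.run();
}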
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
index e65a646dc..f27e9913e 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
@@ -216,7 +216,7 @@ private:
   CLConvertFullyConnectedWeights _convert_weights;
   weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed;
   weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged
-      _reshape_weights_managed_function;
+    _reshape_weights_managed_function;
   CLFlattenLayer _flatten_layer;
   CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function;
   CLGEMM _mm_gemm;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
index 289ab167f..bdb168664 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
@@ -43,8 +43,8 @@ public:
 public:
   CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
-      : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
-        _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false)
+    : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
+      _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false)
   {
     // DO NOTHING
   }
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
index b01ec4255..167554c9e 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
@@ -66,7 +66,7 @@ public:
    * @param[out] output  The output tensor, Data types supported: same as @p input.
    * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
    * @return N/A
-  */
+   */
   void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
                  int axis = 0);
   /**
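The CLGatherEx hunk fixes only the comment-terminator indentation. A short usage sketch of the configure() it documents (tensor shapes and the axis value are assumed for illustration):

#include <arm_compute/runtime/CL/CLScheduler.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include <arm_compute/runtime/CL/functions/CLGatherEx.h>

using namespace arm_compute;

void gather_example()
{
  CLScheduler::get().default_init();

  CLTensor input, indices, output;
  input.allocator()->init(TensorInfo(TensorShape(10U, 5U), 1, DataType::F32));
  indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
  // Gathering 3 indices along axis 1 of a 10x5 input yields a 10x3 output.
  output.allocator()->init(TensorInfo(TensorShape(10U, 3U), 1, DataType::F32));

  CLGatherEx gather;
  gather.configure(&input, &indices, &output, /*axis=*/1);

  for (CLTensor *t : {&input, &indices, &output})
    t->allocator()->allocate();
  gather.run();
}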
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
index 5fb102e47..5b27d362a 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -63,20 +63,22 @@ public:
   /** Set the input, weights, biases and output tensors.
    *
-   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM]. Data type
-   * supported: Same as @p input.
-   * @param[in]     bias           (Optional) The biases have one dimension. Data type supported: Same
-   * as @p input.
-   * @param[out]    output         Output tensor. The output has the same number of dimensions as the
-   * @p input.
-   * @param[in]     deconv_info    Contains padding and policies to be used in the deconvolution, this
-   * is described in @ref PadStrideInfo.
-   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
-   * @param[in]     weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
-   * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input,
+   *                               and an optional 4th dimension for batch of inputs.
+   *                               Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM].
+   *                               Data type supported: Same as @p input.
+   * @param[in]     bias           (Optional) The biases have one dimension.
+   *                               Data type supported: Same as @p input.
+   * @param[out]    output         Output tensor. The output has the same number of dimensions
+   *                               as the @p input.
+   * @param[in]     deconv_info    Contains padding and policies to be used in the deconvolution,
+   *                               this is described in @ref PadStrideInfo.
+   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
+   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in]     weights_info   (Optional) Weights information needed for
+   *                               @ref CLConvolutionLayer, specifies if the weights tensor has
+   *                               been reshaped with @ref CLWeightsReshapeKernel.
    *
    */
   void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
@@ -85,22 +87,22 @@ public:
   /** Set the input, weights, biases and output tensors.
    *
    * @param[in]     compile_context The compile context to be used.
-   * @param[in,out] input           Input tensor. 3 lower dimensions represent a single input, and
-   * an optional 4th dimension for batch of inputs. Data types supported:
-   * QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in]     weights         The 4d weights with dimensions [width, height, IFM, OFM]. Data
-   * type supported: Same as @p input.
-   * @param[in]     bias            (Optional) The biases have one dimension. Data type supported:
-   * Same as @p input.
-   * @param[out]    output          Output tensor. The output has the same number of dimensions as
-   * the @p input.
+   * @param[in,out] input           Input tensor. 3 lower dimensions represent a single input,
+   *                                and an optional 4th dimension for batch of inputs.
+   *                                Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in]     weights         The 4d weights with dimensions [width, height, IFM, OFM].
+   *                                Data type supported: Same as @p input.
+   * @param[in]     bias            (Optional) The biases have one dimension.
+   *                                Data type supported: Same as @p input.
+   * @param[out]    output          Output tensor. The output has the same number of dimensions
+   *                                as the @p input.
    * @param[in]     deconv_info     Contains padding and policies to be used in the deconvolution,
-   * this is described in @ref PadStrideInfo.
-   * @param[in]     invalid_right   The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom  The number of zeros added to bottom edge of the output.
-   * @param[in]     weights_info    (Optional) Weights information needed for @ref
-   * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
-   * CLWeightsReshapeKernel.
+   *                                this is described in @ref PadStrideInfo.
+   * @param[in]     invalid_right   The number of zeros added to right edge of the output.
+   * @param[in]     invalid_bottom  The number of zeros added to bottom edge of the output.
+   * @param[in]     weights_info    (Optional) Weights information needed for
+   *                                @ref CLConvolutionLayer, specifies if the weights tensor has
+   *                                been reshaped with @ref CLWeightsReshapeKernel.
    *
    */
   void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
@@ -108,22 +110,24 @@ public:
                  unsigned int invalid_right, unsigned int invalid_bottom,
                  const WeightsInfo &weights_info = WeightsInfo());
   /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLTransposeConvLayer
+   *  CLTransposeConvLayer
    *
-   * @param[in] input          Input tensor info. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-   * @param[in] weights        The 4d weights info with dimensions [width, height, IFM, OFM]. Data
-   * type supported: Same as @p input.
-   * @param[in] bias           (Optional) The biases have one dimension. Data type supported: Same as
-   * @p input.
-   * @param[in] output         Output tensor info. The output has the same number of dimensions as the
-   * @p input.
-   * @param[in] deconv_info    Contains padding and policies to be used in the deconvolution, this is
-   * described in @ref PadStrideInfo.
-   * @param[in] invalid_right  The number of zeros added to right edge of the output.
-   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
-   * @param[in] weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
-   * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+   * @param[in] input          Input tensor info. 3 lower dimensions represent a single input,
+   *                           and an optional 4th dimension for batch of inputs.
+   *                           Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in] weights        The 4d weights info with dimensions [width, height, IFM, OFM].
+   *                           Data type supported: Same as @p input.
+   * @param[in] bias           (Optional) The biases have one dimension.
+   *                           Data type supported: Same as @p input.
+   * @param[in] output         Output tensor info. The output has the same number of dimensions
+   *                           as the @p input.
+   * @param[in] deconv_info    Contains padding and policies to be used in the deconvolution,
+   *                           this is described in @ref PadStrideInfo.
+   * @param[in] invalid_right  The number of zeros added to right edge of the output.
+   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in] weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
+   *                           specifies if the weights tensor has been reshaped with
+   *                           @ref CLWeightsReshapeKernel.
    *
    * @return a status
   */
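The validate() documented above follows the usual ACL pattern: it mirrors configure() but operates on ITensorInfo only, so a configuration can be rejected before any OpenCL buffer exists. A hedged sketch of that check (parameter values are placeholders):

#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>

using namespace arm_compute;

Status check_transpose_conv(const ITensorInfo *input, const ITensorInfo *weights,
                            const ITensorInfo *bias, ITensorInfo *output,
                            const PadStrideInfo &deconv_info)
{
  // Returns an error Status (with a message) instead of asserting at
  // configure() time; call this first when shapes come from user input.
  return CLTransposeConvLayer::validate(input, weights, bias, output, deconv_info,
                                        /*invalid_right=*/0, /*invalid_bottom=*/0);
}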
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
index 18cb61bf9..e34b4dcb0 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
@@ -43,8 +43,8 @@ public:
 public:
   NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
-      : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
-        _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
+    : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
+      _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
   {
     // DO NOTHING
   }
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
index b2ea6270f..1a68f801a 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
@@ -66,19 +66,20 @@ public:
   void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
                  const ITensor *off_value, ITensor *output, int axis = -1);
   /** Static function to check if given info will lead to a valid configuration of @ref
-   * NEOneHotKernel
+   *  NEOneHotKernel
    *
-   * @param[in]  indices   Indices tensor info. Supported tensor rank: up to 3. Must be one of the
-   * following types: U32/S32
-   * @param[in]  depth     The tensor info for depth of the one hot dimension. Supported tensor rank:
-   * up to 3. Must be one of the following types: U32/S32
-   * @param[in]  on_value  On value tensor info. Supported tensor rank: only 1. Data type supported:
-   * U8/S8/U16/S16/F16/U32/S32/F32
-   * @param[in]  off_value Off value tensor info. Supported tensor rank: only 1. Data type supported:
-   * Same as @p on_value
-   * @param[out] output    Destination tensor info. Data type supported: Same as @p on_value
-   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
-   * The value must be in range [-indices.rank , indices.rank)
+   * @param[in]  indices   Indices tensor info. Supported tensor rank: up to 3.
+   *                       Must be one of the following types: U32/S32
+   * @param[in]  depth     The tensor info for depth of the one hot dimension.
+   *                       Supported tensor rank: up to 3.
+   *                       Must be one of the following types: U32/S32
+   * @param[in]  on_value  On value tensor info. Supported tensor rank: only 1.
+   *                       Data type supported: U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor info. Supported tensor rank: only 1.
+   *                       Data type supported: Same as @p on_value
+   * @param[out] output    Destination tensor info. Data type supported: Same as @p on_value
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   *                       The value must be in range [-indices.rank , indices.rank)
    *
    * @return a status
   */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
index 24ff5dac9..7a08dae97 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -110,39 +110,42 @@ public:
   /** Set the input, weights, biases and output tensors.
    *
-   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
-   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM]. Data type
-   * supported: Same as @p input.
-   * @param[in]     bias           Optional, ignored if NULL. The biases have one dimension. Data type
-   * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
-   * for F16 input.
-   * @param[out]    output         Output tensor. The output has the same number of dimensions as the @p
-   * input.
-   * @param[in]     info           Contains padding and policies to be used in the deconvolution, this is
-   * decribed in @ref PadStrideInfo.
-   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
-   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input,
+   *                               and an optional 4th dimension for batch of inputs.
+   *                               Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM].
+   *                               Data type supported: Same as @p input.
+   * @param[in]     bias           Optional, ignored if NULL. The biases have one dimension.
+   *                               Data type supported: Data types supported: S32 for QASYMM8 and
+   *                               QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+   * @param[out]    output         Output tensor. The output has the same number of dimensions as
+   *                               the @p input.
+   * @param[in]     info           Contains padding and policies to be used in the deconvolution,
+   *                               this is decribed in @ref PadStrideInfo.
+   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
+   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
    *
    */
   void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
                  const PadStrideInfo &info, unsigned int invalid_right,
                  unsigned int invalid_bottom);
   /** Static function to check if given info will lead to a valid configuration of @ref
-   * NETransposeConvLayer
+   *  NETransposeConvLayer
    *
-   * @param[in] input   Input tensor info. 3 lower dimensions represent a single input, and an
-   * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
-   * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
-   * supported: Same as @p input.
-   * @param[in] bias    (Optional) The biases have one dimension. Data type supported: Data types
-   * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
-   * @param[in] output  Output tensor info. The output has the same number of dimensions as the @p
-   * input.
-   * @param[in] info    Contains padding and policies to be used in the deconvolution, this is
-   * decribed in @ref PadStrideInfo.
-   * @param[in] innvalid_right The number of zeros added to right edge of the output.
-   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in] input          Input tensor info. 3 lower dimensions represent a single input,
+   *                           and an optional 4th dimension for batch of inputs.
+   *                           Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+   * @param[in] weights        The 4d weights info with dimensions [width, height, IFM, OFM].
+   *                           Data type supported: Same as @p input.
+   * @param[in] bias           (Optional) The biases have one dimension.
+   *                           Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input,
+   *                           F32 for F32 input, F16 for F16 input.
+   * @param[in] output         Output tensor info. The output has the same number of dimensions as
+   *                           the @p input.
+   * @param[in] info           Contains padding and policies to be used in the deconvolution,
+   *                           this is decribed in @ref PadStrideInfo.
+   * @param[in] innvalid_right The number of zeros added to right edge of the output.
+   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
    *
    * @return a status
   */
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 81d0cb70f..1a8ff3e71 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -54,123 +54,123 @@ using namespace arm_compute;
 
 const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
-    // ARMComputeEx kernels
-    {"arg_min_max_ex_x", "arg_min_max_ex.cl"},
-    {"arg_min_max_ex_y", "arg_min_max_ex.cl"},
-    {"arg_min_max_ex_z", "arg_min_max_ex.cl"},
-    {"arg_min_max_ex_w", "arg_min_max_ex.cl"},
-    {"binary_logical_op", "binary_logical_op.cl"},
-    {"cast_bool", "cast.cl"},
-    {"embedding_lookup", "embedding_lookup.cl"},
-    {"gather_ex", "gather_ex.cl"},
-    {"gather_ex_1d", "gather_ex.cl"},
-    {"gather_ex_1d_out", "gather_ex.cl"},
-    {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
-    {"hashtable_lookup", "hashtable_lookup.cl"},
-    {"instance_normalization_ex", "instance_normalization_ex.cl"},
-    {"multiply_scale_factor", "multiply_scale_factor.cl"},
-    {"neg_tensor", "neg_tensor.cl"},
-    {"one_hot", "one_hot.cl"},
-    {"one_hot_only_on_value", "one_hot.cl"},
-    {"quantization_symm8", "quantization_symm8.cl"},
-    {"reduce_min_max", "reduce_operation.cl"},
-    {"reduce_sum_mean", "reduce_operation.cl"},
-    {"topkv2_init", "topkv2.cl"},
-    {"topkv2_find_first_negative", "topkv2.cl"},
-    {"topkv2_reorder_negatives", "topkv2.cl"},
-    {"topkv2_store", "topkv2.cl"},
-    {"radixsort_histogram", "topkv2_radixsort.cl"},
-    {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
-    {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
-    {"radixsort_reorder", "topkv2_radixsort.cl"},
-    {"topkv2_quicksort", "topkv2_quicksort.cl"},
-    {"scale_factor_symm8", "scale_factor.cl"},
+  // ARMComputeEx kernels
+  {"arg_min_max_ex_x", "arg_min_max_ex.cl"},
+  {"arg_min_max_ex_y", "arg_min_max_ex.cl"},
+  {"arg_min_max_ex_z", "arg_min_max_ex.cl"},
+  {"arg_min_max_ex_w", "arg_min_max_ex.cl"},
+  {"binary_logical_op", "binary_logical_op.cl"},
+  {"cast_bool", "cast.cl"},
+  {"embedding_lookup", "embedding_lookup.cl"},
+  {"gather_ex", "gather_ex.cl"},
+  {"gather_ex_1d", "gather_ex.cl"},
+  {"gather_ex_1d_out", "gather_ex.cl"},
+  {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
+  {"hashtable_lookup", "hashtable_lookup.cl"},
+  {"instance_normalization_ex", "instance_normalization_ex.cl"},
+  {"multiply_scale_factor", "multiply_scale_factor.cl"},
+  {"neg_tensor", "neg_tensor.cl"},
+  {"one_hot", "one_hot.cl"},
+  {"one_hot_only_on_value", "one_hot.cl"},
+  {"quantization_symm8", "quantization_symm8.cl"},
+  {"reduce_min_max", "reduce_operation.cl"},
+  {"reduce_sum_mean", "reduce_operation.cl"},
+  {"topkv2_init", "topkv2.cl"},
+  {"topkv2_find_first_negative", "topkv2.cl"},
+  {"topkv2_reorder_negatives", "topkv2.cl"},
+  {"topkv2_store", "topkv2.cl"},
+  {"radixsort_histogram", "topkv2_radixsort.cl"},
+  {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
+  {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
+  {"radixsort_reorder", "topkv2_radixsort.cl"},
+  {"topkv2_quicksort", "topkv2_quicksort.cl"},
+  {"scale_factor_symm8", "scale_factor.cl"},
 };
 
 const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
 #ifdef EMBEDDED_KERNELS
-    {
-        "arg_min_max_ex.cl",
+  {
+    "arg_min_max_ex.cl",
 #include "./cl_kernels/arg_min_max_ex.clembed"
-    },
-    {
-        "cast.cl",
+  },
+  {
+    "cast.cl",
 #include "./cl_kernels/cast.clembed"
-    },
-    {
-        "embedding_lookup.cl",
+  },
+  {
+    "embedding_lookup.cl",
 #include "./cl_kernels/embedding_lookup.clembed"
-    },
-    {
-        "gather_ex.cl",
+  },
+  {
+    "gather_ex.cl",
 #include "./cl_kernels/gather_ex.clembed"
-    },
-    {
-        "gemmlowp_ex.cl",
+  },
+  {
+    "gemmlowp_ex.cl",
 #include "./cl_kernels/gemmlowp_ex.clembed"
-    },
-    {
-        "hashtable_lookup.cl",
+  },
+  {
+    "hashtable_lookup.cl",
 #include "./cl_kernels/hashtable_lookup.clembed"
-    },
-    {
-        "helpers.h",
+  },
+  {
+    "helpers.h",
 #include "./cl_kernels/helpers.hembed"
-    },
-    {
-        "helpers_asymm.h",
+  },
+  {
+    "helpers_asymm.h",
 #include "./cl_kernels/helpers_asymm.hembed"
-    },
-    {
-        "instance_normalization_ex.cl",
+  },
+  {
+    "instance_normalization_ex.cl",
 #include "./cl_kernels/instance_normalization_ex.clembed"
-    },
-    {
-        "binary_logical_op.cl",
+  },
+  {
+    "binary_logical_op.cl",
 #include "./cl_kernels/binary_logical_op.clembed"
-    },
-    {
-        "multiply_scale_factor.cl",
+  },
+  {
+    "multiply_scale_factor.cl",
 #include "./cl_kernels/multiply_scale_factor.clembed"
-    },
-    {
-        "neg_tensor.cl",
+  },
+  {
+    "neg_tensor.cl",
 #include "./cl_kernels/neg_tensor.clembed"
-    },
-    {
-        "one_hot.cl",
+  },
+  {
+    "one_hot.cl",
 #include "./cl_kernels/one_hot.clembed"
-    },
-    {
-        "quantization_symm8.cl",
+  },
+  {
+    "quantization_symm8.cl",
 #include "./cl_kernels/quantization_symm8.clembed"
-    },
-    {
-        "reduce_operation.cl",
+  },
+  {
+    "reduce_operation.cl",
 #include "./cl_kernels/reduce_operation.clembed"
-    },
-    {
-        "scale_factor.cl",
+  },
+  {
+    "scale_factor.cl",
 #include "./cl_kernels/scale_factor.clembed"
-    },
-    {
-        "topkv2.cl",
+  },
+  {
+    "topkv2.cl",
 #include "./cl_kernels/topkv2.clembed"
-    },
-    {
-        "topkv2_radixsort.cl",
+  },
+  {
+    "topkv2_radixsort.cl",
 #include "./cl_kernels/topkv2_radixsort.clembed"
-    },
-    {
-        "topkv2_quicksort.cl",
+  },
+  {
+    "topkv2_quicksort.cl",
 #include "./cl_kernels/topkv2_quicksort.clembed"
-    },
+  },
 #endif /* EMBEDDED_KERNELS */
 };
 
 CLKernelLibraryEx::CLKernelLibraryEx()
-    : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+  : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
 {
   opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
                          // CLKernelLibraryEx is built
@@ -337,8 +337,8 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con
   size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
   ARM_COMPUTE_ERROR_ON_MSG(
-      err != 0,
-      "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+    err != 0,
+    "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
   ARM_COMPUTE_UNUSED(err);
 
   return result;
"clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl index 0a014d15c..135cacf59 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl @@ -119,15 +119,15 @@ inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 < in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 < in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 < in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 < in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -204,15 +204,15 @@ inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 > in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 > in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 > in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 > in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -296,22 +296,21 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), const uint x_idx = get_global_id(0); const uint y_idx = get_global_id(1); const __global DATA_TYPE *src_in_row = - (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + - y_idx * src_step_y); + (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); for (unsigned int y = 0; y < get_local_size(1); ++y) { #if defined(ARG_MAX) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_max_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); #else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) #else // defined(ARG_MIN) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_min_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_min_prev_out(src_in_row, 
(__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); #else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) @@ -334,12 +333,12 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); #if defined(ARG_MAX) condition_check3 = - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3); #else // defined(ARG_MIN) local_results[lid] = select( - local_results[lid], local_results[lid + i], - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); + local_results[lid], local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); #endif // defined(ARG_MAX) || defined(ARG_MIN) } barrier(CLK_LOCAL_MEM_FENCE); @@ -403,7 +402,7 @@ __kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output) { VEC_DATA_TYPE(DATA_TYPE, 16) in = - CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); + CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl index e249663bc..f8b5bbeb8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -111,14 +111,14 @@ __kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATI #if OP_CODE == 1 // LOGICAL AND VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) && - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); #elif OP_CODE == 2 // LOGICAL OR VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) || - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl index 92e5dfbee..5ebc78d23 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -117,15 +117,15 @@ __kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION // lookup ids for based on the tensor dimensions int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? 
*((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl index 80ba73d1d..85fc09de4 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \ - defined(COLS_A) + defined(COLS_A) #define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X) @@ -117,7 +117,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( , uint dst_cross_plane_pad #endif // REINTERPRET_OUTPUT_AS_3D - ) +) { int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; @@ -208,9 +208,9 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( - 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); + 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0; @@ -251,7 +251,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl index a4f7dbd48..3ace1fde8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -115,15 +115,15 @@ __kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? 
*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; if (lup_id[NUM_DIMS - 1] < 0) { diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index e07a25ec9..4a3bc1369 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -49,7 +49,7 @@ #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ - defined(cl_arm_integer_dot_product_accumulate_int8) + defined(cl_arm_integer_dot_product_accumulate_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && // defined(cl_arm_integer_dot_product_accumulate_int8) @@ -288,21 +288,21 @@ #define VECTOR_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ - uint name##_offset_first_element_in_bytes + uint name##_offset_first_element_in_bytes #define IMAGE_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_offset_first_element_in_bytes #define TENSOR3D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ - uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ + uint name##_offset_first_element_in_bytes #define TENSOR4D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ - uint name##_step_w, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ @@ -406,9 +406,9 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ uint stride_x, uint step_x) { Vector vector = { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, }; vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; return vector; @@ -436,7 +436,7 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el .stride_x = stride_x, .stride_y = stride_y}; img.ptr += - img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index 5f1b3f902..d7f1d0814 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -100,16 +100,16 @@ inline float 
  *
  * @return quantized values
  */
-#define QUANTIZE_IMPL(type, size) \
-  inline VEC_DATA_TYPE(type, size) \
-      quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
-  { \
-    VEC_DATA_TYPE(float, size) \
-    out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
-    VEC_DATA_TYPE(type, size) \
-    res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
-                      VEC_DATA_TYPE(type, size)); \
-    return res; \
+#define QUANTIZE_IMPL(type, size) \
+  inline VEC_DATA_TYPE(type, size) \
+    quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+  { \
+    VEC_DATA_TYPE(float, size) \
+    out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+    VEC_DATA_TYPE(type, size) \
+    res = \
+      CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
+    return res; \
   }
 
 /** Dequantize a vector of values to floating-point
@@ -119,11 +119,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  *
  * @return dequantized values in floating point
  */
-#define DEQUANTIZE_IMPL(type, size) \
-  inline VEC_DATA_TYPE(float, size) \
-      dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
-  { \
-    return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+#define DEQUANTIZE_IMPL(type, size) \
+  inline VEC_DATA_TYPE(float, size) \
+    dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+  { \
+    return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
   }
 
 /** Correctly-rounded-to-nearest division by a power-of-two.
@@ -134,7 +134,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  */
 #define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
   inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
-      VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+    VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
   { \
     const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
     const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
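The QUANTIZE_IMPL/DEQUANTIZE_IMPL macros above encode q = round_to_nearest_even(x / scale + offset) (saturated) and x = (q - offset) * scale. A plain scalar C++ mirror of that arithmetic, for checking the round trip — illustrative only, the real code is an OpenCL vector macro:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

uint8_t quantize_qasymm8(float x, float offset, float scale)
{
  const float rounded = std::nearbyint(x / scale + offset);                  // CONVERT_DOWN_RTE
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, rounded)));    // CONVERT_SAT
}

float dequantize_qasymm8(uint8_t q, float offset, float scale)
{
  return (static_cast<float>(q) - offset) * scale;
}

void quantize_roundtrip_example()
{
  const float scale = 0.5f, offset = 10.0f;
  const uint8_t q = quantize_qasymm8(2.0f, offset, scale); // 2.0 / 0.5 + 10 = 14
  assert(q == 14);
  assert(dequantize_qasymm8(q, offset, scale) == 2.0f);    // (14 - 10) * 0.5 = 2.0
}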
@@ -152,32 +152,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  *
  * @return Product of two fixed-point numbers.
  */
-#define ASYMM_MULT_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) \
-      asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
-  { \
-    VEC_DATA_TYPE(int, size) \
-    overflow = a == b && a == INT_MIN; \
-    VEC_DATA_TYPE(long, size) \
-    a_64 = convert_long##size(a); \
-    VEC_DATA_TYPE(long, size) \
-    b_64 = convert_long##size(b); \
-    VEC_DATA_TYPE(long, size) \
-    ab_64 = a_64 * b_64; \
-    /* Revert COMPMID-907 */ \
-    VEC_DATA_TYPE(long, size) \
-    mask1 = 1 << 30; \
-    VEC_DATA_TYPE(long, size) \
-    mask2 = 1 - (1 << 30); \
-    VEC_DATA_TYPE(long, size) \
-    is_positive_or_zero = ab_64 >= 0; \
-    VEC_DATA_TYPE(long, size) \
-    nudge = select(mask2, mask1, is_positive_or_zero); \
-    VEC_DATA_TYPE(long, size) \
-    mask = 1ll << 31; \
-    VEC_DATA_TYPE(int, size) \
-    ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
-    return select(ab_x2_high32, INT_MAX, overflow); \
+#define ASYMM_MULT_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    VEC_DATA_TYPE(int, size) \
+    overflow = a == b && a == INT_MIN; \
+    VEC_DATA_TYPE(long, size) \
+    a_64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b_64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    ab_64 = a_64 * b_64; \
+    /* Revert COMPMID-907 */ \
+    VEC_DATA_TYPE(long, size) \
+    mask1 = 1 << 30; \
+    VEC_DATA_TYPE(long, size) \
+    mask2 = 1 - (1 << 30); \
+    VEC_DATA_TYPE(long, size) \
+    is_positive_or_zero = ab_64 >= 0; \
+    VEC_DATA_TYPE(long, size) \
+    nudge = select(mask2, mask1, is_positive_or_zero); \
+    VEC_DATA_TYPE(long, size) \
+    mask = 1ll << 31; \
+    VEC_DATA_TYPE(int, size) \
+    ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
+    return select(ab_x2_high32, INT_MAX, overflow); \
   }
 
 /** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
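asymm_mult is the usual saturating rounding doubling high multiply from gemmlowp: it returns the high 32 bits of 2*a*b with rounding, and saturates only the single case a == b == INT_MIN. A scalar C++ sketch of the macro's logic (illustrative, not part of the patch):

#include <cstdint>
#include <limits>

int32_t asymm_mult(int32_t a, int32_t b)
{
  if (a == b && a == std::numeric_limits<int32_t>::min())
    return std::numeric_limits<int32_t>::max(); // 2*INT_MIN*INT_MIN would overflow
  const int64_t ab_64 = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  // round half away from zero before taking the high 32 bits of 2*a*b
  const int64_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
  return static_cast<int32_t>((ab_64 + nudge) / (int64_t{1} << 31));
}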
@@ -186,32 +186,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  *
  * @return Result in fixed-point format Q0.
  */
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) \
-      asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
-                                                                              a) \
-  { \
-    const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
-    const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
-    const int k_fractional_bits = 31; \
-    VEC_DATA_TYPE(int, size) \
-    x = a + (1 << (k_fractional_bits - 3)); \
-    VEC_DATA_TYPE(int, size) \
-    x2 = ASYMM_MULT(x, x, size); \
-    VEC_DATA_TYPE(int, size) \
-    x3 = ASYMM_MULT(x2, x, size); \
-    VEC_DATA_TYPE(int, size) \
-    x4 = ASYMM_MULT(x2, x2, size); \
-    VEC_DATA_TYPE(int, size) \
-    x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
-    VEC_DATA_TYPE(int, size) \
-    x4_over_24_plus_x3_over_6_plus_x2 = \
-        ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
-    VEC_DATA_TYPE(int, size) \
-    x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
-        ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
-    return constant_term + \
-           ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+                                                                          a) \
+  { \
+    const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+    const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+    const int k_fractional_bits = 31; \
+    VEC_DATA_TYPE(int, size) \
+    x = a + (1 << (k_fractional_bits - 3)); \
+    VEC_DATA_TYPE(int, size) \
+    x2 = ASYMM_MULT(x, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x3 = ASYMM_MULT(x2, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4 = ASYMM_MULT(x2, x2, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2 = \
+      ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+      ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+    return constant_term + \
+           ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
   }
 
 /** Each bit of the result is set to the corresponding bit of either then_val or
@@ -263,15 +263,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
 
 #define EXP_BARREL_SHIFTER_IMPL(size) \
   inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
-      VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
-      int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+    int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
   { \
     if (k_integer_bits > exponent) \
     { \
       const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
       return ASYMM_SELECT_USING_MASK( \
-          ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
-          ASYMM_MULT(result, fp_multiplier, size), result, size); \
+        ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+        ASYMM_MULT(result, fp_multiplier, size), result, size); \
     } \
     \
     return result; \
@@ -285,7 +285,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  */
 #define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
   inline VEC_DATA_TYPE(int, size) \
-      asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+    asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
   { \
     const int k_fractional_bits = 31 - k_integer_bits; \
     VEC_DATA_TYPE(int, size) \
@@ -298,7 +298,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
     a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
     VEC_DATA_TYPE(int, size) \
     result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
-        a_mod_quarter_minus_one_quarter_scaled, size); \
+      a_mod_quarter_minus_one_quarter_scaled, size); \
     VEC_DATA_TYPE(int, size) \
     remainder = a_mod_quarter_minus_one_quarter - a; \
     \
@@ -312,10 +312,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
                                 remainder, size); \
     result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
                                 remainder, size); \
-    result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
-                                size); \
     result = \
-        EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+      EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
+    result = \
+      EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
     \
     if (k_integer_bits > 5) \
     { \
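The magic numbers in these two macros are Q0.31 fixed-point encodings: constant_term and constant_1_over_3 encode exp(-1/8) and 1/3, and each barrel-shifter multiplier encodes exp(-2^k) for its exponent. A small C++ check (an editor's sanity test, not from the patch; a tolerance of 1 covers rounding-direction differences):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>

int main()
{
  // encode v in Q0.31: round(v * 2^31)
  auto q31 = [](double v) { return static_cast<int64_t>(std::llround(v * 2147483648.0)); };
  auto near = [](int64_t a, int64_t b) { return std::llabs(a - b) <= 1; };
  assert(near(q31(std::exp(-0.125)), 1895147668)); // constant_term
  assert(near(q31(1.0 / 3.0), 715827883));         // constant_1_over_3
  assert(near(q31(std::exp(-4.0)), 39332535));     // EXP_BARREL_SHIFTER, exponent +2
  assert(near(q31(std::exp(-8.0)), 720401));       // exponent +3
  assert(near(q31(std::exp(-16.0)), 242));         // exponent +4
  return 0;
}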
@@ -335,27 +335,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  *
  * @return Arithmetic left or right shift.
  */
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) \
-      asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
-  { \
-    if (exponent < 0) \
-    { \
-      return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
-    } \
-    \
-    const VEC_DATA_TYPE(int, size) min = INT_MIN; \
-    const VEC_DATA_TYPE(int, size) max = INT_MAX; \
-    int threshold = ((1 << (31 - exponent)) - 1); \
-    VEC_DATA_TYPE(int, size) \
-    positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
-    VEC_DATA_TYPE(int, size) \
-    negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
-    VEC_DATA_TYPE(int, size) \
-    result = x << exponent; \
-    result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
-    result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
-    return result; \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+  { \
+    if (exponent < 0) \
+    { \
+      return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+    } \
+    \
+    const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+    const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+    int threshold = ((1 << (31 - exponent)) - 1); \
+    VEC_DATA_TYPE(int, size) \
+    positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    result = x << exponent; \
+    result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+    result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+    return result; \
   }
 
 /** Calculates (a+b)/2, rounded to the nearest integer.
@@ -365,21 +365,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  *
 * @return (a+b)/2, rounded to the nearest integer.
  */
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) \
-      asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
-  { \
-    VEC_DATA_TYPE(long, size) \
-    a64 = convert_long##size(a); \
-    VEC_DATA_TYPE(long, size) \
-    b64 = convert_long##size(b); \
-    VEC_DATA_TYPE(long, size) \
-    sum = a64 + b64; \
-    const VEC_DATA_TYPE(long, size) one = 1; \
-    const VEC_DATA_TYPE(long, size) minus_one = -1; \
-    VEC_DATA_TYPE(long, size) \
-    sign = select(minus_one, one, sum >= 0); \
-    return convert_int##size((sum + sign) / 2); \
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    VEC_DATA_TYPE(long, size) \
+    a64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    sum = a64 + b64; \
+    const VEC_DATA_TYPE(long, size) one = 1; \
+    const VEC_DATA_TYPE(long, size) minus_one = -1; \
+    VEC_DATA_TYPE(long, size) \
+    sign = select(minus_one, one, sum >= 0); \
+    return convert_int##size((sum + sign) / 2); \
   }
 
 /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
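asymm_rounding_half_sum widens to 64 bits so the addition cannot overflow, then rounds the halving away from zero. A scalar C++ sketch (illustrative, not part of the patch):

#include <cstdint>

int32_t rounding_half_sum(int32_t a, int32_t b)
{
  const int64_t sum = static_cast<int64_t>(a) + static_cast<int64_t>(b);
  const int64_t sign = sum >= 0 ? 1 : -1; // round half away from zero
  return static_cast<int32_t>((sum + sign) / 2);
}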
@@ -390,7 +390,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
  */
 #define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
   inline VEC_DATA_TYPE(int, size) \
-      asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
   { \
     const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
     const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
@@ -462,14 +462,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
 #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
   asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
 
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
-  inline VEC_DATA_TYPE(int, size) \
-      multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
-  { \
-    const int left_shift = shift > 0 ? shift : 0; \
-    const int right_shift = shift > 0 ? 0 : -shift; \
-    return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
-                                         right_shift, size); \
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+  { \
+    const int left_shift = shift > 0 ? shift : 0; \
+    const int right_shift = shift > 0 ? 0 : -shift; \
+    return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
+                                         right_shift, size); \
   }
 
 #define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
   multiply_by_quantized_multiplier##size(input, qmul, shift)
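MULTIPLY_BY_QUANTIZED_MULTIPLIER splits its shift argument: a positive shift is applied to the input before the fixed-point multiply, a negative one becomes a rounding divide afterwards. A scalar C++ sketch (illustrative; asymm_mult is the sketch given earlier, and rounding_divide_by_pow2 mirrors ASYMM_ROUNDING_DIVIDE_BY_POW2):

#include <cstdint>

int32_t asymm_mult(int32_t a, int32_t b); // see the ASYMM_MULT sketch above

// divide by 2^exponent, rounding half away from zero
int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
  const int32_t mask = (int32_t{1} << exponent) - 1;
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int shift)
{
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  return rounding_divide_by_pow2(asymm_mult(input * (1 << left_shift), qmul), right_shift);
}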
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
index 014842680..96a243110 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
@@ -41,7 +41,7 @@
 #include "helpers.h"
 
 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
-    defined(DIM_Y) && defined(DIM_Z)
+  defined(DIM_Y) && defined(DIM_Z)
 /** This function normalizes the input 2D tensor across the first dimension with respect to mean and
  * standard deviation of the same dimension.
  *
@@ -108,14 +108,14 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
                                         TENSOR4D_DECLARATION(output)
 #endif /* IN_PLACE */
 #ifdef GAMMA
-                                        ,
+                                          ,
                                         VECTOR_DECLARATION(gamma)
 #endif // GAMMA
 #ifdef BETA
-                                        ,
+                                          ,
                                         VECTOR_DECLARATION(beta)
 #endif // BETA
-                                        )
+)
 {
   Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
 #ifndef IN_PLACE
@@ -213,12 +213,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
       for (int i_h = 0; i_h < DIM_Z; ++i_h)
       {
         __global DATA_TYPE *input_address =
-            (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+          (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
 #ifdef IN_PLACE
         __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
         __global DATA_TYPE *output_address =
-            (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+          (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
 #endif /* IN_PLACE */
         *(output_address) = (*(input_address)-mean) * multip + beta;
       }
@@ -231,12 +231,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
     for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
     {
       __global DATA_TYPE *input_address =
-          (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+        (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 #ifdef IN_PLACE
       __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
       __global DATA_TYPE *output_address =
-          (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+        (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 #endif /* IN_PLACE */
 
       VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -251,12 +251,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
     for (; x < DIM_X; ++x)
     {
      __global DATA_TYPE *input_address =
-          (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+        (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 #ifdef IN_PLACE
      __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
      __global DATA_TYPE *output_address =
-          (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+        (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 #endif /* IN_PLACE */
       *(output_address) = (*(input_address)-mean) * multip + beta;
     }
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
index 3943fc4c2..abbfbd275 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
@@ -114,8 +114,8 @@ __kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION
   (val, 0, (__global DATA_TYPE *)output.ptr);
 #else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
   *((__global DATA_TYPE *)(output.ptr)) =
-      ((DATA_TYPE)(*((__global int *)(input.ptr)))) *
-      *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier);
+    ((DATA_TYPE)(*((__global int *)(input.ptr)))) *
+    *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier);
 #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
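instance_normalization_ex normalizes each (batch, channel) plane by that plane's own mean and variance, then applies the optional GAMMA/BETA affine transform. A reference C++ sketch of the per-plane math (the function name and the flattened-plane layout are assumptions for illustration):

#include <cmath>
#include <vector>

void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
  double sum = 0.0, sum_sq = 0.0;
  for (float v : plane)
  {
    sum += v;
    sum_sq += static_cast<double>(v) * v;
  }
  const double mean = sum / plane.size();
  const double var = sum_sq / plane.size() - mean * mean;
  // multip plays the role of gamma / sqrt(var + EPSILON) in the kernel
  const float multip = gamma / std::sqrt(static_cast<float>(var) + epsilon);
  for (float &v : plane)
    v = (v - static_cast<float>(mean)) * multip + beta;
}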
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
index c274aba62..784a8d6aa 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
@@ -206,16 +206,16 @@ __kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLAR
 
 #if AXIS == 0
   *(__global DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) =
-      *((__global const DATA_TYPE *)on_value_ptr);
+    *((__global const DATA_TYPE *)on_value_ptr);
 #elif AXIS == 1
   *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) =
-      *((__global const DATA_TYPE *)on_value_ptr);
+    *((__global const DATA_TYPE *)on_value_ptr);
 #elif AXIS == 2
   *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) =
-      *((__global const DATA_TYPE *)on_value_ptr);
+    *((__global const DATA_TYPE *)on_value_ptr);
 #elif AXIS == 3
   *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) =
-      *((__global const DATA_TYPE *)on_value_ptr);
+    *((__global const DATA_TYPE *)on_value_ptr);
 #endif // AXIS
 }
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
index 76fda9041..532000e9e 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -138,7 +138,7 @@ __kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARAT
 
   // Multiply with a multiplier smaller than 1
   out_val =
-      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+    ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
   out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
 
   VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
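one_hot_only_on_value relies on the output having been pre-filled with off_value, so each work item writes only on_value at its index along AXIS. A C++ sketch of the semantics for an axis-0, [n, depth] row-major layout (the layout and names are assumptions for illustration):

#include <cstddef>
#include <vector>

std::vector<float> one_hot(const std::vector<int> &indices, int depth,
                           float on_value, float off_value)
{
  std::vector<float> out(indices.size() * depth, off_value); // the "memset" pass
  for (size_t i = 0; i < indices.size(); ++i)
  {
    const int idx = indices[i];
    if (idx >= 0 && idx < depth) // out-of-range indices keep off_value
      out[i * depth + idx] = on_value;
  }
  return out;
}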
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
index 4ae9adb0b..c829f264d 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
@@ -116,7 +116,7 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc
 
   // Create scale vector
   const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale =
-      *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));
+    *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));
 
   // Quantize
   VEC_DATA_TYPE(int, VEC_SIZE)
@@ -127,10 +127,10 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc
   (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
 #else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
   *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(
-      CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) /
-                      (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))),
-                  int),
-      MIN_QUANT_VAL, MAX_QUANT_VAL);
+    CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) /
+                  (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))),
+                int),
+    MIN_QUANT_VAL, MAX_QUANT_VAL);
 #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
 #endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
index 832ac1270..d0ef31b20 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -100,12 +100,14 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(o
   Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
   int indices[4] = {
-      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
-      get_global_id(2) / DEPTH_OUT,
+    get_global_id(0),
+    get_global_id(1),
+    get_global_id(2) % DEPTH_OUT,
+    get_global_id(2) / DEPTH_OUT,
   };
 
   DATA_TYPE value =
-      *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+    *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
   for (int i = 1; i < dim; ++i)
   {
     indices[axis] = i;
@@ -186,16 +188,18 @@ __kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(
   Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
   int indices[4] = {
-      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
-      get_global_id(2) / DEPTH_OUT,
+    get_global_id(0),
+    get_global_id(1),
+    get_global_id(2) % DEPTH_OUT,
+    get_global_id(2) / DEPTH_OUT,
   };
 
   DATA_TYPE sum_value = (DATA_TYPE)0;
   for (int i = 0; i < dim; ++i)
   {
     indices[axis] = i;
-    sum_value += *(
-        (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+    sum_value +=
+      *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
   }
 
 #if OP_CODE == 3 // REDUCE_SUM
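Both reduce kernels are enqueued on a 3D grid, so the fourth output coordinate is packed into get_global_id(2) and unpacked with % DEPTH_OUT and / DEPTH_OUT, exactly as the indices[4] initializer shows. In C++ terms (illustrative, not part of the patch):

#include <array>

// gid2 = w * depth_out + z, so z = gid2 % depth_out and w = gid2 / depth_out
std::array<int, 4> unpack_indices(int gid0, int gid1, int gid2, int depth_out)
{
  return {gid0, gid1, gid2 % depth_out, gid2 / depth_out};
}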
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
index 047004d5e..45307fad7 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
@@ -63,10 +63,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp
 {
   ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
   ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8,
+                                                       DataType::QASYMM8_SIGNED, DataType::S32,
                                                        DataType::F16, DataType::F32);
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
-                                      op != ReductionOperation::ARG_IDX_MIN,
+                                    op != ReductionOperation::ARG_IDX_MIN,
                                   "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
                                   "Reduction axis greater than max number of dimensions");
@@ -101,13 +102,13 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input,
   output_shape.set(axis, 1);
   DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32;
   auto_init_if_empty(*output, input->clone()
-                                  ->set_tensor_shape(output_shape)
-                                  .set_data_type(output_data_type)
-                                  .reset_padding()
-                                  .set_is_resizable(true));
+                                ->set_tensor_shape(output_shape)
+                                .set_data_type(output_data_type)
+                                .reset_padding()
+                                .set_is_resizable(true));
 
-  Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input),
-                                    Steps(vector_size));
+  Window win =
+    calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size));
   bool window_changed = false;
 
   switch (axis)
@@ -137,15 +138,15 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input,
   }
 
   Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
+                 ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                 : Status{};
   return std::make_tuple(err, win);
 }
 } // namespace
 
 CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx()
-    : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0),
-      _op(ReductionOperation::ARG_IDX_MAX)
+  : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0),
+    _op(ReductionOperation::ARG_IDX_MAX)
 {
 }
 
@@ -155,11 +156,11 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
   ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr,
-                         output->info(), axis, op));
+    validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr,
+                       output->info(), axis, op));
   auto win_config = validate_and_configure_window(
-      input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis,
-      op);
+    input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis,
+    op);
   ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
   _input = input;
@@ -213,7 +214,7 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor
       ARM_COMPUTE_ERROR("Not supported");
   }
   _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
-      "arg_min_max_ex_" + kernel_axis_name, build_opts.options()));
+    "arg_min_max_ex_" + kernel_axis_name, build_opts.options()));
 
   // Configure kernel window
   ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
@@ -225,8 +226,8 @@ Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITenso
 {
   ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
   ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
-      input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr,
-      output->clone().get(), axis, op)));
+    input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr,
+    output->clone().get(), axis, op)));
   return Status{};
 }
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index fbc76f5e1..ffa2c5a67 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -55,7 +55,7 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
                            const ITensorInfo *output)
 {
   const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+    TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
 
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
@@ -68,15 +68,15 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
+      detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+      "Wrong shape for output");
   }
   return Status{};
 }
 } // namespace
 
 CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+  : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
 }
 
@@ -111,13 +111,13 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor
 
   build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
   build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
 
   _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 
   const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+    ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
 
   const ValidRegion &valid_region = broadcast_pair.second;
 
@@ -130,8 +130,8 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor
   AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
   update_window_and_padding(win_input1, input1_access) ||
-      update_window_and_padding(win_input2, input2_access) ||
-      update_window_and_padding(win, output_access);
+    update_window_and_padding(win_input2, input2_access) ||
+    update_window_and_padding(win, output_access);
 
   output_access.set_valid_region(win, valid_region);
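validate_parameters accepts any pair of shapes that are broadcast compatible: each dimension must match or be 1, and the output takes the larger extent. A generic C++ sketch of that rule (missing trailing dimensions are treated as 1, as in TensorShape; the function name is an assumption for illustration):

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<size_t> broadcast_shape(std::vector<size_t> a, std::vector<size_t> b)
{
  if (a.size() < b.size())
    std::swap(a, b);
  b.resize(a.size(), 1); // pad the shorter shape with 1s
  std::vector<size_t> out(a.size());
  for (size_t d = 0; d < a.size(); ++d)
  {
    if (a[d] != b[d] && a[d] != 1 && b[d] != 1)
      return {}; // not broadcast compatible; total_size() == 0 signals the error
    out[d] = std::max(a[d], b[d]);
  }
  return out;
}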
@@ -151,7 +151,7 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
   if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
   {
     can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+      (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
     for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
     {
       can_collapse = (in_shape1[d] == in_shape2[d]);
@@ -160,13 +160,13 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
 
   bool has_collapsed = false;
   Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
+    can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                 : window;
 
   const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
   const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+    has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
 
   Window slice = collapsed.first_slice_window_3D();
   Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
@@ -189,9 +189,9 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
 BorderSize CLBinaryLogicalOpKernel::border_size() const
 {
   const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    _output->info()->dimension(0) -
+    std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
   const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
   return BorderSize(0, border, 0, 0);
 }
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
index 6e0bcde7f..3f2ae357d 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
@@ -103,7 +103,7 @@ void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output)
   // Create kernel
   const std::string kernel_name = "cast_bool";
   _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+    CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
 
   // Configure kernel
   ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index 67aaf2db6..e4c617c8d 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -61,14 +61,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
   input_access.set_valid_region(win, output->valid_region());
 
   Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
+                 ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                 : Status{};
   return std::make_pair(err, win);
 }
 } // namespace
 
 CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
-    : _input(nullptr), _output(nullptr), _lookups(nullptr)
+  : _input(nullptr), _output(nullptr), _lookups(nullptr)
 {
 }
 
@@ -77,8 +77,8 @@ Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensor
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+    DataType::U32, DataType::S32, DataType::F16, DataType::F32);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
 
   ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -108,8 +108,8 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu
   build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
 
   // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
 
   // Configure kernel window
   auto win_config = validate_and_configure_window(input->info(), output->info());
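The embedding-lookup kernel gathers whole rows of the input table by the S32 indices in lookups. A C++ sketch of the semantics (row-major layout, names, and in-range indices are assumptions for illustration):

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> embedding_lookup(const std::vector<float> &table, size_t row_width,
                                    const std::vector<int32_t> &lookups)
{
  std::vector<float> out;
  out.reserve(lookups.size() * row_width);
  for (int32_t row : lookups) // copy one table row per lookup index
    out.insert(out.end(), table.begin() + row * row_width,
               table.begin() + (row + 1) * row_width);
  return out;
}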
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 3bfe3e407..8b5885225 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -62,15 +62,15 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
   ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
   ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+    DataType::U32, DataType::S32, DataType::F16, DataType::F32);
 
   if (output->total_size() != 0)
   {
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
-        input->tensor_shape(), indices->tensor_shape(), actual_axis);
+      input->tensor_shape(), indices->tensor_shape(), actual_axis);
     ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
   }
 
@@ -86,7 +86,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
   const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
   std::unique_ptr<ITensorInfo> output_info = input->clone();
   output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex(
-      input->tensor_shape(), indices->tensor_shape(), actual_axis));
+    input->tensor_shape(), indices->tensor_shape(), actual_axis));
   // Output auto initialization if not yet initialized
   auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
 
@@ -100,7 +100,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 CLGatherExKernel::CLGatherExKernel()
-    : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+  : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
 {
 }
 
@@ -109,11 +109,11 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
   ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), indices->info(), output->info(), axis));
+    validate_arguments(input->info(), indices->info(), output->info(), axis));
 
   // Configure kernel window
   auto win_config =
-      validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
+    validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
   ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
   _input = input;
@@ -133,7 +133,7 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice
 
   // Create kernel
   _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
+    CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
   ICLKernel::configure_internal(win_config.second);
 }
 
@@ -144,7 +144,7 @@ Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *i
   ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                             indices->clone().get(),
                                                             output->clone().get(), axis)
-                                  .first);
+                                .first);
   return Status{};
}
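wrap_around turns a possibly negative axis into an absolute one, TensorFlow-style, before the gather shape is computed. A C++ sketch (illustrative, not the library's implementation):

#include <cassert>

int wrap_around(int axis, int num_dims)
{
  const int wrapped = axis < 0 ? axis + num_dims : axis; // -1 means the last dimension
  assert(wrapped >= 0 && wrapped < num_dims);
  return wrapped;
}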
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index 930e7c944..f0a761b97 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -61,8 +61,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
   input_access.set_valid_region(win, output->valid_region());
 
   Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
+                 ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                 : Status{};
   return std::make_pair(err, win);
 }
 } // namespace
@@ -78,8 +78,8 @@ Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+    DataType::U32, DataType::S32, DataType::F16, DataType::F32);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
@@ -102,7 +102,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
                                         const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
 {
   ARM_COMPUTE_ERROR_THROW_ON(
-      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+    validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
 
   _lookups = lookups;
   _keys = keys;
@@ -113,7 +113,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
   // Make _lookup_indices tensor
   _lookup_indices = support::cpp14::make_unique<CLTensor>();
   _lookup_indices->allocator()->init(
-      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+    TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
   _lookup_indices->allocator()->allocate();
 
   // Set kernel build options
@@ -127,8 +127,8 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
   build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
 
   // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
 
   // Configure kernel window
   auto win_config = validate_and_configure_window(input->info(), output->info());
@@ -148,7 +148,7 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
 
   // Set values of hits
   const int32_t *lookups_buf =
-      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+    reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
   const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
   uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
   int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
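Before the kernel launches, run() resolves each lookup key against keys on the host, filling hits and the internal _lookup_indices tensor. A C++ sketch of that preprocessing (the -1 miss marker and the map-based search are assumptions for illustration):

#include <cstdint>
#include <unordered_map>
#include <vector>

void resolve_lookups(const std::vector<int32_t> &lookups, const std::vector<int32_t> &keys,
                     std::vector<int32_t> &lookup_indices, std::vector<uint8_t> &hits)
{
  std::unordered_map<int32_t, int32_t> key_to_row;
  for (int32_t i = 0; i < static_cast<int32_t>(keys.size()); ++i)
    key_to_row.emplace(keys[i], i);

  lookup_indices.resize(lookups.size());
  hits.resize(lookups.size());
  for (size_t i = 0; i < lookups.size(); ++i)
  {
    auto it = key_to_row.find(lookups[i]);
    hits[i] = it != key_to_row.end();                // 1 if the key exists in `keys`
    lookup_indices[i] = hits[i] ? it->second : -1;   // -1: assumed miss marker
  }
}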
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index 61c14d271..dab6480b2 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -94,8 +94,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
 } // namespace
 
 CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx()
-    : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
-      _run_in_place(false)
+  : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
+    _run_in_place(false)
 {
 }
 
@@ -132,7 +132,7 @@ void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor
 
   // Create kernel
   _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
+    CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
 
   // Configure kernel window
   auto win_config = validate_and_configure_window(_input->info(), _output->info());
@@ -147,7 +147,7 @@ Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
 {
   ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
   ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
-      input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+    input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
   return Status{};
 }
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
index 6b27c9917..1d4b141a7 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -99,7 +99,7 @@ std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *inpu
 } // namespace
 
 CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel()
-    : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+  : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
 {
 }
 
@@ -108,7 +108,7 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
   ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), scale_factor->info(), output->info()));
+    validate_arguments(input->info(), scale_factor->info(), output->info()));
 
   _input = input;
   _scale_factor = scale_factor;
@@ -123,9 +123,9 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen
   Window win = calculate_max_window(*output->info());
   if (multi_access_x)
   {
-    win.set(Window::DimX,
-            Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
-                              vec_size_x));
+    win.set(
+      Window::DimX,
+      Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
   }
   ICLKernel::configure_internal(win);
 
@@ -134,11 +134,11 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen
   build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
   build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
   build_opts.add_option_if(
-      multi_access_x, "-DLAST_ACCESSED_X=" +
-                          support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    multi_access_x, "-DLAST_ACCESSED_X=" +
+                      support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
 
   _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options()));
+    CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options()));
 }
 
 Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
@@ -147,7 +147,7 @@ Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
 {
   ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
   ARM_COMPUTE_RETURN_ON_ERROR(
-      std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+    std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
   return Status{};
 }
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index 643c8b110..ee633d437 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -80,9 +80,9 @@ void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
   std::set<std::string> build_opts;
   build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
   build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
   _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
 
   // Configure window
   Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
index 35d70d689..0b8e7cc41 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
@@ -65,7 +65,7 @@ inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *
   {
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
     TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
-        indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+      indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
     ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
   }
   return Status{};
@@ -79,7 +79,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
   const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
   // Output auto initialization if not yet initialized
   TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
-      indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+    indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
   auto_init_if_empty((*output), output_shape, 1, on_value->data_type());
   // Create window
   Window win = calculate_max_window(*output, Steps());
@@ -88,8 +88,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
 }
 } // namespace
 CLOneHotKernel::CLOneHotKernel()
-    : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
-      _is_off_value_memset(false)
+  : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
+    _is_off_value_memset(false)
 {
 }
 void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
@@ -114,10 +114,10 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor
                                       ICLTensor *output, int depth, int axis)
 {
   ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
+    validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
   // Configure kernel window
   auto win_config =
-      validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
+    validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
   ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
   if (_is_off_value_memset)
   {
@@ -131,7 +131,7 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor
   // Set build options
   CLBuildOptions build_opts;
   build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(
-                                             data_size_from_type(on_value->info()->data_type())));
+                                           data_size_from_type(on_value->info()->data_type())));
   build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis));
   build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
   build_opts.add_option("-DOUTPUT_DIM_Z=" +
@@ -139,7 +139,7 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor
   // Create kernel
   const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot";
   _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+    CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
   ICLKernel::configure_internal(win_config.second);
 }
 Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
@@ -153,7 +153,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o
   ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
                                                             on_value->clone().get(),
                                                             output->clone().get(), depth, axis)
-                                  .first);
+                                .first);
   return Status{};
 }
 Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
@@ -163,7 +163,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o
   ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
                                                             on_value->clone().get(),
                                                             output->clone().get(), depth, axis)
-                                  .first);
+                                .first);
   return Status{};
 }
 void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
index 1a7a18cfa..b417a7103 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -87,9 +87,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 
   if (multi_access_x)
   {
-    win.set(Window::DimX,
-            Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
-                              vec_size_x));
+    win.set(
+      Window::DimX,
+      Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
   }
 
   Coordinates coord;
@@ -101,7 +101,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel()
-    : _input(nullptr), _scale_factor(nullptr), _output(nullptr)
+  : _input(nullptr), _scale_factor(nullptr), _output(nullptr)
 {
 }
 
@@ -110,7 +110,7 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
   ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), scale_factor->info(), output->info()));
+    validate_arguments(input->info(), scale_factor->info(), output->info()));
 
   _input = input;
   _scale_factor = scale_factor;
@@ -132,11 +132,11 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT
   build_opts.add_option("-DDATA_TYPE_OUT=" +
                         get_cl_type_from_data_type(output->info()->data_type()));
   build_opts.add_option_if(
-      multi_access_x, "-DLAST_ACCESSED_X=" +
-                          support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+    multi_access_x,
+    "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
 
   _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options()));
+    CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options()));
 }
 
 Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
@@ -145,7 +145,7 @@ Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
 {
   ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
   ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    validate_and_configure_window(input->clone().get(), output->clone().get()).first);
   return Status{};
 }
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index 3fbebf25a..3906009c2 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -145,7 +145,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu
 
   // Create kernel
   _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 
   // Configure kernel window
   Window win = calculate_max_window(*output_info, Steps());
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
index 8d8853c81..4a6374444 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -94,8 +94,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
   output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
 
   Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
+                 ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                 : Status{};
   return std::make_tuple(err, win);
 }
 } // namespace
@@ -115,7 +115,7 @@ void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *outp
 
   // Create kernel
   _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts));
+    CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts));
 
   auto win_config = validate_and_configure_window(input->info(), output->info());
 
@@ -128,7 +128,7 @@ Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITenso
 {
   ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
   ARM_COMPUTE_RETURN_ON_ERROR(
-      std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+    std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
   return Status{};
 }
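scale_factor_symm8 produces the per-row scale that quantization_symm8 later divides by. For symmetric int8 quantization that scale is max|x| / 127, assuming MIN_QUANT_VAL/MAX_QUANT_VAL of -127/127 (the actual bounds come from build options, so this is an assumption). A C++ sketch:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

float scale_factor_symm8(const std::vector<float> &row)
{
  float max_abs = 0.f;
  for (float v : row)
    max_abs = std::max(max_abs, std::fabs(v)); // largest magnitude in the row
  return max_abs / 127.f;
}

int8_t quantize_symm8(float v, float scale)
{
  const int q = static_cast<int>(std::nearbyint(v / scale)); // CONVERT_RTE analogue
  return static_cast<int8_t>(std::min(127, std::max(-127, q))); // CLAMP analogue
}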
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
index dfe5d59b0..c88bef6d7 100644
--- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
@@ -53,12 +53,12 @@ namespace
 using namespace arm_compute;
 template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
 void elementwise_op_templ(
-    const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-    OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
-    int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &,
-                          OutputScalarType *, const bool),
-    int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *,
-                     OutputScalarType *))
+  const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+  OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+  int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &,
+                        OutputScalarType *, const bool),
+  int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *,
+                   OutputScalarType *))
 {
   // Create input windows
   Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -88,26 +88,26 @@ void elementwise_op_templ(
     Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
     Iterator output(out, win);
 
-    execute_window_loop(win,
-                        [&](const Coordinates &) {
-                          auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-                          const auto non_broadcast_input_ptr =
-                              reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
-                          const InputScalarType broadcast_value =
-                              *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
-                          int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
-                                                    non_broadcast_input_ptr, broadcast_value,
-                                                    output_ptr, !is_broadcast_input_2);
-                          for (; x < window_end_x; ++x)
-                          {
-                            const auto a = *(non_broadcast_input_ptr + x);
-                            *(output_ptr + x) =
-                                (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
-                                               !is_broadcast_input_2 ? a : broadcast_value);
-                          }
-                        },
-                        broadcast_input, non_broadcast_input, output);
+    execute_window_loop(
+      win,
+      [&](const Coordinates &) {
+        auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+        const auto non_broadcast_input_ptr =
+          reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+        const InputScalarType broadcast_value =
+          *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+        int x =
+          (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+                            broadcast_value, output_ptr, !is_broadcast_input_2);
+        for (; x < window_end_x; ++x)
+        {
+          const auto a = *(non_broadcast_input_ptr + x);
+          *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+                                             !is_broadcast_input_2 ? a : broadcast_value);
+        }
+      },
+      broadcast_input, non_broadcast_input, output);
   }
   else
   {
@@ -119,24 +119,23 @@ void elementwise_op_templ(
     Iterator input2(in2, input2_win);
     Iterator output(out, win);
 
-    execute_window_loop(win,
-                        [&](const Coordinates &) {
-                          auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-                          const auto input1_ptr =
-                              reinterpret_cast<const InputScalarType *>(input1.ptr());
-                          const auto input2_ptr =
-                              reinterpret_cast<const InputScalarType *>(input2.ptr());
-
-                          int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
-                                               input1_ptr, input2_ptr, output_ptr);
-                          for (; x < window_end_x; ++x)
-                          {
-                            const auto a = *(input1_ptr + x);
-                            const auto b = *(input2_ptr + x);
-                            *(output_ptr + x) = (*scalar_func)(a, b);
-                          }
-                        },
-                        input1, input2, output);
+    execute_window_loop(
+      win,
+      [&](const Coordinates &) {
+        auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+        const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+        int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr,
+                             output_ptr);
+        for (; x < window_end_x; ++x)
+        {
+          const auto a = *(input1_ptr + x);
+          const auto b = *(input2_ptr + x);
+          *(output_ptr + x) = (*scalar_func)(a, b);
+        }
+      },
+      input1, input2, output);
   }
 }
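elementwise_op_templ encodes a common NEON pattern: the vectorized callback processes as many elements as fit whole vectors and returns how far it got, and a scalar loop finishes the tail. A C++ sketch with the vector step simulated by a fixed stride (illustrative, not part of the patch):

#include <cstddef>
#include <vector>

template <typename T, typename F>
void elementwise(const std::vector<T> &a, const std::vector<T> &b, std::vector<T> &out, F func)
{
  const size_t step = 4; // stands in for the NEON vector width
  size_t x = 0;
  for (; x + step <= a.size(); x += step) // the "neon_func" part
    for (size_t i = 0; i < step; ++i)
      out[x + i] = func(a[x + i], b[x + i]);
  for (; x < a.size(); ++x) // scalar left-over part
    out[x] = func(a[x], b[x]);
}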
std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) { std::string function_to_call("op_"); function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; @@ -184,8 +186,8 @@ std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) { static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { - {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, - {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; return configure_func(input1, input2, output, map_function); } @@ -223,7 +225,7 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); const TensorShape out_shape = - TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -232,8 +234,8 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp index 12017e543..f935596e6 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -129,125 +129,125 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) case DataType::S8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8( - texels_u8, vdupq_n_u8(true_val)))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_s8(output_ptr + x, + vreinterpretq_s8_u8(vandq_u8(texels_u8, vdupq_n_u8(true_val)))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S16: { /* Up-conversion U8 -> S16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = 
reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s16(output_ptr + x, texels.val[0]); - vst1q_s16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s16(output_ptr + x, texels.val[0]); + vst1q_s16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = 
static_cast<uint32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(output_ptr + x + 12, - vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); - *(output_ptr + x) = static_cast<float>(in); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); + *(output_ptr + x) = static_cast<float>(in); + } + }, + input, output); break; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -255,86 +255,87 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) { /* Up-conversion U8 -> F16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + int x 
= window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::U8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::U16: { /* Up-conversion U8 -> U16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), - vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; - - vst1q_u16(output_ptr + x, texels.val[0]); - vst1q_u16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), + vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; + + vst1q_u16(output_ptr + x, texels.val[0]); + vst1q_u16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } default: diff --git 
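
For reference, every case in NECastBoolKernel::run above reduces to the same two steps: mask the boolean byte down to its lowest bit (vandq_u8 / vand_u8 with the true_val or mask_bool constant) and widen the 0/1 result to the destination type (vmovl_u8, vcvtq_f32_s32, and so on), sixteen lanes per iteration. A scalar equivalent, assuming true_val is the 0x01 mask the kernel uses:

#include <cstddef>
#include <cstdint>

// Scalar model of the bool cast: mask to 0/1, then convert to the target type.
template <typename Dst>
void cast_bool(const uint8_t *in, Dst *out, std::size_t n)
{
  constexpr uint8_t true_val = 1; // assumption: matches the kernel's mask constant
  for (std::size_t i = 0; i < n; ++i)
    out[i] = static_cast<Dst>(in[i] & true_val);
}
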
a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp index 091d38c56..e3a77c6b1 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -50,7 +50,7 @@ using namespace arm_compute; NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() - : _input(nullptr), _lookups(nullptr), _output(nullptr) + : _input(nullptr), _lookups(nullptr), _output(nullptr) { } @@ -79,8 +79,8 @@ Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); @@ -119,16 +119,17 @@ void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const int32_t lookup = *reinterpret_cast<int32_t *>( - _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const int32_t lookup = + *reinterpret_cast<int32_t *>(_lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp index 93963a504..c9f0799d4 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -71,7 +71,7 @@ template <typename U> void validate_indices(const ITensor *indices) } // namespace NEGatherKernelEx::NEGatherKernelEx() - : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} + : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} { } @@ -85,36 +85,35 @@ inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadIn Iterator output_it(_output, window); execute_window_loop( - window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); - break; - case 2: - new_index = - *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); - break; - case 3: - new_index = *( - reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(0, new_index); - - 
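
Stripped of the window and iterator machinery, NEEmbeddingLookupKernel above is a batched row copy: each output slice receives the input row named by the corresponding lookup index. A minimal sketch assuming a flat row-major layout (the names here are illustrative, not the library's API):

#include <cstddef>
#include <cstdint>
#include <cstring>

// out row i = table row lookups[i]; row_bytes plays the role of
// dimension(0) * element_size() in the kernel's memcpy.
void embedding_lookup(const uint8_t *table, const int32_t *lookups, uint8_t *out,
                      std::size_t num_lookups, std::size_t row_bytes)
{
  for (std::size_t i = 0; i < num_lookups; ++i)
    std::memcpy(out + i * row_bytes, table + lookups[i] * row_bytes, row_bytes);
}
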
std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); } template <typename U> @@ -130,37 +129,36 @@ void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &inf Iterator output_it(_output, output_window); execute_window_loop( - output_window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank, _axis); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); - break; - case 2: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); - break; - case 3: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(_axis, new_index); - - std::copy_n(_input->ptr_to_element(gather_id), - _input->info()->dimension(0) * _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank, _axis); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr()); + }, + output_it); } void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, @@ -170,8 +168,8 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); _input = input; _indices = indices; @@ -217,7 +215,7 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I } // Output auto initialization if not yet initialized TensorShape 
output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); // Create window @@ -243,15 +241,15 @@ Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), axis); + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp index 30787c0a4..52b40e767 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -57,7 +57,7 @@ constexpr size_t NOT_HIT = 0xFFFFFFFF; } // namespace NEHashtableLookupKernel::NEHashtableLookupKernel() - : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} { } @@ -66,7 +66,7 @@ void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *k { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -92,8 +92,8 @@ Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); @@ -134,8 +134,8 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) const size_t lookup_dim = _output->info()->num_dimensions() - 1; const int const_0 = _output->info()->data_type() == DataType::QASYMM8 - ? _output->info()->quantization_info().uniform().offset - : 0; + ? 
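
The two gather paths above differ only in where the index coordinate lives: gather_0_axis rewrites coordinate 0 element by element, while gather_n_axis rewrites coordinate _axis and copies a whole innermost run per index. For the axis-0 case on a 2-D row-major tensor the operation collapses to this sketch (a hypothetical helper, not the kernel itself):

#include <cstddef>
#include <cstdint>
#include <cstring>

// out[i] = input[indices[i]] : one input row copied per gathered index.
void gather_axis0(const float *input, const int32_t *indices, float *out,
                  std::size_t num_indices, std::size_t row_len)
{
  for (std::size_t i = 0; i < num_indices; ++i)
    std::memcpy(out + i * row_len, input + indices[i] * row_len,
                row_len * sizeof(float));
}
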
_output->info()->quantization_info().uniform().offset + : 0; std::unordered_map<int32_t, size_t> key_index_map; for (size_t n = 0; n < _keys->info()->dimension(0); ++n) @@ -174,24 +174,24 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const auto lookup = lookup_indices.at(id[lookup_dim]); - if (lookup == NOT_HIT) - { - memset(output_it.ptr(), const_0, - _output->info()->dimension(0) * _output->info()->element_size()); - } - else - { - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - } - - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp index 49adf1462..4dc0f5535 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -63,7 +63,7 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma { /** NEON vector tag type. 
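
NEHashtableLookupKernel::run above first builds a key-to-row-index map, then serves each lookup from it, filling rows whose key is missing with const_0 (the QASYMM8 zero-point offset, or plain zero) and recording a hit/miss flag. A scalar sketch under those assumptions:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <unordered_map>

void hashtable_lookup(const int32_t *keys, std::size_t num_keys,
                      const int32_t *lookups, std::size_t num_lookups,
                      const uint8_t *input, uint8_t *output, uint8_t *hits,
                      std::size_t row_bytes, uint8_t miss_fill /* const_0 */)
{
  // Key -> row index, as in the kernel's key_index_map.
  std::unordered_map<int32_t, std::size_t> key_index;
  for (std::size_t n = 0; n < num_keys; ++n)
    key_index.emplace(keys[n], n);

  for (std::size_t i = 0; i < num_lookups; ++i)
  {
    const auto it = key_index.find(lookups[i]);
    hits[i] = (it != key_index.end()) ? 1 : 0;
    if (hits[i])
      std::memcpy(output + i * row_bytes, input + it->second * row_bytes, row_bytes);
    else
      std::memset(output + i * row_bytes, miss_fill, row_bytes);
  }
}
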
*/ using ExactTagType = - typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; // Clear X/Y dimensions on execution window as we handle the planes manually Window win = window; @@ -73,107 +73,107 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma constexpr int window_step_x = 16 / sizeof(T); const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); const auto channel_idx = - get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); Iterator input_it(input, win); execute_window_loop( - win, - [&](const Coordinates &id) { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast<T>(0.f); - auto sum_squares_h_w = static_cast<T>(0.f); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); - vec_sum_squares_h_w = - wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); - } - - auto vec2_sum_h_w = - wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), - wrapper::vgetlow(vec_sum_squares_h_w)); - for (int i = 0; i < window_step_x / 4; ++i) - { - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - } - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - const auto value = *(input_ptr + x); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - auto gamma_val = 1.0f; - if (gamma != nullptr) - { - gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); - } - const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); - auto beta_val = 0.0f; - if (beta != nullptr) - { - beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); - } - const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); - auto 
output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - vec_val = wrapper::vloadq(input_ptr + x); - vec_val = wrapper::vadd( - wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); - wrapper::vstore(output_ptr + x, vec_val); - } - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; - } - }, - input_plane_it, output_plane_it); - }, - input_it); + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = 
window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, @@ -199,8 +199,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - gamma->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), "Gamma's size must be the same as size of input's channel"); } @@ -208,8 +208,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - beta->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), "Beta's size must be the same as size of input's channel"); } @@ -234,8 +234,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() - : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), - _epsilon(1e-12) + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) { } @@ -251,7 +251,7 @@ void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *ou _epsilon = epsilon; ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); if (_input->info()->data_type() == DataType::F32) { @@ -282,7 +282,7 @@ Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? 
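
The NCHW loop above is a two-pass instance normalization per H*W plane: accumulate the sum and sum of squares (vectorized, with a pairwise-add reduction of the lanes), derive mean and variance as E[x^2] - E[x]^2, then apply out = (x - mean) * gamma / sqrt(var + epsilon) + beta. The same math, scalar:

#include <cmath>
#include <cstddef>

void instance_norm_plane(const float *in, float *out, std::size_t plane_size,
                         float gamma, float beta, float epsilon)
{
  float sum = 0.f, sum_sq = 0.f;
  for (std::size_t i = 0; i < plane_size; ++i)
  {
    sum += in[i];
    sum_sq += in[i] * in[i];
  }
  const float mean = sum / plane_size;
  const float var = sum_sq / plane_size - mean * mean; // E[x^2] - E[x]^2
  const float multip = gamma / std::sqrt(var + epsilon); // multip_h_w in the kernel
  for (std::size_t i = 0; i < plane_size; ++i)
    out[i] = (in[i] - mean) * multip + beta;
}
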
input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp index b92130cec..ad4728175 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -123,15 +123,17 @@ inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) const float32x4_t vscale = vdupq_n_f32(scale); const float32x4x4_t ret = {{ - vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), - vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), }}; return ret; } } // namespace NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -140,7 +142,7 @@ void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -180,25 +182,25 @@ template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &w Iterator output(_output, win_collapsed); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); - scale *= _multiplier; - - const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); - auto output_ptr = reinterpret_cast<T *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = input_ptr[x] * scale; - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); } void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp index 0a11eb509..0daff5c6a 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -101,8 +101,8 @@ bool isOnValue(U index, U depth) } // namespace NEOneHotKernel::NEOneHotKernel() - : 
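
NEMultiplyScaleFactorKernel::multiply above is the dequantize side of the hybrid path: each row of int32 accumulators is scaled back to float by that row's scale factor times a global multiplier. A scalar sketch, assuming a flat row-major layout:

#include <cstddef>
#include <cstdint>

void multiply_scale(const int32_t *in, const float *scale_factor, float *out,
                    std::size_t rows, std::size_t cols, float multiplier)
{
  for (std::size_t r = 0; r < rows; ++r)
  {
    const float scale = scale_factor[r] * multiplier; // per-row scale, as in the kernel
    for (std::size_t c = 0; c < cols; ++c)
      out[r * cols + c] = in[r * cols + c] * scale;
  }
}
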
_indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1}, - _output{nullptr}, _func{} + : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, + _off_value{nullptr}, _axis{-1}, _output{nullptr}, _func{} { } @@ -117,22 +117,22 @@ void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info) Iterator output_it(_output, output_window); const U off_value = *reinterpret_cast<U *>(_off_value->buffer()); execute_window_loop( - output_window, - [&](const Coordinates &id) { - std::fill_n(output_it.ptr(), - _output->info()->dimension(0) * _output->info()->element_size(), off_value); - Coordinates indices_id(id); - indices_id.remove(0); - const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(0, new_index); - std::copy_n(_on_value->buffer(), _output->info()->element_size(), - _output->ptr_to_element(onehot_id)); - } - }, - output_it); + output_window, + [&](const Coordinates &id) { + std::fill_n(output_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size(), + off_value); + Coordinates indices_id(id); + indices_id.remove(0); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(0, new_index); + std::copy_n(_on_value->buffer(), _output->info()->element_size(), + _output->ptr_to_element(onehot_id)); + } + }, + output_it); } template <typename U> @@ -142,22 +142,22 @@ inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo // Validate that the indices are not negative validate_depth<U>(_depth, _output, _axis); Iterator output_it(_output, window); - execute_window_loop(window, - [&](const Coordinates &id) { - Coordinates indices_id(id); - indices_id.remove(_axis); - const U new_index = - *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(_axis, new_index); - std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer() - : _off_value->buffer(), - _output->info()->element_size(), output_it.ptr()); - } - }, - output_it); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates indices_id(id); + indices_id.remove(_axis); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(_axis, new_index); + std::copy_n(static_cast<U>(id[_axis]) == new_index ? 
_on_value->buffer() + : _off_value->buffer(), + _output->info()->element_size(), output_it.ptr()); + } + }, + output_it); } void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth, @@ -215,7 +215,7 @@ Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *d const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR( - validate_arguments(indices, depth, on_value, off_value, output, axis)); + validate_arguments(indices, depth, on_value, off_value, output, axis)); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 5841f1d69..2306228d5 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -107,19 +107,15 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, const int32x4x4_t rf = {{ #ifdef __aarch64__ - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #else //__aarch64__ - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #endif //__aarch64__ }}; const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); @@ -129,7 +125,7 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, } // namespace NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() - : _input(nullptr), _output(nullptr), _scale_factor(nullptr) + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) { } @@ -138,7 +134,7 @@ void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *out { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), output->info(), scale_factor->info())); + validate_arguments(input->info(), output->info(), scale_factor->info())); _input = input; _output = output; @@ -182,40 +178,40 @@ template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window const auto dim_x = 
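
Both one-hot paths above fill the output with off_value and then, when the index passes the isOnValue range check against depth, overwrite the single on position. Along the innermost axis that is simply:

#include <algorithm>
#include <cstddef>
#include <cstdint>

template <typename T>
void one_hot(const int32_t *indices, std::size_t n, int32_t depth,
             T on_value, T off_value, T *out)
{
  for (std::size_t i = 0; i < n; ++i)
  {
    std::fill_n(out + i * depth, depth, off_value);
    if (indices[i] >= 0 && indices[i] < depth) // the kernel's isOnValue() check
      out[i * depth + indices[i]] = on_value;
  }
}
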
_input->info()->dimension(0); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - const auto start = reinterpret_cast<const T *>(input.ptr()); - const auto min_max = std::minmax_element(start, start + dim_x); - const auto int8_scale = 127; - auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); - if (range == 0) - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; - range = 1; - } - else - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; - } - const auto scale_factor_inv = int8_scale / range; - - auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], - vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); - quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); - output_ptr[x] = static_cast<int8_t>(quantized); - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); } void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp index 267228eac..b02a48ef2 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -50,8 +50,8 @@ namespace arm_compute { CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), - _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() + : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), + _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() { } @@ -60,13 +60,13 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); 
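
The quantize loop in NEQuantizationSymmetricKernel above computes, per row: range = max(|min|, |max|), a stored scale factor of range / 127 (forced to 1 for an all-zero row), and q = clamp(round(x * 127 / range), -127, 127). A scalar restatement of that logic, assuming n > 0 and with std::lround standing in for the kernel's rounding policy:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

void quantize_symm_row(const float *in, int8_t *out, std::size_t n, float &scale_factor)
{
  const int int8_scale = 127;
  const auto min_max = std::minmax_element(in, in + n);
  float range = std::max(std::abs(*min_max.first), std::abs(*min_max.second));
  if (range == 0.f)
  {
    scale_factor = 1.f; // the kernel stores 1 for all-zero rows
    range = 1.f;
  }
  else
  {
    scale_factor = range / int8_scale;
  }
  const float scale_factor_inv = int8_scale / range;
  for (std::size_t i = 0; i < n; ++i)
  {
    const int q = static_cast<int>(std::lround(in[i] * scale_factor_inv));
    out[i] = static_cast<int8_t>(std::min(int8_scale, std::max(q, -int8_scale)));
  }
}
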
ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, + op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const unsigned int num_of_stages = - calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; @@ -76,9 +76,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( - arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, - false)); + const TensorInfo expected_output_shape = + output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape( + input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -87,9 +87,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) { ti.set_data_type(data_type) - .set_tensor_shape(shape) - .set_num_channels(num_channels) - .set_quantization_info(qinfo); + .set_tensor_shape(shape) + .set_num_channels(num_channels) + .set_quantization_info(qinfo); }; initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, @@ -98,7 +98,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (num_of_stages == 1) { ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); } else { @@ -118,19 +118,19 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT // Validate ReductionOperation only on first kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); // Validate ReductionOperation on intermediate stages for (unsigned int i = 1; i < num_of_stages - 1; ++i) { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], - &sums_vector[i], axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); } // Validate ReductionOperation on the last stage const unsigned int last_stage = num_of_stages - 1; ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( - input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); + input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); } ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(¬_reshaped_output, output)); return Status{}; @@ -144,16 +144,16 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * _reduction_axis = axis; const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape( - input->info()->tensor_shape(), axis, false); + input->info()->tensor_shape(), 
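
However many OpenCL stages CLArgMinMaxLayerEx splits the x axis into (each intermediate stage narrows it by a factor of 128, per the ceil(shape.x() / 128.f) above), the result the staged reduction must reproduce is the plain per-row arg-reduction. Reference semantics for ARG_IDX_MAX, assuming n > 0:

#include <cstddef>

std::size_t arg_max_row(const float *row, std::size_t n)
{
  std::size_t best = 0;
  for (std::size_t i = 1; i < n; ++i)
    if (row[i] > row[best])
      best = i;
  return best;
}
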
axis, false); DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) - ? DataType::S32 - : output->info()->data_type(); + ? DataType::S32 + : output->info()->data_type(); auto_init_if_empty(*output->info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); // Configure reduction operation kernels _reduction_kernels_vector.resize(_num_of_stages); @@ -166,11 +166,11 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); auto_init_if_empty(*_not_reshaped_output.info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _not_reshaped_output.info()->set_tensor_shape(output_shape); _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); } @@ -182,7 +182,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * { shape.set(0, ceil(shape.x() / 128.f)); _results_vector[i].allocator()->init( - input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); + input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); } // Apply ReductionOperation only on first kernel diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 3dede0562..6359b4bcb 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -53,16 +53,10 @@ namespace arm_compute using namespace arm_compute::misc::shape_calculator; CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( - std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _scale_f(), - _conv_f(), - _flip_weights(), - _scaled_output(), - _original_weights(nullptr), - _weights_flipped(), - _flip_axis(), - _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), _flip_weights(), + _scaled_output(), _original_weights(nullptr), _weights_flipped(), _flip_axis(), + _is_prepared(false) { } @@ -74,7 +68,7 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); const DataLayout data_layout = input->data_layout(); @@ -86,8 +80,8 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), - weights->dimension(idx_h), info, invalid_right, 
invalid_bottom); + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); @@ -117,19 +111,19 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape) - .set_data_layout(data_layout)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } @@ -171,22 +165,22 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(idx_w), input->info()->dimension(idx_h), - weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, - invalid_bottom); + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized auto_init_if_empty( - *output->info(), - input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), info, + invalid_right, invalid_bottom)); _is_prepared = weights_info.retain_internal_weights(); @@ -195,8 +189,8 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order // to match output shape const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index 01989461e..79d0929a9 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I ARM_COMPUTE_UNUSED(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -80,12 +80,12 @@ Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), - _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), - _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), - _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), - _original_weights(nullptr) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), + _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), + _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), + _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), + _original_weights(nullptr) { } void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -107,8 +107,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -140,10 +140,10 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen bool is_fc_after_conv = false; if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -158,28 +158,28 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Extract scale factor _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); _memory_group.manage(&_scale_factor); _scale_factor_kernel.configure(input, &_scale_factor); // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _memory_group.manage(&_quantized_input); _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); // GEMMLowp _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); _memory_group.manage(&_gemmlowp_output); configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output, fc_info.retain_internal_weights); @@ -209,15 +209,15 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe const GPUTarget gpu_target = CLScheduler::get().target(); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); + CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); } // With the Fully Connected layer we can have 4 different cases: @@ -247,33 +247,32 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = 
&reshaped_weights; } // Validate Scale factor kernel const ITensorInfo &scale_factor = - TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); + TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); // Validate quantization symm8 kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ON_ERROR( - CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); // Fully Connected layer after a Fully Connected Layer without batches ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate matrix multiply kernel const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); // Multiply scale ARM_COMPUTE_RETURN_ON_ERROR( - CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); + CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 2ff4b9659..13d3acbac 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -79,7 +79,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn int output_multiplier = 0; int output_shift = 0; ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( - multiplier, &output_multiplier, &output_shift)); + multiplier, &output_multiplier, &output_shift)); // Set the GEMMLowp output stage info gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; @@ -99,7 +99,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I { GEMMLowpOutputStageInfo gemmlowp_output_stage; ARM_COMPUTE_RETURN_ON_ERROR( - construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -125,14 +125,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, - gemm_info)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); } else { ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); + CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); } return 
Status{}; @@ -154,12 +154,12 @@ Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), - _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), - _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), - _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), - _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) + : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), + _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), + _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), + _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), + _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) { } void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -190,9 +190,9 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); @@ -214,8 +214,8 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -223,11 +223,11 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init(input->info() - ->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(shape_flatten) - .set_data_layout(DataLayout::NCHW)); + ->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(shape_flatten) + .set_data_layout(DataLayout::NCHW)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -258,8 +258,8 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; @@ -285,10 +285,10 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -302,7 +302,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor { _reshape_weights_managed_function.configure(weights); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_reshape_weights_managed_function)); + _weights_manager->acquire(weights, &_reshape_weights_managed_function)); } else { @@ -320,7 +320,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(), fc_info.weights_trained_layout); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_convert_weights_managed)); + _weights_manager->acquire(weights, &_convert_weights_managed)); } else { @@ -359,16 +359,16 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_fc_after_conv = true; const ITensorInfo &flatten_input = TensorInfo(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(compute_flatten_shape(input)) - .set_data_layout(DataLayout::NCHW)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(input)) + .set_data_layout(DataLayout::NCHW)); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -396,7 +396,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -404,7 +404,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -412,8 +412,8 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); @@ -427,7 +427,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR( - validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); return Status{}; } @@ -457,7 +457,7 @@ void CLFullyConnectedLayerEx::run() if (_weights_manager && _weights_manager->are_weights_managed(cur_weights)) { _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); } else { diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index 157b4d977..ac6982e6f 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -41,7 +41,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp // reshape auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( - _input->info()->data_layout())); + _input->info()->data_layout())); _cl_reshape.configure(_input, &_cl_buffer); input_to_use = &_cl_buffer; } @@ -57,7 +57,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { bool is_hybrid = (input->info()->data_type() == DataType::F32 || input->info()->data_type() == DataType::F16) && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) @@ -81,7 +81,6 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { throw std::runtime_error("CLFullyConnectedReshapingLayer: 
Unsupported kernel type"); } - }(); if (_needs_reshape) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index 02ee4ad8a..c246041bb 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -46,8 +46,8 @@ using namespace arm_compute; CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), - _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() { } @@ -91,13 +91,13 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * for (size_t i = 0; i < num_of_kernels; ++i, ++it) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); } if (!keep_dims) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); + CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp index a502f032e..12c0aa829 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -134,8 +134,8 @@ void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &ou // Output auto inizialitation if not yet initialized TensorInfo tmp_output_info = *output->info()->clone(); auto_init_if_empty( - tmp_output_info, - input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + tmp_output_info, + input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); // Update coordinate on axis start_coords.set(split_dim, axis_offset); @@ -153,7 +153,7 @@ void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &ou } // namespace CLSplitVEx::CLSplitVEx() - : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() + : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp index 3ac95a8e6..accd51302 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -49,14 +49,14 @@ namespace arm_compute { CLTopKV2::CLTopKV2() - : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), - _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), - _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), - _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), - _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), - _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), - _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), - _reorder_negatives_kernel(), _store_kernel()*/ + : _k(0), 
_total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), _p_out_key_buf(nullptr), + _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index 3215d01a7..0754fd813 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -53,7 +53,7 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : _memory_manager(std::move(memory_manager)), _function() { } @@ -105,20 +105,20 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); switch (CLTransposeConvLayer::get_deconvolution_method( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate direct convolution layer ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); break; } case DeconvolutionMethod::GEMM: { // Validate gemm-based convolution layer ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); break; } default: @@ -130,9 +130,9 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf } DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( - const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, - ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, - unsigned int invalid_bottom, const WeightsInfo &weights_info) + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index a123439d9..e212a03c7 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -58,7 +58,7 @@ namespace Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ON_ERROR( - 
NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -78,11 +78,11 @@ Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), - _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), - _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), - _accumulate_biases(false), _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) { } @@ -103,8 +103,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -132,10 +132,10 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor bool _is_fc_after_conv; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -150,23 +150,23 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); // GEMM _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); configure_mm(&_quantized_input, 
weights_to_use, &_gemmlowp_output); // Multiply scale @@ -195,8 +195,8 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) @@ -217,7 +217,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -225,20 +225,19 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate quantization kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR( - NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( - &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); + &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index cb7557a5a..a639f2979 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -69,14 +69,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( - &input, &weights, nullptr, &output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + ARM_COMPUTE_RETURN_ON_ERROR( + 
NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); } return Status{}; @@ -84,12 +84,12 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } // namespace NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), - _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), - _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), - _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), - _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), - _accumulate_biases(false), _is_quantized(false), _is_prepared(false) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), + _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), + _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false), + _is_quantized(false), _is_prepared(false) { } @@ -105,9 +105,9 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, nullptr, output); @@ -129,8 +129,8 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen ITensor *output) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -138,8 +138,7 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - shape_flatten)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -169,8 +168,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; @@ -183,8 +182,7 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei if (_is_quantized) { _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); } // Configure accumulate biases kernel for non quantized asymmetric types @@ -208,10 +206,10 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -284,16 +282,16 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const ITensorInfo &flatten_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_flatten_shape(input))); + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr && !is_quantized) @@ -330,7 +328,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -338,7 +336,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -346,8 +344,8 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); @@ -365,7 +363,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (is_quantized) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( - &gemmlowp_output, biases, output)); + &gemmlowp_output, biases, output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index dc6c78478..234c783f9 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -56,7 +56,7 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); bool is_hybrid = input->info()->data_type() == DataType::F32 && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp index 16d74e62d..451aa0997 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -46,9 +46,9 @@ namespace arm_compute { NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), - _permute_input(), _permute_output(), 
_permuted_input(), _permuted_output() + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() { } @@ -88,8 +88,8 @@ Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const float epsilon) { return NEInstanceNormalizationLayerKernelEx::validate( - &input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); } void NEInstanceNormalizationLayerEx::run() diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp index cb1a26304..c45c335b3 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -49,8 +49,8 @@ using namespace arm_compute; NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -125,7 +125,7 @@ void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_a for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp index 26a887912..b21717e86 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -47,8 +47,8 @@ using namespace arm_compute; NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -122,7 +122,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? 
input : (&_reduced_outs[i - 1]); @@ -135,7 +135,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()) - .set_data_layout(input->info()->data_layout())); + .set_data_layout(input->info()->data_layout())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::SUM); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index aa165cc15..50311071b 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -51,17 +51,9 @@ namespace arm_compute { NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _conv_f(), - _upsample_f(), - _flip_weights(), - _scaled_output(), - _weights_flipped(), - _flip_axis(), - _original_weights(nullptr), - _input(nullptr), - _info(), - _is_prepared(false) + : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(), + _scaled_output(), _weights_flipped(), _flip_axis(), _original_weights(nullptr), _input(nullptr), + _info(), _is_prepared(false) { } @@ -76,15 +68,15 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); const unsigned int width_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), - weights->dimension(height_idx), info, invalid_right, invalid_bottom); + input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); if (bias != nullptr) @@ -117,24 +109,24 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info( - input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo 
conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const unsigned int batches_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); const unsigned int channel_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, WeightsInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo())); return Status{}; } @@ -146,21 +138,21 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( - input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, - invalid_right, invalid_bottom); + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); _input = input; _original_weights = weights; @@ -188,8 +180,8 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, DimensionRoundingType::FLOOR);
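Aside from the clang-format reflow that accounts for most of the hunks above (the diffstat's matching 1374 insertions and 1374 deletions reflect this), the import carries one behavioural change: in both CLFullyConnectedReshapingLayer.cpp and NEFullyConnectedReshapingLayer.cpp, the hybrid fully-connected dispatch now tests the weights for DataType::QSYMM8 instead of DataType::S8. As a rough sketch of the post-patch condition (the helper name below is invented for illustration and is not library API; it mirrors the CL-side check, while the NEON variant restricts the input test to F32 only):

    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Types.h"

    // Hypothetical helper mirroring the dispatch in
    // CLFullyConnectedReshapingLayer::configure() after this patch:
    // the hybrid path (float activations x symmetric int8 weights) is
    // taken only for QSYMM8 or QASYMM8_SIGNED weights; raw S8 weights
    // no longer qualify.
    static bool selects_hybrid_kernel(const arm_compute::ITensorInfo &input,
                                      const arm_compute::ITensorInfo &weights)
    {
      using arm_compute::DataType;
      const bool float_input = input.data_type() == DataType::F32 ||
                               input.data_type() == DataType::F16;
      const bool symm_int8_weights = weights.data_type() == DataType::QSYMM8 ||
                                     weights.data_type() == DataType::QASYMM8_SIGNED;
      return float_input && symm_int8_weights;
    }

When the condition holds, configure() builds the hybrid pipeline shown in the CLFullyConnectedHybridLayer hunks above (symmetric input quantization, GEMMLowp matrix multiply, then rescaling by the per-row scale factors); otherwise it falls back to the non-hybrid path, CLFullyConnectedLayerEx in this tree.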