author    Michalis Spyrou <michalis.spyrou@arm.com>  2021-01-20 16:41:12 +0000
committer Michalis Spyrou <michalis.spyrou@arm.com>  2021-02-09 18:25:46 +0000
commit    373b407558f99eb4bba632c170d03d807941dd2a (patch)
tree      448bb0225fa8b5fdfa48ddee973ec0b51a115f44
parent    4841c97170b85be0706b65d424e967e561cef932 (diff)
Make Softmax kernels and operator stateless
COMPMID-3997
Change-Id: I3a3cc76d8247dd769d9a5e6e171d718ea909312c
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4986
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
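
This patch removes per-instance tensor state from the softmax function: NESoftmaxLayerGeneric now keeps only a MemoryGroup and an opaque Impl struct, and forwards each invocation to a stateless cpu::CpuSoftmaxGeneric operator through an ITensorPack. A minimal sketch of the resulting call pattern, condensed from the NESoftmaxLayerGeneric<IS_LOG>::run() body in the diff below; the free function and its tensor parameters are illustrative, not part of the patch:

// Sketch: driving the stateless operator. It holds no ITensor pointers,
// so the caller supplies every tensor (I/O and workspace) on each run.
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuSoftmax.h"

using namespace arm_compute;

void run_stateless_softmax(cpu::CpuSoftmax &op, ITensor *src, ITensor *dst,
                           Tensor &tmp, Tensor &max, Tensor &in_perm, Tensor &out_perm)
{
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, src);         // user input
    pack.add_tensor(TensorType::ACL_DST, dst);         // user output
    pack.add_tensor(TensorType::ACL_INT_0, &tmp);      // per-thread scratch
    pack.add_tensor(TensorType::ACL_INT_1, &max);      // row maxima
    pack.add_tensor(TensorType::ACL_INT_2, &in_perm);  // consumed only when axis != 0
    pack.add_tensor(TensorType::ACL_INT_3, &out_perm); // consumed only when axis != 0
    op.run(pack);
}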
-rw-r--r--  Android.bp                                                                                                |   3
-rw-r--r--  arm_compute/core/experimental/Types.h                                                                     |   2
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFillBorder.h                                                         |   3
-rw-r--r--  arm_compute/runtime/NEON/functions/NESoftmaxLayer.h                                                       |  45
-rw-r--r--  docs/00_introduction.dox                                                                                  |  12
-rw-r--r--  src/core/NEON/NEKernels.h                                                                                 |   1
-rw-r--r--  src/core/NEON/kernels/NESoftmaxLayerKernel.h                                                              | 141
-rw-r--r--  src/core/cpu/kernels/CpuSoftmaxKernel.cpp (renamed from src/core/NEON/kernels/NESoftmaxLayerKernel.cpp)   | 184
-rw-r--r--  src/core/cpu/kernels/CpuSoftmaxKernel.h                                                                   | 107
-rw-r--r--  src/core/cpu/kernels/softmax/impl/NEON/list.h (renamed from src/core/NEON/kernels/softmax/impl/NEON/list.h) |   4
-rw-r--r--  src/core/cpu/kernels/softmax/impl/SVE/list.h (renamed from src/core/NEON/kernels/softmax/impl/SVE/list.h)  |   0
-rw-r--r--  src/runtime/NEON/functions/NEFillBorder.cpp                                                               |   7
-rw-r--r--  src/runtime/NEON/functions/NESoftmaxLayer.cpp                                                             | 149
-rw-r--r--  src/runtime/cpu/operators/CpuSoftmax.cpp                                                                  | 204
-rw-r--r--  src/runtime/cpu/operators/CpuSoftmax.h                                                                    | 105
15 files changed, 604 insertions, 363 deletions
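
What "stateless" means in practice is easiest to see in the kernel interface rewritten below: the old kernel captured tensor pointers as members in configure() and read them back in run(), whereas the new kernel is configured from ITensorInfo descriptors only and receives the live tensors on every run_op() call. A compressed before/after, paraphrased from the CpuSoftmaxKernel.cpp hunks further down (validation macros and the micro-kernel selection table are elided):

// Before (stateful): _input/_output are data members bound in configure().
void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info)
{
    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ _input->info()->data_type() });
    uk->ukernel(_input, _output, window);
}

// After (stateless): tensors travel in an ITensorPack, so a single configured
// kernel can be shared across tensor instances and threads.
void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type() });
    uk->ukernel(src, dst, window);
}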
diff --git a/Android.bp b/Android.bp
index 31bc14b6b..04f9d93c6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -300,7 +300,6 @@ cc_library_static {
         "src/core/NEON/kernels/NESobel3x3Kernel.cpp",
         "src/core/NEON/kernels/NESobel5x5Kernel.cpp",
         "src/core/NEON/kernels/NESobel7x7Kernel.cpp",
-        "src/core/NEON/kernels/NESoftmaxLayerKernel.cpp",
         "src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp",
         "src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp",
         "src/core/NEON/kernels/NEStackLayerKernel.cpp",
@@ -405,6 +404,7 @@ cc_library_static {
         "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp",
         "src/core/cpu/kernels/CpuPoolingKernel.cpp",
         "src/core/cpu/kernels/CpuReshapeKernel.cpp",
+        "src/core/cpu/kernels/CpuSoftmaxKernel.cpp",
         "src/core/cpu/kernels/CpuSubKernel.cpp",
         "src/core/cpu/kernels/activation/NEON/fp16.cpp",
         "src/core/cpu/kernels/activation/NEON/fp32.cpp",
@@ -801,6 +801,7 @@ cc_library_static {
         "src/runtime/cpu/operators/CpuPooling.cpp",
         "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp",
         "src/runtime/cpu/operators/CpuReshape.cpp",
+        "src/runtime/cpu/operators/CpuSoftmax.cpp",
         "src/runtime/cpu/operators/CpuSub.cpp",
         "src/runtime/gpu/cl/operators/ClActivation.cpp",
         "src/runtime/gpu/cl/operators/ClAdd.cpp",
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index 81b4dc875..f615678e3 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -46,10 +46,12 @@ enum TensorType : int32_t
     ACL_DST     = 30,
     ACL_DST_0   = 30,
     ACL_DST_1   = 31,
+    ACL_DST_2   = 32,
     ACL_INT     = 50,
     ACL_INT_0   = 50,
     ACL_INT_1   = 51,
     ACL_INT_2   = 52,
+    ACL_INT_3   = 53,
     ACL_SRC_VEC = 256,
 };
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index e9a08ef7e..8a8a0c7dc 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,6 +39,7 @@ class NEFillBorderKernel;
 class NEFillBorder : public IFunction
 {
 public:
+    NEFillBorder();
     /** Initialize the function's source, destination and border_mode.
      *
      * @note This function fills the borders within the XY-planes.
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index 40fa38afd..8a2ae1012 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,33 +26,14 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
-#include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
 
 namespace arm_compute
 {
 class ITensor;
-class NELogits1DMaxKernel;
-template <bool IS_LOG>
-class NELogits1DSoftmaxKernel;
-class NEFillBorderKernel;
+class ITensorInfo;
 
-/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
- *
- * Softmax is calculated by :
- * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
- *
- * Log Softmax is calculated by :
- * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
- *
- * This function runs the following function/kernels:
- * -# If axis is not 0:
- * -# @ref NEPermute
- * -# @ref NEFillBorderKernel
- * -# @ref NELogits1DMaxKernel
- * -# @ref NELogits1DSoftmaxKernel
- */
+/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. */
 template <bool IS_LOG = false>
 class NESoftmaxLayerGeneric : public IFunction
 {
@@ -62,17 +43,17 @@ public:
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NESoftmaxLayerGeneric(const NESoftmaxLayerGeneric &) = delete;
     /** Default move constructor */
-    NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default;
+    NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NESoftmaxLayerGeneric &operator=(const NESoftmaxLayerGeneric &) = delete;
     /** Default move assignment operator */
-    NESoftmaxLayerGeneric &operator=(NESoftmaxLayerGeneric &&) = default;
+    NESoftmaxLayerGeneric &operator=(NESoftmaxLayerGeneric &&);
     /** Default destructor */
     ~NESoftmaxLayerGeneric();
     /** Set the input and output tensors.
      *
      * @param[in,out] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. If the width is not a
-     *                       multiple of the internal processing block size, @ref NEFillBorderKernel replicates the
+     *                       multiple of the internal processing block size, @ref NEFillBorder replicates the
      *                       last value of each row to the nearest multiple.
      * @param[out]    output Destination tensor. Data types supported: same as @p input.
      * @param[in]     beta   (Optional) A scaling factor for the exponent.
@@ -96,17 +77,9 @@ public:
     void run() override;
 
 private:
-    MemoryGroup                                      _memory_group;
-    NEPermute                                        _permute_input;
-    NEPermute                                        _permute_output;
-    std::unique_ptr<NELogits1DMaxKernel>             _max_kernel;
-    std::unique_ptr<NELogits1DSoftmaxKernel<IS_LOG>> _softmax_kernel;
-    std::unique_ptr<NEFillBorderKernel>              _fill_border_kernel;
-    Tensor                                           _max;
-    Tensor                                           _tmp;
-    Tensor                                           _input_permuted;
-    Tensor                                           _output_permuted;
-    bool                                             _needs_permute;
+    MemoryGroup           _memory_group;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 using NESoftmaxLayer = NESoftmaxLayerGeneric<false>;
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 4c1112f2d..3dc86fe05 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -96,8 +96,8 @@ v21.02 Public major release
     - @ref NEActivationLayer
     - @ref NEArithmeticAddition
     - @ref NEBatchNormalizationLayerKernel
-    - @ref NELogits1DSoftmaxKernel
-    - @ref NELogits1DMaxKernel
+    - NELogits1DSoftmaxKernel
+    - NELogits1DMaxKernel
     - NEElementwiseUnaryKernel
 - Remove padding from OpenCL kernels:
     - @ref CLDirectConvolutionLayerKernel
@@ -460,8 +460,8 @@ v20.08 Public major release
     - @ref NEBatchNormalizationLayerKernel
     - NEArithmeticSubtractionKernel
     - @ref NEBoundingBoxTransformKernel
-    - @ref NELogits1DMaxKernel
-    - @ref NELogits1DSoftmaxKernel
+    - NELogits1DMaxKernel
+    - NELogits1DSoftmaxKernel
     - @ref NEROIPoolingLayerKernel
     - @ref NEROIAlignLayerKernel
     - NEYOLOLayerKernel
@@ -1269,7 +1269,7 @@ v17.04 Public bug fixes release
     - NEHarrisScoreFP16Kernel
     - @ref NEHarrisScoreKernel
     - @ref NEHOGDetectorKernel
-    - @ref NELogits1DMaxKernel
+    - NELogits1DMaxKernel
     - NELogits1DShiftExpSumKernel
     - NELogits1DNormKernel
     - @ref NENonMaximaSuppression3x3FP16Kernel
@@ -1284,7 +1284,7 @@ v17.03.1 First Major public release of the sources
 - New NEON kernels / functions:
     - @ref NENormalizationLayerKernel / @ref NENormalizationLayer
     - @ref NETransposeKernel / @ref NETranspose
-    - @ref NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer
+    - NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer
     - @ref NEIm2ColKernel, @ref NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
     - NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
     - @ref NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index c636e5b3b..66309f929 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -117,7 +117,6 @@
 #include "src/core/NEON/kernels/NESobel3x3Kernel.h"
 #include "src/core/NEON/kernels/NESobel5x5Kernel.h"
 #include "src/core/NEON/kernels/NESobel7x7Kernel.h"
-#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
 #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
 #include "src/core/NEON/kernels/NEStackLayerKernel.h"
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.h b/src/core/NEON/kernels/NESoftmaxLayerKernel.h
deleted file mode 100644
index 70e2417fc..000000000
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the identifying the max value of 1D Logits */
-class NELogits1DMaxKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NELogits1DMaxKernel";
-    }
-    /** Default constructor */
-    NELogits1DMaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELogits1DMaxKernel(const NELogits1DMaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELogits1DMaxKernel &operator=(const NELogits1DMaxKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NELogits1DMaxKernel(NELogits1DMaxKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NELogits1DMaxKernel &operator=(NELogits1DMaxKernel &&) = default;
-    /** Default destructor */
-    ~NELogits1DMaxKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output Destination tensor. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size;
-};
-
-/** Interface for softmax computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG = false>
-class NELogits1DSoftmaxKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        if(IS_LOG)
-        {
-            return "NELogits1DSoftmaxKernel";
-        }
-        else
-        {
-            return "NELogits1DLogSoftmaxKernel";
-        }
-    }
-    /** Default constructor */
-    NELogits1DSoftmaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELogits1DSoftmaxKernel(const NELogits1DSoftmaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELogits1DSoftmaxKernel &operator=(const NELogits1DSoftmaxKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NELogits1DSoftmaxKernel(NELogits1DSoftmaxKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NELogits1DSoftmaxKernel &operator=(NELogits1DSoftmaxKernel &&) = default;
-    /** Default destructor */
-    ~NELogits1DSoftmaxKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  max    Max values tensor. Same shape as input with dimension 0 set to 1.
-     *                    Data types supported: same as @p input.
-     * @param[out] output Destination tensor. Data types supported: same as @p input.
-     * @param[in]  beta   A scaling factor for the exponent.
-     *
-     * @param tmp Auxiliary tensor. Must be type F32 and same shape as the input.
-     */
-    void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp);
-    /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] max    Max values tensor info. Same shape as input with dimension 0 set to 1.
-     *                   Data types supported: same as @p input.
-     * @param[in] output Destination tensor info. Data types supported: same as @p input.
-     * @param[in] beta   A scaling factor for the exponent.
-     * @param[in] tmp    Tensor info of auxiliary. Must be type F32 and same shape as the input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *max,
-                           const ITensorInfo *output, const float beta, const ITensorInfo *tmp);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    const ITensor *_max;
-    ITensor       *_output;
-    float          _beta;
-    ITensor       *_tmp; //Temporary. Used internally
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
index fe09f1ec5..a8542b6be 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -33,12 +33,16 @@
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
-#include "src/core/NEON/kernels/softmax/impl/NEON/list.h"
-#include "src/core/NEON/kernels/softmax/impl/SVE/list.h"
 #include "src/core/common/Registrars.h"
+#include "src/core/cpu/kernels/softmax/impl/NEON/list.h"
+#include "src/core/cpu/kernels/softmax/impl/SVE/list.h"
 
 namespace arm_compute
 {
+namespace cpu
+{
+namespace kernels
+{
 namespace
 {
 struct SoftmaxSelectorData
@@ -208,98 +212,90 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI
 } // namespace
 
-NELogits1DMaxKernel::NELogits1DMaxKernel()
-    : _border_size()
+CpuLogits1DMaxKernel::CpuLogits1DMaxKernel()
 {
 }
 
-BorderSize NELogits1DMaxKernel::border_size() const
+void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
-    return _border_size;
-}
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
-void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*input->info(), *output->info()));
-    // Configure kernel window
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
 
     // Softmax across the x dimension
-    const TensorShape output_shape = TensorShape(input->info()->tensor_shape()).set(0, 1);
+    const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
 
-    Window win = calculate_max_window(*input->info(), Steps());
+    Window      win = calculate_max_window(*src, Steps());
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-    _input  = input;
-    _output = output;
-
-    const int input_width                       = input->info()->valid_region().shape.x();
-    const int num_elems_processed_per_iteration = 16U / data_size_from_type(input->info()->data_type());
-    const int num_elems_read_per_iteration      = ceil_to_multiple(input_width, num_elems_processed_per_iteration);
+    coord.set_num_dimensions(dst->num_dimensions());
+    dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
 
-    _border_size = BorderSize(0, num_elems_read_per_iteration - input_width, 0, 0);
-
-    INEKernel::configure(win);
+    ICpuKernel::configure(win);
 }
 
-Status NELogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*input, *output));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
 
     return Status{};
 }
 
-void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info)
+void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ _input->info()->data_type() });
-    uk->ukernel(_input, _output, window);
+    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type() });
+    uk->ukernel(src, dst, window);
+}
+
+const char *CpuLogits1DMaxKernel::name() const
+{
+    return "CpuLogits1DMaxKernel";
 }
 
 namespace
 {
-Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max,
-                                         const ITensorInfo &output, const float beta, const ITensorInfo &tmp, bool is_log)
+Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
+                                         const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
 {
     ARM_COMPUTE_UNUSED(beta);
     // Check input
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
 
-    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
+    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
 
     // Check max
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape());
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
 
     // Check output if configured
-    if(output.total_size() != 0)
+    if(dst.total_size() != 0)
     {
-        const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(input.data_type(), is_log) : output.quantization_info();
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization);
+        const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+        ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
     }
 
     // Check tmp if configured
     if(tmp.total_size() != 0)
     {
-        const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
+        const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
         ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
         // We could potentially reduce tmp memory if we could predict or make an assumption
         // on the maximum number of threads that will run in parallel.
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
     }
 
     return Status{};
@@ -307,74 +303,90 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
 } // namespace
 
 template <bool IS_LOG>
-NELogits1DSoftmaxKernel<IS_LOG>::NELogits1DSoftmaxKernel()
-    : _input(nullptr), _max(nullptr), _output(nullptr), _beta(1.0f), _tmp(nullptr)
+CpuLogits1DSoftmaxKernel<IS_LOG>::CpuLogits1DSoftmaxKernel()
+    : _beta(1.0f)
 {
 }
 
 template <bool IS_LOG>
-void NELogits1DSoftmaxKernel<IS_LOG>::configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp)
+void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), max->info(), output->info(), tmp->info());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*input->info(), *max->info(), *output->info(), beta, *tmp->info(), IS_LOG));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+
+    _beta = beta;
 
     // Configure kernel window
-    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
+    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
 
     // Output auto initialization if not yet initialized
-    const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(input->info()->data_type(), IS_LOG) : output->info()->quantization_info();
-    auto_init_if_empty(*output->info(), TensorInfo(*input->info()).set_quantization_info(output_quantization).reset_padding());
+    const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
+    auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
 
     // Tmp auto initialization if not yet initialized
-    const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input->info()->data_type();
-    auto_init_if_empty(*tmp->info(), TensorInfo(*input->info()).set_data_type(tmp_data_type).reset_padding());
+    const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
+    auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
 
     // Configure kernel window
-    Window win = calculate_max_window(*max->info(), Steps());
+    Window      win = calculate_max_window(*max, Steps());
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-    _input  = input;
-    _max    = max;
-    _output = output;
-    _beta   = beta;
-    _tmp    = tmp;
+    coord.set_num_dimensions(dst->num_dimensions());
+    dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
 
-    INEKernel::configure(win);
+    ICpuKernel::configure(win);
 }
 
 template <bool IS_LOG>
-Status NELogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *max,
-                                                 const ITensorInfo *output, const float beta, const ITensorInfo *tmp)
+Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
+                                                  const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*input, *max, *output, beta, *tmp, IS_LOG));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
 
     return Status{};
 }
 
 template <bool IS_LOG>
-void NELogits1DSoftmaxKernel<IS_LOG>::run(const Window &window, const ThreadInfo &info)
+void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
 
-    const unsigned int num_elems_processed_per_iteration = _input->info()->valid_region().shape.x();
-    const unsigned int tmp_size_for_thread               = _tmp->info()->element_size() * num_elems_processed_per_iteration;
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    auto       max = tensors.get_tensor(TensorType::ACL_SRC_1);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST_0);
+    auto       tmp = tensors.get_tensor(TensorType::ACL_DST_1);
 
-    ARM_COMPUTE_ERROR_ON(_tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
+    const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
+    const unsigned int tmp_size_for_thread               = tmp->info()->element_size() * num_elems_processed_per_iteration;
 
-    void *tmp_for_thread = _tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+    ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
+
+    void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+
+    const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->info()->data_type() });
+    uk->ukernel(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
+}
 
-    const auto *uk = get_implementation_logits(SoftmaxSelectorData{ _input->info()->data_type() });
-    uk->ukernel(_input, _max, tmp_for_thread, _output, _beta, IS_LOG, window);
+template <bool IS_LOG>
+const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+{
+    if(IS_LOG)
+    {
+        return "CpuLogits1DSoftmaxKernel";
+    }
+    else
+    {
+        return "CpuLogits1DLogSoftmaxKernel";
+    }
 }
 
-template class NELogits1DSoftmaxKernel<true>;
-template class NELogits1DSoftmaxKernel<false>;
+template class CpuLogits1DSoftmaxKernel<true>;
+template class CpuLogits1DSoftmaxKernel<false>;
+} // namespace kernels
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.h b/src/core/cpu/kernels/CpuSoftmaxKernel.h
new file mode 100644
index 000000000..aa1046796
--- /dev/null
+++ b/src/core/cpu/kernels/CpuSoftmaxKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SOFTMAXKERNEL_H
+#define ARM_COMPUTE_CPU_SOFTMAXKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the identifying the max value of 1D Logits */
+class CpuLogits1DMaxKernel : public ICpuKernel
+{
+public:
+    /** Constructor */
+    CpuLogits1DMaxKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
+    /** Set the input and output tensors.
+     *
+     * @param[in]  src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst Destination tensor info. Data types supported: same as @p input
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DMaxKernel
+     *
+     * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst Destination tensor info. Data types supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+};
+
+/** Interface for softmax computation for QASYMM8 with pre-computed max. */
+template <bool IS_LOG = false>
+class CpuLogits1DSoftmaxKernel : public ICpuKernel
+{
+public:
+    /** Default constructor */
+    CpuLogits1DSoftmaxKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  max  Max values tensor info. Same shape as input with dimension 0 set to 1.
+     *                  Data types supported: same as @p input.
+     * @param[out] dst  Destination tensor info. Data types supported: same as @p input.
+     * @param[in]  beta A scaling factor for the exponent.
+     *
+     * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
+     */
+    void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DSoftmaxKernel
+     *
+     * @param[in] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] max  Max values tensor info. Same shape as input with dimension 0 set to 1.
+     *                 Data types supported: same as @p input.
+     * @param[in] dst  Destination tensor info. Data types supported: same as @p input.
+     * @param[in] beta A scaling factor for the exponent.
+     * @param[in] tmp  Tensor info of auxiliary. Must be type F32 and same shape as the input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *max,
+                           const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+private:
+    float _beta;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SOFTMAXKERNEL_H */
diff --git a/src/core/NEON/kernels/softmax/impl/NEON/list.h b/src/core/cpu/kernels/softmax/impl/NEON/list.h
index a8f781f43..1aa7e8fac 100644
--- a/src/core/NEON/kernels/softmax/impl/NEON/list.h
+++ b/src/core/cpu/kernels/softmax/impl/NEON/list.h
@@ -24,10 +24,10 @@
 #ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
 #define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
 
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "support/SaturateCast.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "support/SaturateCast.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/softmax/impl/SVE/list.h b/src/core/cpu/kernels/softmax/impl/SVE/list.h
index 0936bd5a5..0936bd5a5 100644
--- a/src/core/NEON/kernels/softmax/impl/SVE/list.h
+++ b/src/core/cpu/kernels/softmax/impl/SVE/list.h
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index bb57222eb..256aad6d3 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,11 @@
 namespace arm_compute
 {
+NEFillBorder::NEFillBorder()
+    : _border_handler(nullptr)
+{
+}
+
 void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
 {
     _border_handler = std::make_unique<NEFillBorderKernel>();
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 6be34ad1a..3f1e43a8f 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,49 +22,62 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
-#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
 #include "src/core/helpers/SoftmaxHelpers.h"
+#include "src/runtime/cpu/operators/CpuSoftmax.h"
 
 namespace arm_compute
 {
 template <bool IS_LOG>
-NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
+struct NESoftmaxLayerGeneric<IS_LOG>::Impl
+{
+    const ITensor                                  *src{ nullptr };
+    ITensor                                        *dst{ nullptr };
+    Tensor                                          max{ nullptr };
+    Tensor                                          tmp{ nullptr };
+    Tensor                                          input_permuted{ nullptr };
+    Tensor                                          output_permuted{ nullptr };
+    std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{ nullptr };
+};
 
 template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _permute_input(), _permute_output(), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp(), _input_permuted(), _output_permuted(),
-      _needs_permute(false)
+    : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
 {
 }
 
 template <bool IS_LOG>
+NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default;
+template <bool IS_LOG>
+NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default;
+template <bool IS_LOG>
+NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
+
+template <bool IS_LOG>
 void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, float beta, int32_t axis)
 {
-    // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis));
 
-    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions())));
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
+    _impl->op->configure(input->info(), output->info(), beta, axis);
 
-    _needs_permute = actual_axis > 0;
-
-    if(_needs_permute)
+    const unsigned int actual_axis   = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions())));
+    const bool         needs_permute = actual_axis > 0;
+    if(needs_permute)
     {
         // Add to the memory manager _input_permuted
-        _memory_group.manage(&_input_permuted);
-
-        _permute_input.configure(input, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+        auto permute_input = std::make_unique<cpu::CpuPermute>();
+        _memory_group.manage(&_impl->input_permuted);
+        permute_input->configure(input->info(), _impl->input_permuted.info(), softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
     }
 
     // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
     // or it is the original input case (2D case)
-    ITensor *tmp_input = (_needs_permute ? &_input_permuted : input);
+    ITensor *tmp_input = (needs_permute ? &_impl->input_permuted : input);
 
     // Create intermediate tensors shapes
     const TensorInfo input_info = tmp_input->info()->clone()->reset_padding().set_is_resizable(true);
@@ -74,80 +87,49 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
     // Init intermediate tensors
     TensorShape max_sum_shape = tmp_input->info()->tensor_shape();
     max_sum_shape.set(0, 1);
-    _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
-    _tmp.allocator()->init(tensor_info_tmp);
+    _impl->max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
+    _impl->tmp.allocator()->init(tensor_info_tmp);
 
     // Manage intermediate buffers
-    _memory_group.manage(&_max);
-    _memory_group.manage(&_tmp);
+    _memory_group.manage(&_impl->max);
+    _memory_group.manage(&_impl->tmp);
 
     // Configure kernels
-    _max_kernel     = std::make_unique<NELogits1DMaxKernel>();
-    _softmax_kernel = std::make_unique<NELogits1DSoftmaxKernel<IS_LOG>>();
-    _max_kernel->configure(tmp_input, &_max);
-    if(_needs_permute)
+    auto max_kernel     = std::make_unique<cpu::kernels::CpuLogits1DMaxKernel>();
+    auto softmax_kernel = std::make_unique<cpu::kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
+    max_kernel->configure(tmp_input->info(), _impl->max.info());
+
+    if(needs_permute)
     {
+        auto permute_output = std::make_unique<cpu::CpuPermute>();
         // Add to the memory manager _output_permuted
-        _memory_group.manage(&_output_permuted);
+        _memory_group.manage(&_impl->output_permuted);
 
         // The normalization kernel stores the result in a permuted output tensor
-        _softmax_kernel->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
-        _input_permuted.allocator()->allocate();
+        softmax_kernel->configure(tmp_input->info(), _impl->max.info(), _impl->output_permuted.info(), beta, _impl->tmp.info());
+        _impl->input_permuted.allocator()->allocate();
 
         // Re-permute the permuted output into the requested (4D) output
-        _permute_output.configure(&_output_permuted, output, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+        permute_output->configure(_impl->output_permuted.info(), output->info(), softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
 
         // Allocate the intermediate permuted tensors
-        _output_permuted.allocator()->allocate();
+        _impl->output_permuted.allocator()->allocate();
     }
     else
     {
-        // Softmax 2D case
-        _fill_border_kernel = std::make_unique<NEFillBorderKernel>();
-        _fill_border_kernel->configure(tmp_input, _max_kernel->border_size(), BorderMode::REPLICATE);
-        _softmax_kernel->configure(tmp_input, &_max, output, beta, &_tmp);
+        softmax_kernel->configure(tmp_input->info(), _impl->max.info(), output->info(), beta, _impl->tmp.info());
     }
 
     // Allocate intermediate buffers
-    _max.allocator()->allocate();
-    _tmp.allocator()->allocate();
+    _impl->max.allocator()->allocate();
+    _impl->tmp.allocator()->allocate();
 }
 
 template <bool IS_LOG>
 Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
 {
-    // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
-    ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-input->num_dimensions()) || static_cast<int32_t>(input->num_dimensions()) <= axis);
-
-    // Create intermediate tensor info
-    DataType tmp_data_type = input->data_type();
-    const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
-    TensorShape max_sum_shape = input->tensor_shape();
-    max_sum_shape.set(0, 1);
-    const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
-    const TensorInfo dont_care;
-
-    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(input->num_dimensions())));
-
-    const bool needs_permute = actual_axis > 0;
-
-    if(needs_permute)
-    {
-        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
-        const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
-        TensorInfo input_permuted(input->clone()->set_tensor_shape(permuted_shape));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &input_permuted, permutation_vector));
-        TensorInfo output_permuted(output->clone()->set_tensor_shape(permuted_shape));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&output_permuted, output, permutation_vector));
-    }
-
-    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
-
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis));
     return Status{};
 }
 
@@ -155,23 +137,14 @@ template <bool IS_LOG>
 void NESoftmaxLayerGeneric<IS_LOG>::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_needs_permute)
-    {
-        _permute_input.run();
-    }
-    else
-    {
-        NEScheduler::get().schedule(_fill_border_kernel.get(), Window::DimY);
-    }
-
-    NEScheduler::get().schedule(_max_kernel.get(), Window::DimY);
-    NEScheduler::get().schedule(_softmax_kernel.get(), Window::DimY);
-
-    if(_needs_permute)
-    {
-        _permute_output.run();
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    pack.add_tensor(TensorType::ACL_INT_0, &_impl->tmp);
+    pack.add_tensor(TensorType::ACL_INT_1, &_impl->max);
+    pack.add_tensor(TensorType::ACL_INT_2, &_impl->input_permuted);
+    pack.add_tensor(TensorType::ACL_INT_3, &_impl->output_permuted);
+    _impl->op->run(pack);
 }
 
 template class NESoftmaxLayerGeneric<false>;
diff --git a/src/runtime/cpu/operators/CpuSoftmax.cpp b/src/runtime/cpu/operators/CpuSoftmax.cpp
new file mode 100644
index 000000000..0e1bcd5c6
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuSoftmax.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuSoftmax.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <bool IS_LOG>
+CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
+    : _permute_input(), _permute_output(), _max_kernel(), _softmax_kernel(), _max(nullptr), _tmp(nullptr), _input_permuted(nullptr), _output_permuted(nullptr), _needs_permute(false)
+{
+}
+
+template <bool IS_LOG>
+void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis)
+{
+    // Perform validation step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
+
+    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+
+    _needs_permute = actual_axis > 0;
+
+    if(_needs_permute)
+    {
+        _input_permuted = std::make_unique<TensorInfo>();
+        _permute_input.configure(src, _input_permuted.get(), softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+    }
+
+    // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
+    // or it is the original input case (2D case)
+    const ITensorInfo *tmp_input = (_needs_permute ? _input_permuted.get() : src);
+
+    // Create intermediate tensors shapes
+    TensorShape max_sum_shape = tmp_input->tensor_shape();
+    max_sum_shape.set(0, 1);
+    const TensorInfo input_info    = tmp_input->clone()->reset_padding().set_is_resizable(true);
+    DataType         tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
+    TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+    TensorInfo       max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+
+    // Init intermediate tensors
+    _max = std::make_unique<TensorInfo>(max_info);
+    _tmp = std::make_unique<TensorInfo>(tensor_info_tmp);
+
+    // Configure kernels
+    auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>();
+    mk->configure(tmp_input, _max.get());
+    _max_kernel = std::move(mk);
+
+    auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
+    if(_needs_permute)
+    {
+        _output_permuted = std::make_unique<TensorInfo>();
+
+        // The normalization kernel stores the result in a permuted output tensor
+        sm->configure(tmp_input, _max.get(), _output_permuted.get(), beta, _tmp.get());
+
+        // Re-permute the permuted output into the requested (4D) output
+        _permute_output.configure(_output_permuted.get(), dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+    }
+    else
+    {
+        // Softmax 2D case
+        sm->configure(tmp_input, _max.get(), dst, beta, _tmp.get());
+    }
+    _softmax_kernel = std::move(sm);
+}
+
+template <bool IS_LOG>
+Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis)
+{
+    // Perform validation step
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
+
+    // Create intermediate tensor info
+    DataType         tmp_data_type = src->data_type();
+    const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
+
+    TensorShape max_sum_shape = src->tensor_shape();
+    max_sum_shape.set(0, 1);
+    const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
+    const TensorInfo dont_care;
+
+    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+
+    const bool needs_permute = actual_axis > 0;
+
+    if(needs_permute)
+    {
+        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+        const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
+        TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
+        TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+
+    return Status{};
+}
+
+template <bool IS_LOG>
+void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+    ITensorPack max_pack;
+    ITensorPack softmax_pack;
+
+    if(_needs_permute)
+    {
+        ITensorPack permute_in_pack;
+        permute_in_pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC));
+        permute_in_pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_INT_2));
+        _permute_input.run(permute_in_pack);
+
+        max_pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(ACL_INT_2));
+
+        softmax_pack.add_tensor(TensorType::ACL_SRC_0, tensors.get_tensor(ACL_INT_2));
+        softmax_pack.add_tensor(TensorType::ACL_SRC_1, tensors.get_tensor(ACL_INT_1));
+        softmax_pack.add_tensor(TensorType::ACL_DST_0, tensors.get_tensor(ACL_INT_3));
+        softmax_pack.add_tensor(TensorType::ACL_DST_1, tensors.get_tensor(ACL_INT_0));
+    }
+    else
+    {
+        max_pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC));
+
+        softmax_pack.add_tensor(TensorType::ACL_SRC_0, tensors.get_const_tensor(ACL_SRC));
+        softmax_pack.add_tensor(TensorType::ACL_SRC_1, tensors.get_tensor(ACL_INT_1));
+        softmax_pack.add_tensor(TensorType::ACL_DST_0, tensors.get_tensor(ACL_DST));
+        softmax_pack.add_tensor(TensorType::ACL_DST_1, tensors.get_tensor(ACL_INT_0));
+    }
+
+    max_pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_INT_1));
+
+    NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
+    NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
+
+    if(_needs_permute)
+    {
+        ITensorPack permute_out_pack;
+        permute_out_pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(ACL_INT_3));
+        permute_out_pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+        _permute_output.run(permute_out_pack);
+    }
+}
+
+template <bool IS_LOG>
+experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
+{
+    experimental::MemoryRequirements req{};
+
+    req.push_back({ TensorType::ACL_INT_0, _tmp->total_size(), 0 });
+    req.push_back({ TensorType::ACL_INT_1, _max->total_size(), 0 });
+
+    if(_needs_permute)
+    {
+        req.push_back({ TensorType::ACL_INT_2, _input_permuted->total_size(), 0 });
+        req.push_back({ TensorType::ACL_INT_3, _output_permuted->total_size(), 0 });
+    }
+
+    return req;
+}
+
+template class CpuSoftmaxGeneric<false>;
+template class CpuSoftmaxGeneric<true>;
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuSoftmax.h b/src/runtime/cpu/operators/CpuSoftmax.h
new file mode 100644
index 000000000..9f18e0e4c
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuSoftmax.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
+#define ARM_COMPUTE_CPU_SOFTMAX_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuLogits1DMaxKernel;
+template <bool IS_LOG>
+class CpuLogits1DSoftmaxKernel;
+
+/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
+ *
+ * Softmax is calculated by :
+ * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
+ *
+ * Log Softmax is calculated by :
+ * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
+ *
+ * This function runs the following function/kernels:
+ * -# If axis is not 0:
+ * -# @ref CpuPermute
+ * -# @ref kernels::CpuLogits1DMaxKernel
+ * -# @ref kernels::CpuLogits1DSoftmaxKernel
+ */
+template <bool IS_LOG = false>
+class CpuSoftmaxGeneric : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuSoftmaxGeneric();
+    /** Set the input and output tensors.
+     *
+     * @param[in,out] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. If the width is not a
+     *                     multiple of the internal processing block size, @ref NEFillBorder replicates the
+     *                     last value of each row to the nearest multiple.
+     * @param[out]    dst  Destination tensor info. Data types supported: same as @p input.
+     * @param[in]     beta (Optional) A scaling factor for the exponent.
+     * @param[in]     axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+     *                     axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuSoftmax
+     *
+     * @param[in] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst  Destination tensor info. Data types supported: same as @p input
+     * @param[in] beta (Optional) A scaling factor for the exponent.
+     * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+     *                 axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    CpuPermute                   _permute_input;
+    CpuPermute                   _permute_output;
+    std::unique_ptr<ICpuKernel>  _max_kernel;
+    std::unique_ptr<ICpuKernel>  _softmax_kernel;
+    std::unique_ptr<ITensorInfo> _max;
+    std::unique_ptr<ITensorInfo> _tmp;
+    std::unique_ptr<ITensorInfo> _input_permuted;
+    std::unique_ptr<ITensorInfo> _output_permuted;
+    bool                         _needs_permute;
+};
+using CpuSoftmax    = CpuSoftmaxGeneric<false>;
+using CpuLogSoftmax = CpuSoftmaxGeneric<true>;
+
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */
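
For completeness, a hedged end-to-end sketch of driving the new operator directly, without the NESoftmaxLayer wrapper or a memory manager. The shapes follow the rules stated in the patch (tmp mirrors src, switching to F32 for quantized inputs; max is src with dimension 0 collapsed to 1); the concrete shapes and the manual workspace allocation are assumptions for illustration, since a real integration would provision the ACL_INT_* tensors from the sizes reported by workspace():

// Sketch only: direct use of the stateless CpuSoftmax with axis == 0, so the
// permute path and its ACL_INT_2/ACL_INT_3 workspace tensors are not needed.
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuSoftmax.h"

using namespace arm_compute;

int main()
{
    const TensorShape shape(128U, 32U); // 32 rows of 128 logits each
    Tensor src, dst, tmp, max;
    src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    tmp.allocator()->init(TensorInfo(shape, 1, DataType::F32));                // same shape as src
    max.allocator()->init(TensorInfo(TensorShape(1U, 32U), 1, DataType::F32)); // dim 0 set to 1

    cpu::CpuSoftmax softmax; // alias for CpuSoftmaxGeneric<false>
    softmax.configure(src.info(), dst.info(), 1.0f /* beta */, 0 /* axis */);

    // Caller owns all memory, including the workspace tensors.
    for(auto *t : { &src, &dst, &tmp, &max })
    {
        t->allocator()->allocate();
    }

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    pack.add_tensor(TensorType::ACL_INT_0, &tmp);
    pack.add_tensor(TensorType::ACL_INT_1, &max);
    softmax.run(pack);
    return 0;
}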