summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2021-02-03 16:05:00 +0000
committerMichele Di Giorgio <michele.digiorgio@arm.com>2021-02-08 09:46:35 +0000
commit1928904316e80ba0549b94ae1f905d7e79bda812 (patch)
treeac44d4118f2beb6c6b454995abaeb76228ab54ab
parentdda6914c6e923187c2ca2c3bfd71677e9c9e5c68 (diff)
downloadarmcl-1928904316e80ba0549b94ae1f905d7e79bda812.tar.gz
armcl-1928904316e80ba0549b94ae1f905d7e79bda812.tar.bz2
armcl-1928904316e80ba0549b94ae1f905d7e79bda812.zip
Make NEON Pooling kernels and functions state-less
Partially resolves COMPMID-3999 Change-Id: Ib39d40694df5c5f0a9401488e0c3af3ac26e8c55 Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4984 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp7
-rw-r--r--arm_compute/runtime/NEON/functions/NEPoolingLayer.h18
-rw-r--r--docs/00_introduction.dox6
-rw-r--r--src/core/NEON/NEKernels.h1
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.cpp26
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.h14
-rw-r--r--src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h4
-rw-r--r--src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp269
-rw-r--r--src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp276
-rw-r--r--src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h (renamed from src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h)71
-rw-r--r--src/core/cpu/kernels/CpuPoolingKernel.cpp (renamed from src/core/NEON/kernels/NEPoolingLayerKernel.cpp)899
-rw-r--r--src/core/cpu/kernels/CpuPoolingKernel.h (renamed from src/core/NEON/kernels/NEPoolingLayerKernel.h)175
-rw-r--r--src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp156
-rw-r--r--src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h149
-rw-r--r--src/runtime/NEON/functions/NEPoolingLayer.cpp103
-rw-r--r--src/runtime/cpu/operators/CpuPooling.cpp130
-rw-r--r--src/runtime/cpu/operators/CpuPooling.h102
-rw-r--r--src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp98
-rw-r--r--src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h89
19 files changed, 1334 insertions, 1259 deletions
diff --git a/Android.bp b/Android.bp
index bc5ae34bb..5653fc8a6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -289,7 +289,6 @@ cc_library_static {
"src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
"src/core/NEON/kernels/NEPadLayerKernel.cpp",
"src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp",
- "src/core/NEON/kernels/NEPoolingLayerKernel.cpp",
"src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
"src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp",
"src/core/NEON/kernels/NEQuantizationLayerKernel.cpp",
@@ -340,7 +339,6 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/quantized.cpp",
"src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
"src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
- "src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp",
"src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp",
@@ -409,6 +407,8 @@ cc_library_static {
"src/core/cpu/kernels/CpuFillKernel.cpp",
"src/core/cpu/kernels/CpuFloorKernel.cpp",
"src/core/cpu/kernels/CpuPermuteKernel.cpp",
+ "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp",
+ "src/core/cpu/kernels/CpuPoolingKernel.cpp",
"src/core/cpu/kernels/CpuReshapeKernel.cpp",
"src/core/cpu/kernels/CpuSubKernel.cpp",
"src/core/cpu/kernels/activation/NEON/fp16.cpp",
@@ -736,7 +736,6 @@ cc_library_static {
"src/runtime/NEON/functions/NEPermute.cpp",
"src/runtime/NEON/functions/NEPhase.cpp",
"src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp",
- "src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp",
"src/runtime/NEON/functions/NEPoolingLayer.cpp",
"src/runtime/NEON/functions/NEPriorBoxLayer.cpp",
"src/runtime/NEON/functions/NEQLSTMLayer.cpp",
@@ -796,6 +795,8 @@ cc_library_static {
"src/runtime/cpu/operators/CpuFill.cpp",
"src/runtime/cpu/operators/CpuFloor.cpp",
"src/runtime/cpu/operators/CpuPermute.cpp",
+ "src/runtime/cpu/operators/CpuPooling.cpp",
+ "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp",
"src/runtime/cpu/operators/CpuReshape.cpp",
"src/runtime/cpu/operators/CpuSub.cpp",
"src/runtime/gpu/cl/operators/ClActivation.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index d23913816..91b3a709f 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -32,17 +32,15 @@
namespace arm_compute
{
+// Forward declarations
class ITensor;
class ITensorInfo;
-class NEPoolingLayerKernel;
-class NEFillBorderKernel;
-class NEPoolingAssemblyDispatch;
/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels:
*
* -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref NEPoolingLayerKernel
- * -# @ref NEPoolingAssemblyDispatch
+ * -# @ref cpu::kernels::CpuPoolingKernel
+ * -# @ref cpu::CpuPoolingAssemblyDispatch
*/
class NEPoolingLayer : public IFunction
{
@@ -86,14 +84,8 @@ public:
void run() override;
private:
- std::shared_ptr<IMemoryManager> _memory_manager;
-
- std::unique_ptr<NEPoolingLayerKernel> _pooling_layer_kernel;
- std::unique_ptr<NEFillBorderKernel> _border_handler;
- std::unique_ptr<NEPoolingAssemblyDispatch> _asm_glue;
-
- bool _is_global_pooling_layer;
- DataLayout _data_layout;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
}
#endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 735f60ad2..ab2495dbf 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -166,7 +166,7 @@ v20.11 Public major release
- NELocallyConnectedMatrixMultiplyKernel
- @ref NEGEMMLowpOffsetContributionKernel
- @ref NEGEMMTranspose1xWKernel
- - @ref NEPoolingLayerKernel
+ - NEPoolingLayerKernel
- @ref NEConvolutionKernel
- @ref NEDepthwiseConvolutionLayerNativeKernel
- @ref NEGEMMLowpMatrixMultiplyKernel
@@ -1120,7 +1120,7 @@ v18.01 Public maintenance release
- Added QASYMM8 support to the following NEON kernels:
- NEDepthwiseConvolutionLayer3x3Kernel
- @ref NEFillBorderKernel
- - @ref NEPoolingLayerKernel
+ - NEPoolingLayerKernel
- Added new examples:
- graph_cl_mobilenet_qasymm8.cpp
- graph_inception_v3.cpp
@@ -1299,7 +1299,7 @@ v17.03 Sources preview
- New NEON kernels / functions:
- NEActivationLayerKernel / @ref NEActivationLayer
- GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref NEGEMMInterleave4x4Kernel, @ref NEGEMMTranspose1xWKernel, @ref NEGEMMMatrixMultiplyKernel, @ref NEGEMMMatrixAdditionKernel / @ref NEGEMM
- - @ref NEPoolingLayerKernel / @ref NEPoolingLayer
+ - NEPoolingLayerKernel / @ref NEPoolingLayer
v17.02.1 Sources preview
- New OpenCL kernels / functions:
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 87eec3860..c636e5b3b 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -101,7 +101,6 @@
#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
-#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 488079062..10384d417 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,12 +33,8 @@
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/helpers/WindowHelpers.h"
-#include <algorithm>
-#include <cstdint>
-
namespace arm_compute
{
-class Coordinates;
namespace
{
inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
@@ -100,20 +96,26 @@ NEFillBorderKernel::NEFillBorderKernel()
void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ _tensor = tensor;
+ configure(tensor->info(), border_size, border_mode, constant_border_value);
+}
+
+void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_ERROR_ON(tensor->info()->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_ERROR_ON(tensor->data_type() == DataType::UNKNOWN);
- _tensor = tensor;
_border_size = border_size;
_mode = border_mode;
_constant_border_value = constant_border_value;
- _border_size.limit(tensor->info()->padding());
+ _border_size.limit(tensor->padding());
Window win;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ);
+ win.use_tensor_dimensions(tensor->tensor_shape(), Window::DimZ);
INEKernel::configure(win);
}
@@ -156,6 +158,12 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
}
}
+void NEFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ _tensor = tensors.get_tensor(TensorType::ACL_SRC_DST);
+ run(window, info);
+}
+
void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
{
uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
index 65908bebe..2c851583e 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.h
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,9 +65,21 @@ public:
*
*/
void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ /** Initialise the function.
+ *
+ * @note This kernel fills the borders within the XY-planes.
+ *
+ * @param[in,out] tensor Tensor info to process. Data types supported: All.
+ * @param[in] border_size Size of the border to fill in elements.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
private:
void fill_replicate_single_channel(const Window &window);
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
index 8cdfe2b95..f42272826 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,7 @@ public:
*
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] indices Tensor containing the offset to store the input elements in the output tensor.
- * @ref NEPoolingLayerKernel with indices should precede this function in order to
+ * @ref cpu::kernels::CpuPoolingKernel with indices should precede this function in order to
* properly reconstruct the output tensor.
* The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
diff --git a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp
deleted file mode 100644
index 04406663f..000000000
--- a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.cpp
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-void NEPoolingAssemblyWrapperKernel::configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, info)));
-
- const bool requantize = input->quantization_info() != output->quantization_info();
-
- switch(input->data_type())
- {
- case DataType::QASYMM8:
- if(requantize)
- {
- create_arm_pooling_requant<uint8_t, uint8_t>(input, output, info, cpu_info);
- }
- else
- {
- create_arm_pooling<uint8_t, uint8_t>(input, output, info, cpu_info);
- }
- break;
- case DataType::QASYMM8_SIGNED:
- if(requantize)
- {
- create_arm_pooling_requant<int8_t, int8_t>(input, output, info, cpu_info);
- }
- else
- {
- create_arm_pooling<int8_t, int8_t>(input, output, info, cpu_info);
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- create_arm_pooling<float16_t, float16_t>(input, output, info, cpu_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- create_arm_pooling<float, float>(input, output, info, cpu_info);
- break;
- default:
- break;
- }
-
- Window win = calculate_max_window(*output, Steps());
- INEKernel::configure(win);
-}
-
-Status NEPoolingAssemblyWrapperKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-#ifndef __aarch64__
- ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
-#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
- "Only AVG and MAX pooling are supported by assembly kernels");
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- const auto input_qinfo = input->quantization_info().uniform();
- const auto output_qinfo = output->quantization_info().uniform();
-
- if(input_qinfo != output_qinfo)
- {
- const float multiplier = input_qinfo.scale / output_qinfo.scale;
- int32_t output_multiplier{};
- int32_t output_shift{};
- ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- }
- else
- {
- if(input->data_type() == DataType::QASYMM8)
- {
- const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same input/output quantization info");
- }
- }
- }
- else
- {
- if(input->data_type() == DataType::QASYMM8)
- {
- // If output is not configured, the quantization info are the same
- const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same input/output quantization info");
- }
- }
- return Status{};
-}
-
-void NEPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_UNUSED(window);
- ARM_COMPUTE_UNUSED(info);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *input = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *output = tensors.get_tensor(TensorType::ACL_DST_0);
- ITensor *workspace = tensors.get_tensor(TensorType::ACL_DST_1);
-
- const auto in_ptr = input->buffer() + input->info()->offset_first_element_in_bytes();
- auto out_ptr = output->buffer() + output->info()->offset_first_element_in_bytes();
- auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
-
- const auto input_shape = input->info()->tensor_shape();
- const auto output_shape = output->info()->tensor_shape();
- const auto input_padding = input->info()->padding();
- const auto output_padding = output->info()->padding();
-
- const size_t ld_input_col = input_shape[0] + input_padding.left + input_padding.right;
- const size_t ld_input_row = ld_input_col * (input_shape[1] + input_padding.top + input_padding.bottom);
- const size_t ld_input_batch = ld_input_row * input_shape[2];
- const size_t ld_output_col = output_shape[0] + output_padding.right;
- const size_t ld_output_row = ld_output_col * (output_shape[1] + output_padding.top + output_padding.bottom);
- const size_t ld_output_batch = ld_output_row * output_shape[2];
-
- _kernel_asm->execute(in_ptr, ld_input_col, ld_input_row, ld_input_batch,
- out_ptr, ld_output_col, ld_output_row, ld_output_batch,
- working_space, info.thread_id, info.num_threads);
-}
-
-size_t NEPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
-{
- return _kernel_asm->get_working_size(num_threads);
-}
-
-bool NEPoolingAssemblyWrapperKernel::is_configured() const
-{
- return _kernel_asm != nullptr;
-}
-
-template <typename TypeInput, typename TypeOutput>
-void NEPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
- arm_conv::pooling::PoolingWindow window{};
- window.cols = static_cast<unsigned int>(info.pool_size.x());
- window.rows = static_cast<unsigned int>(info.pool_size.y());
-
- arm_conv::pooling::PoolingStride stride{};
- std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
- constexpr unsigned int idx_width = 1;
- constexpr unsigned int idx_height = 2;
- constexpr unsigned int idx_channels = 0;
- constexpr unsigned int idx_batches = 3;
-
- const unsigned int n_batches = input->dimension(idx_batches);
- const unsigned int input_rows = input->dimension(idx_height);
- const unsigned int input_cols = input->dimension(idx_width);
- const unsigned int n_channels = input->dimension(idx_channels);
- const unsigned int output_rows = output->dimension(idx_height);
- const unsigned int output_cols = output->dimension(idx_width);
-
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, input_rows, input_cols, n_channels, output_rows, output_cols, padding, nullptr);
-
- // Configure assembly pooling kernel
- auto pooling_kernel_asm = arm_conv::pooling::pooling<TypeInput, TypeOutput>(args);
- if(pooling_kernel_asm == nullptr)
- {
- // Configuration not supported: Leave function unconfigured:
- return;
- }
-
- _kernel_asm = std::move(pooling_kernel_asm);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void NEPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
- arm_conv::pooling::PoolingWindow window{};
- window.cols = static_cast<unsigned int>(info.pool_size.x());
- window.rows = static_cast<unsigned int>(info.pool_size.y());
-
- arm_conv::pooling::PoolingStride stride{};
- std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
- constexpr unsigned int idx_width = 1;
- constexpr unsigned int idx_height = 2;
- constexpr unsigned int idx_channels = 0;
- constexpr unsigned int idx_batches = 3;
-
- const unsigned int n_batches = input->dimension(idx_batches);
- const unsigned int input_rows = input->dimension(idx_height);
- const unsigned int input_cols = input->dimension(idx_width);
- const unsigned int n_channels = input->dimension(idx_channels);
- const unsigned int output_rows = output->dimension(idx_height);
- const unsigned int output_cols = output->dimension(idx_width);
-
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, input_rows, input_cols, n_channels, output_rows, output_cols, padding, nullptr);
-
- const auto input_qinfo = input->quantization_info().uniform();
- const auto output_qinfo = output->quantization_info().uniform();
-
- const float multiplier = input_qinfo.scale / output_qinfo.scale;
- int32_t output_multiplier{};
- int32_t output_shift{};
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-
- const arm_conv::pooling::Requantize32 requant_args(input_qinfo.offset,
- output_qinfo.offset,
- output_shift, // left shift
- 0, // right shift
- output_multiplier);
-
- // Configure assembly pooling kernel with requantization
- auto pooling_kernel_asm = arm_conv::pooling::pooling<TypeInput, TypeOutput, arm_conv::pooling::Requantize32>(args, requant_args);
- if(pooling_kernel_asm == nullptr)
- {
- // Configuration not supported: Leave function unconfigured:
- return;
- }
-
- _kernel_asm = std::move(pooling_kernel_asm);
-}
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
new file mode 100644
index 000000000..19a0e90d0
--- /dev/null
+++ b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // dst initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
+
+ const bool requantize = src->quantization_info() != dst->quantization_info();
+
+ switch(src->data_type())
+ {
+ case DataType::QASYMM8:
+ if(requantize)
+ {
+ create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
+ }
+ else
+ {
+ create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
+ }
+ break;
+ case DataType::QASYMM8_SIGNED:
+ if(requantize)
+ {
+ create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
+ }
+ else
+ {
+ create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
+ }
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
+ break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
+ create_arm_pooling<float, float>(src, dst, info, cpu_info);
+ break;
+ default:
+ break;
+ }
+
+ Window win = calculate_max_window(*dst, Steps());
+ INEKernel::configure(win);
+}
+
+Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+#ifndef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
+#endif /* __aarch64__ */
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
+ "Only AVG and MAX pooling are supported by assembly kernels");
+
+ if(dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+ const auto src_qinfo = src->quantization_info().uniform();
+ const auto dst_qinfo = dst->quantization_info().uniform();
+
+ if(src_qinfo != dst_qinfo)
+ {
+ const float multiplier = src_qinfo.scale / dst_qinfo.scale;
+ int32_t dst_multiplier{};
+ int32_t dst_shift{};
+ ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
+ }
+ else
+ {
+ if(src->data_type() == DataType::QASYMM8)
+ {
+ const bool has_padding = info.pad_stride_info.has_padding();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ }
+ }
+ }
+ else
+ {
+ if(src->data_type() == DataType::QASYMM8)
+ {
+ // If dst is not configured, the quantization info are the same
+ const bool has_padding = info.pad_stride_info.has_padding();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ }
+ }
+ return Status{};
+}
+
+void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+
+ ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0);
+ ITensor *workspace = tensors.get_tensor(TensorType::ACL_DST_1);
+
+ const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+ auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+ auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+
+ const auto src_shape = src->info()->tensor_shape();
+ const auto dst_shape = dst->info()->tensor_shape();
+ const auto src_padding = src->info()->padding();
+ const auto dst_padding = dst->info()->padding();
+
+ const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
+ const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
+ const size_t ld_src_batch = ld_src_row * src_shape[2];
+ const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
+ const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
+ const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
+
+ _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
+ out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
+ working_space, info.thread_id, info.num_threads);
+}
+
+size_t CpuPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
+{
+ return _kernel_asm->get_working_size(num_threads);
+}
+
+bool CpuPoolingAssemblyWrapperKernel::is_configured() const
+{
+ return _kernel_asm != nullptr;
+}
+
+template <typename Typesrc, typename Typedst>
+void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+{
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+
+ arm_conv::pooling::PoolingWindow window{};
+ window.cols = static_cast<unsigned int>(info.pool_size.x());
+ window.rows = static_cast<unsigned int>(info.pool_size.y());
+
+ arm_conv::pooling::PoolingStride stride{};
+ std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
+
+ const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+
+ constexpr unsigned int idx_width = 1;
+ constexpr unsigned int idx_height = 2;
+ constexpr unsigned int idx_channels = 0;
+ constexpr unsigned int idx_batches = 3;
+
+ const unsigned int n_batches = src->dimension(idx_batches);
+ const unsigned int src_rows = src->dimension(idx_height);
+ const unsigned int src_cols = src->dimension(idx_width);
+ const unsigned int n_channels = src->dimension(idx_channels);
+ const unsigned int dst_rows = dst->dimension(idx_height);
+ const unsigned int dst_cols = dst->dimension(idx_width);
+
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+
+ // Configure assembly pooling kernel
+ auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
+ if(pooling_kernel_asm == nullptr)
+ {
+ // Configuration not supported: Leave function unconfigured:
+ return;
+ }
+
+ _kernel_asm = std::move(pooling_kernel_asm);
+}
+
+template <typename Typesrc, typename Typedst>
+void CpuPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+{
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+
+ arm_conv::pooling::PoolingWindow window{};
+ window.cols = static_cast<unsigned int>(info.pool_size.x());
+ window.rows = static_cast<unsigned int>(info.pool_size.y());
+
+ arm_conv::pooling::PoolingStride stride{};
+ std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
+
+ const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+
+ constexpr unsigned int idx_width = 1;
+ constexpr unsigned int idx_height = 2;
+ constexpr unsigned int idx_channels = 0;
+ constexpr unsigned int idx_batches = 3;
+
+ const unsigned int n_batches = src->dimension(idx_batches);
+ const unsigned int src_rows = src->dimension(idx_height);
+ const unsigned int src_cols = src->dimension(idx_width);
+ const unsigned int n_channels = src->dimension(idx_channels);
+ const unsigned int dst_rows = dst->dimension(idx_height);
+ const unsigned int dst_cols = dst->dimension(idx_width);
+
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+
+ const auto src_qinfo = src->quantization_info().uniform();
+ const auto dst_qinfo = dst->quantization_info().uniform();
+
+ const float multiplier = src_qinfo.scale / dst_qinfo.scale;
+ int32_t dst_multiplier{};
+ int32_t dst_shift{};
+ quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
+
+ const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
+ dst_qinfo.offset,
+ dst_shift, // left shift
+ 0, // right shift
+ dst_multiplier);
+
+ // Configure assembly pooling kernel with requantization
+ auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
+ if(pooling_kernel_asm == nullptr)
+ {
+ // Configuration not supported: Leave function unconfigured:
+ return;
+ }
+
+ _kernel_asm = std::move(pooling_kernel_asm);
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
index b2fa5b571..34ec452de 100644
--- a/src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h
+++ b/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
@@ -21,58 +21,63 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_ASSEMBLY_POOLING_KERNEL_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_POOLING_KERNEL_WRAPPER_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
-#include "src/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
#include "src/core/NEON/kernels/assembly/pooling.hpp"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
#include "pool_common.hpp"
namespace arm_compute
{
-class ITensor;
-
+namespace cpu
+{
+namespace kernels
+{
/** This class is a wrapper for the assembly kernels.
*
* Some kernels were written in assembly and highly optimised for specific
* CPUs like A53 or A55. The arm compute library creates an instance of
- * NEPoolingAssemblyWrapperKernel and other auxiliary data structures to
+ * CpuPoolingAssemblyWrapperKernel and other auxiliary data structures to
* execute a single assembly kernel in the context of an NEFunction.
*
*/
-class NEPoolingAssemblyWrapperKernel final : public INEKernel
+class CpuPoolingAssemblyWrapperKernel final : public ICpuKernel
{
public:
/** Constructor
*/
- NEPoolingAssemblyWrapperKernel() = default;
- NEPoolingAssemblyWrapperKernel(NEPoolingAssemblyWrapperKernel &) = delete;
- NEPoolingAssemblyWrapperKernel(NEPoolingAssemblyWrapperKernel &&) = default;
- NEPoolingAssemblyWrapperKernel &operator=(NEPoolingAssemblyWrapperKernel &) = delete;
+ CpuPoolingAssemblyWrapperKernel() = default;
+ CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &) = delete;
+ CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &&) = default;
+ CpuPoolingAssemblyWrapperKernel &operator=(CpuPoolingAssemblyWrapperKernel &) = delete;
const char *name() const override
{
- return "NEPoolingAssemblyWrapperKernel";
+ return "CpuPoolingAssemblyWrapperKernel";
}
- /** Initialise the kernel's input and output.
+ /** Initialise the kernel's src and dst.
*
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Output tensor to store the result of pooling. Data types supported: same as @p input.
- * @param[in] info Pooling meta-data
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src.
+ * @param[in] info Pooling meta-data.
+ * @param[in] cpu_info CPU information needed to select the most appropriate kernel.
*/
- void configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
/** Indicates whether or not this function can be used to process the given parameters.
*
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Output tensor to store the result of pooling. Data types supported: same as @p input.
- * @param[in] info Pooling meta-data
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] dst Destination tensor to store the result of pooling. Data types supported: same as @p src.
+ * @param[in] info Pooling meta-data
*
* @return a status.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info);
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -94,23 +99,25 @@ public:
private:
/** Helper function to create the assembly kernel.
*
- * @param[in] input Input tensor info.
- * @param[in] output Output tensor info.
- * @param[in] info Pooling layer meta-data.
+ * @param[in] src Source tensor info.
+ * @param[in] dst Destination tensor info.
+ * @param[in] info Pooling layer meta-data.
*/
- template <typename TypeInput, typename TypeOutput>
- void create_arm_pooling(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+ template <typename Typesrc, typename Typedst>
+ void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
/** Helper function to create the assembly kernel with requantization support
*
- * @param[in] input Input tensor info.
- * @param[in] output Output tensor info.
- * @param[in] info Pooling layer meta-data.
+ * @param[in] src Source tensor info.
+ * @param[in] dst Destination tensor info.
+ * @param[in] info Pooling layer meta-data.
*/
- template <typename TypeInput, typename TypeOutput>
- void create_arm_pooling_requant(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+ template <typename Typesrc, typename Typedst>
+ void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
};
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_POOLING_KERNEL_WRAPPER_KERNEL_H */
+#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/cpu/kernels/CpuPoolingKernel.cpp
index b46843bad..a29aef498 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/cpu/kernels/CpuPoolingKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "src/core/cpu/kernels/CpuPoolingKernel.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -41,16 +38,14 @@
#include "support/ToolchainSupport.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include <algorithm>
#include <arm_neon.h>
-#include <cmath>
-#include <limits>
-#include <set>
-#include <string>
-#include <tuple>
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
using namespace misc::shape_calculator;
namespace
@@ -138,10 +133,10 @@ inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates
v = wrapper::vsetlane(elems[7], v, 7);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
int pool_stride_x = 0;
int pool_stride_y = 0;
@@ -149,25 +144,25 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
if(indices)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
}
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
- && (input->data_layout() == DataLayout::NHWC),
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
+ && (src->data_layout() == DataLayout::NHWC),
"exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
- || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON((dst->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
+ || (dst->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
if(indices)
{
@@ -188,29 +183,29 @@ Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsign
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
unsigned int &num_elems_processed_per_iteration,
BorderSize &border_size,
unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
+ // dst auto inizialitation if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
if(indices)
{
// Indices auto inizialitation if not yet initialized
- auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
- pool_info)))
+ auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src,
+ pool_info)))
.set_data_type(DataType::U32) /* we store the offset to the element */);
}
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
unsigned int num_elems_read_per_iteration = 0;
unsigned int num_elems_horizontal_window = 0;
int pool_stride_x = 0;
int pool_stride_y = 0;
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int input_width = input->dimension(idx_width);
- const int input_height = input->dimension(idx_height);
+ const int src_width = src->dimension(idx_width);
+ const int src_height = src->dimension(idx_height);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
const int pool_pad_right = pad_stride_info.pad_right();
@@ -219,9 +214,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const int pool_pad_bottom = pad_stride_info.pad_bottom();
const bool is_square = pool_size_x == pool_size_y;
- // Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
- input->dimension(idx_height),
+ // Check dst dimensions
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
pool_size_x,
pool_size_y,
pad_stride_info);
@@ -233,7 +228,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
if(is_square)
{
- switch(input->data_type())
+ switch(src->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -299,28 +294,28 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Number of iterations in X dimension
const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
// Upper limit for the number of right/bottom border elements that are accessed
- const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
+ const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height;
border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
border_size.right = std::max(upper_bound_w, pool_pad_right);
border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, pooled_w);
- output_shape.set(1, pooled_h);
- TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
- win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
- AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
+ TensorShape dst_shape{ src->tensor_shape() };
+ dst_shape.set(0, pooled_w);
+ dst_shape.set(1, pooled_h);
+ TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
+ win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, src_width + border_size.right, src_height + border_size.bottom);
+ AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window);
if(indices)
{
AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
- window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+ window_changed = update_window_and_padding(win, src_access, dst_access, indices_access);
}
else
{
- window_changed = update_window_and_padding(win, input_access, output_access);
+ window_changed = update_window_and_padding(win, src_access, dst_access);
}
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -438,79 +433,71 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo
} // namespace
-NEPoolingLayerKernel::NEPoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
-{
-}
-
-BorderSize NEPoolingLayerKernel::border_size() const
+BorderSize CpuPoolingKernel::border_size() const
{
return _border_size;
}
-void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
+void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
const bool is_global_pooling = pool_info.is_global_pooling;
const int pool_stride_x = pad_stride_info.stride().first;
// Get data layout
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Update pool size in case of global pooling
const Size2D pool_size(
- is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
- is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
+ is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
// Validate pool info before calling scaled_dimensions
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
- // Check output dimensions
+ // Check dst dimensions
unsigned int pooled_w;
unsigned int pooled_h;
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
- input->info()->dimension(idx_height),
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
pool_size.x(),
pool_size.y(),
pad_stride_info);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, pooled_w, pooled_h, indices, pool_size));
// Set instance variables
- _input = input;
- _output = output;
- _indices = indices;
_pool_info = pool_info;
- _data_layout = input->info()->data_layout();
+ _data_layout = src->data_layout();
_is_square = (pool_size.x() == pool_size.y());
// Get data type
- const DataType data_type = input->info()->data_type();
+ const DataType data_type = src->data_type();
const bool is_nchw = _data_layout == DataLayout::NCHW;
if(data_type == DataType::QASYMM8)
{
if(!is_nchw)
{
- _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
+ _func = &CpuPoolingKernel::poolingMxN_q8_nhwc<uint8_t>;
}
else
{
if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
{
- _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
+ _func = &CpuPoolingKernel::pooling2_q8_nchw<uint8_t>;
}
else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
{
- _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
+ _func = &CpuPoolingKernel::pooling3_q8_nchw<uint8_t>;
}
else
{
- _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
+ _func = &CpuPoolingKernel::poolingMxN_q8_nchw<uint8_t>;
}
}
}
@@ -518,21 +505,21 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
if(!is_nchw)
{
- _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
+ _func = &CpuPoolingKernel::poolingMxN_q8_nhwc<int8_t>;
}
else
{
if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
{
- _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
+ _func = &CpuPoolingKernel::pooling2_q8_nchw<int8_t>;
}
else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
{
- _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
+ _func = &CpuPoolingKernel::pooling3_q8_nchw<int8_t>;
}
else
{
- _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
+ _func = &CpuPoolingKernel::poolingMxN_q8_nchw<int8_t>;
}
}
}
@@ -540,7 +527,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
if(!is_nchw)
{
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
+ _func = &CpuPoolingKernel::poolingMxN_f16_nhwc;
}
else
{
@@ -550,24 +537,24 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
case 2:
{
- _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
+ _func = &CpuPoolingKernel::pooling2_f16_nchw;
}
break;
case 3:
{
- _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
+ _func = &CpuPoolingKernel::pooling3_f16_nchw;
}
break;
default:
{
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+ _func = &CpuPoolingKernel::poolingMxN_f16_nchw;
break;
}
}
}
else
{
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+ _func = &CpuPoolingKernel::poolingMxN_f16_nchw;
}
}
}
@@ -575,7 +562,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
if(!is_nchw)
{
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
+ _func = &CpuPoolingKernel::poolingMxN_f32_nhwc;
}
else
{
@@ -585,29 +572,29 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
case 2:
{
- _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
+ _func = &CpuPoolingKernel::pooling2_f32_nchw;
break;
}
case 3:
{
- _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
+ _func = &CpuPoolingKernel::pooling3_f32_nchw;
break;
}
case 7:
{
- _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
+ _func = &CpuPoolingKernel::pooling7_f32_nchw;
break;
}
default:
{
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+ _func = &CpuPoolingKernel::poolingMxN_f32_nchw;
break;
}
}
}
else
{
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+ _func = &CpuPoolingKernel::poolingMxN_f32_nchw;
}
}
}
@@ -615,19 +602,19 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
if(!is_nchw)
{
// Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
+ Window win = calculate_max_window(*dst, Steps());
Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
- INEKernel::configure(win);
+ coord.set_num_dimensions(dst->num_dimensions());
+ dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
+ ICpuKernel::configure(win);
}
else
{
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
- pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
+ auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
+ _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ ICpuKernel::configure(win_config.second);
}
}
@@ -666,10 +653,10 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id,
}
template <typename T>
-void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling2_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
/** NEON vector types */
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -688,26 +675,26 @@ void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Wi
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const T *const src_top_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
- const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
- const bool have_different_qinfo = input_qinfo != output_qinfo;
+ const UniformQuantizationInfo src_qinfo = _src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = _dst->info()->quantization_info().uniform();
+ const bool have_different_qinfo = src_qinfo != dst_qinfo;
- const float requant_scale = output_qinfo.scale / input_qinfo.scale;
- const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
execute_window_loop(window, [&](const Coordinates & id)
{
- const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
- const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
+ const auto top_data = wrapper::vloadq(src_top_ptr + src.offset());
+ const auto bottom_data = wrapper::vloadq(src_bottom_ptr + src.offset());
q8x8_t lower_res = {};
q8x8_t upper_res = {};
@@ -774,32 +761,32 @@ void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Wi
if(have_different_qinfo)
{
- const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
- lower_res = wrapper::vgetlow(requantized_output);
- upper_res = wrapper::vgethigh(requantized_output);
+ const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
+ lower_res = wrapper::vgetlow(requantized_dst);
+ upper_res = wrapper::vgethigh(requantized_dst);
}
// Store result
if(pool_stride_x == 1)
{
const q8x8x2_t res = { { lower_res, upper_res } };
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), res);
}
else
{
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), lower_res);
}
},
- input, output);
+ src, dst);
}
-void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling3_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
ARM_COMPUTE_UNUSED(pooling_type);
ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
constexpr const int pool_size = 3;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
@@ -809,18 +796,18 @@ void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const W
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+ const unsigned char *const src_top_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_middle_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const unsigned char *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
execute_window_loop(window, [&](const Coordinates & id)
{
- float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
- float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
- float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + src.offset()));
+ float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(src_middle_ptr + src.offset()));
+ float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + src.offset()));
float16x4_t res = {};
// Get power of 2 in case of l2 pooling
@@ -854,11 +841,11 @@ void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const W
res = vinv_f16(vinvsqrt_f16(res));
}
- *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+ *(reinterpret_cast<float16_t *>(dst.ptr())) = vget_lane_f16(res, 0);
},
- input, output);
+ src, dst);
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window_src);
ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -867,52 +854,52 @@ void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const W
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
-f16_to_f32(float16x4_t input)
+f16_to_f32(float16x4_t src)
{
- float32x2_t output = { static_cast<float>(vget_lane_f16(input, 0)), static_cast<float>(vget_lane_f16(input, 1)) };
- return output;
+ float32x2_t dst = { static_cast<float>(vget_lane_f16(src, 0)), static_cast<float>(vget_lane_f16(src, 1)) };
+ return dst;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <typename T>
inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
-f16_to_f32(float32x2_t input)
+f16_to_f32(float32x2_t src)
{
- return input;
+ return src;
}
template <typename T>
-void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
+void CpuPoolingKernel::pooling2_nchw_maxpool_indices(const Window &window_src, const Window &window)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
Iterator indices(_indices, window);
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const int pad_left = _input->info()->padding().left;
- const int pad_right = _input->info()->padding().right;
- const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
+ const uint8_t *const src_top_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const int pad_left = _src->info()->padding().left;
+ const int pad_right = _src->info()->padding().right;
+ const int in_stride_y = static_cast<int>(_src->info()->strides_in_bytes().y());
execute_window_loop(window, [&](const Coordinates & id)
{
- auto top_data = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
- auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
+ auto top_data = wrapper::vload(reinterpret_cast<const T *>(src_top_ptr + src.offset()));
+ auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(src_bottom_ptr + src.offset()));
float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
// Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
- const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
- const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
- const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
- *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+ const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
+ const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
+ const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
+ *(reinterpret_cast<T *>(dst.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
// Calculate max data indice, which will be used in max unpool.
- const uint32_t offset_base = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+ const uint32_t offset_base = offset_no_padding<T>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
@@ -921,22 +908,22 @@ void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_in
const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
*(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
},
- input, output, indices);
+ src, dst, indices);
}
-void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling2_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
ARM_COMPUTE_UNUSED(pooling_type);
ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if(pooling_type == PoolingType::MAX && _indices)
{
- pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
+ pooling2_nchw_maxpool_indices<float16_t>(window_src, window);
}
else
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
constexpr int pool_size = 2;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
@@ -944,16 +931,16 @@ void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const W
const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
int pool_stride_x, pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const unsigned char *const src_top_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
execute_window_loop(window, [&](const Coordinates & id)
{
- float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
- float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + src.offset()));
+ float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + src.offset()));
float16x4_t res = {};
// Get power of 2 in case of l2 pooling
@@ -984,22 +971,22 @@ void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const W
}
// Store result
- *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+ *(reinterpret_cast<float16_t *>(dst.ptr())) = vget_lane_f16(res, 0);
},
- input, output);
+ src, dst);
}
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window_src);
ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
template <typename T>
-void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling3_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
/** NEON vector types */
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -1017,25 +1004,25 @@ void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Wi
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &src_qinfo = _src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = _dst->info()->quantization_info().uniform();
- const float requant_scale = output_qinfo.scale / input_qinfo.scale;
- const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
- const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
+ const T *const src_top_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_middle_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(_src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
execute_window_loop(window, [&](const Coordinates & id)
{
- const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
- const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
- const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
+ const auto top_data = wrapper::vloadq(src_top_ptr + src.offset());
+ const auto middle_data = wrapper::vloadq(src_middle_ptr + src.offset());
+ const auto bottom_data = wrapper::vloadq(src_bottom_ptr + src.offset());
q8x8_t fres = {};
q8x16_t fqres = {};
@@ -1130,34 +1117,34 @@ void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Wi
// Store result
if(pool_stride_x == 1)
{
- if(input_qinfo != output_qinfo)
+ if(src_qinfo != dst_qinfo)
{
fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
}
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), fqres);
}
else
{
- if(input_qinfo != output_qinfo)
+ if(src_qinfo != dst_qinfo)
{
fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
}
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()), fres);
}
},
- input, output);
+ src, dst);
}
-void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
ARM_COMPUTE_UNUSED(pooling_type);
ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
- const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
- const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
+ const int pool_size_x = _pool_info.is_global_pooling ? _src->info()->tensor_shape().x() : _pool_info.pool_size.width;
+ const int pool_size_y = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.height;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
@@ -1165,8 +1152,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -1185,8 +1172,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const
int x = 0;
for(; x <= (pool_size_x - 8); x += 8)
{
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
- (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
// Get power of 2 in case of l2 pooling and accumulate
if(pooling_type == PoolingType::L2)
@@ -1202,8 +1189,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const
// Leftover for loop
for(; x < pool_size_x; ++x)
{
- float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+ float16_t data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x())
+ + (y - pool_pad_top) * static_cast<int>(_src->info()->strides_in_bytes().y())));
// Get power of 2 in case of l2 pooling
if(pooling_type == PoolingType::L2)
@@ -1235,16 +1222,16 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const
int x = 0;
for(; x <= (pool_size_x - 8); x += 8)
{
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
- (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
vres = vmaxq_f16(vres, data);
}
// Leftover for loop
for(; x < pool_size_x; ++x)
{
- const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
+ const float16_t data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x())
+ + (y - pool_pad_top) * static_cast<int>(_src->info()->strides_in_bytes().y())));
res = std::max(res, data);
}
}
@@ -1263,19 +1250,19 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const
}
// Store result
- *(reinterpret_cast<float16_t *>(output.ptr())) = res;
+ *(reinterpret_cast<float16_t *>(dst.ptr())) = res;
},
- input, output);
+ src, dst);
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window_src);
ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
+void CpuPoolingKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_src, const Window &window)
{
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -1284,8 +1271,8 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo
Window window_out = window;
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(_input, window_input);
- Iterator output(_output, window_out);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window_out);
Iterator indices(_indices, window_out);
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
@@ -1295,9 +1282,9 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int pad_right = _input->info()->padding().right;
- const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
- const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
+ const int pad_right = _src->info()->padding().right;
+ const int in_stride_y = static_cast<int>(_src->info()->strides_in_bytes().y());
+ const int in_stride_z = static_cast<int>(_src->info()->strides_in_bytes().z());
execute_window_loop(window_out, [&](const Coordinates & id)
{
@@ -1306,36 +1293,36 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo
const int pool_limit_y = pool_pad_top - idx_height;
const int pool_limit_x = pool_pad_left - idx_width;
- const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
+ const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
+ const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
+ const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
int x_off = window_start_x;
for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- const auto in_x0_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off;
- const auto in_x1_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off;
- const auto in_x2_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off;
- const auto in_x3_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off;
+ const auto in_x0_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x0_offset) + x_off;
+ const auto in_x1_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x1_offset) + x_off;
+ const auto in_x2_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x2_offset) + x_off;
+ const auto in_x3_ptr = reinterpret_cast<const float16_t *>(src.ptr() + in_x3_offset) + x_off;
const auto v_x0 = vld1q_f16(in_x0_ptr);
const auto v_x1 = vld1q_f16(in_x1_ptr);
const auto v_x2 = vld1q_f16(in_x2_ptr);
const auto v_x3 = vld1q_f16(in_x3_ptr);
float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
// Store result
- vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
+ vst1q_f16(reinterpret_cast<float16_t *>(dst.ptr()) + x_off, vres);
- const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+ const uint32_t offset_base = offset_no_padding<float16_t>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _src->info()->tensor_shape()[1];
const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
@@ -1362,19 +1349,19 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo
// Left-overs loop
for(; x_off < window_end_x; ++x_off)
{
- const auto x0 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off);
+ const auto x0 = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float16_t *>(src.ptr() + in_x3_offset) + x_off);
float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
// Store result
- *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
+ *(reinterpret_cast<float16_t *>(dst.ptr()) + x_off) = res;
- const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+ const uint32_t offset_base = offset_no_padding<float16_t>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _src->info()->tensor_shape()[1];
const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
@@ -1384,18 +1371,18 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo
*(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
}
},
- input, output, indices);
+ src, dst, indices);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f16_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
ARM_COMPUTE_UNUSED(pooling_type);
ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
{
- pooling2_f16_nhwc_maxpool_indices(window_input, window);
+ pooling2_f16_nhwc_maxpool_indices(window_src, window);
}
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -1404,11 +1391,11 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
Window window_out = window;
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(_input, window_input);
- Iterator output(_output, window_out);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window_out);
- const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
- const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
+ const int pool_size_x = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.width;
+ const int pool_size_y = _pool_info.is_global_pooling ? _src->info()->tensor_shape().z() : _pool_info.pool_size.height;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
@@ -1416,8 +1403,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
float16x8_t vres;
@@ -1428,10 +1415,10 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
const int pool_limit_y = pool_pad_top - idx_height;
const int pool_limit_x = pool_pad_left - idx_width;
- const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
int x_off = window_start_x;
for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
@@ -1449,8 +1436,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
- (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
// Get power of 2 in case of l2 pooling and accumulate
if(pooling_type == PoolingType::L2)
@@ -1474,8 +1461,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
- (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
vres = vmaxq_f16(vres, data);
}
}
@@ -1489,7 +1476,7 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
}
// Store result
- vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
+ vst1q_f16(reinterpret_cast<float16_t *>(dst.ptr()) + x_off, vres);
}
// Left-overs loop
@@ -1507,8 +1494,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const float data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
// Get power of 2 in case of l2 pooling and accumulate
if(pooling_type == PoolingType::L2)
@@ -1532,8 +1519,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const float16_t data = *(reinterpret_cast<const float16_t *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
res = std::max(res, data);
}
}
@@ -1546,25 +1533,25 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const
}
// Store result
- *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
+ *(reinterpret_cast<float16_t *>(dst.ptr()) + x_off) = res;
}
},
- input, output);
+ src, dst);
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window_src);
ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
- const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
- const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
+ const int pool_size_x = _pool_info.is_global_pooling ? _src->info()->tensor_shape().x() : _pool_info.pool_size.width;
+ const int pool_size_y = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.height;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
@@ -1572,8 +1559,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -1592,8 +1579,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const
int x = 0;
for(; x <= (pool_size_x - 4); x += 4)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
// Get power of 2 in case of l2 pooling and accumulate
if(pooling_type == PoolingType::L2)
@@ -1609,8 +1596,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const
// Leftover for loop
for(; x < pool_size_x; ++x)
{
- float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
// Get power of 2 in case of l2 pooling
if(pooling_type == PoolingType::L2)
@@ -1645,16 +1632,16 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const
int x = 0;
for(; x <= (pool_size_x - 4); x += 4)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
vres = vmaxq_f32(vres, data);
}
// Leftover for loop
for(; x < pool_size_x; ++x)
{
- const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ const float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
res = std::max(res, data);
}
}
@@ -1676,22 +1663,22 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const
}
// Store result
- *(reinterpret_cast<float *>(output.ptr())) = res;
+ *(reinterpret_cast<float *>(dst.ptr())) = res;
},
- input, output);
+ src, dst);
}
-void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
- bool exclude_padding)
+void CpuPoolingKernel::pooling2_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type,
+ bool exclude_padding)
{
if(pooling_type == PoolingType::MAX && _indices)
{
- pooling2_nchw_maxpool_indices<float>(window_input, window);
+ pooling2_nchw_maxpool_indices<float>(window_src, window);
}
else
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
constexpr int pool_size = 2;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
@@ -1700,16 +1687,16 @@ void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const W
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const uint8_t *const src_top_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
execute_window_loop(window, [&](const Coordinates & id)
{
- const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
- const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
+ const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + src.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + src.offset());
float32x2_t top_data = vld1_f32(in_top_ptr);
float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
float32x2_t res = {};
@@ -1745,16 +1732,16 @@ void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const W
}
// Store result
- *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ *(reinterpret_cast<float *>(dst.ptr())) = final_res;
},
- input, output);
+ src, dst);
}
}
-void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling3_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
constexpr const int pool_size = 3;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
@@ -1764,18 +1751,18 @@ void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const W
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+ const uint8_t *const src_top_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_middle_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const uint8_t *const src_bottom_ptr = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
execute_window_loop(window, [&](const Coordinates & id)
{
- float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
- float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(src_top_ptr + src.offset()));
+ float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(src_middle_ptr + src.offset()));
+ float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(src_bottom_ptr + src.offset()));
float32x2_t res = {};
float final_res = 0;
@@ -1813,15 +1800,15 @@ void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const W
}
// Store result
- *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ *(reinterpret_cast<float *>(dst.ptr())) = final_res;
},
- input, output);
+ src, dst);
}
-void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::pooling7_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
constexpr const int pool_size = 7;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
@@ -1831,13 +1818,13 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- std::array<const uint8_t *, pool_size> input_ptrs{ {} };
+ std::array<const uint8_t *, pool_size> src_ptrs{ {} };
for(int i = 0; i < pool_size; ++i)
{
- input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+ src_ptrs[i] = _src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
}
execute_window_loop(window, [&](const Coordinates & id)
@@ -1851,7 +1838,7 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
- float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + src.offset()));
// Get power of 2 in case of l2 pooling
if(pooling_type == PoolingType::L2)
{
@@ -1861,7 +1848,7 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W
float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
for(int i = 1; i < pool_size; ++i)
{
- data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + src.offset()));
// Get power of 2 in case of l2 pooling
if(pooling_type == PoolingType::L2)
{
@@ -1876,10 +1863,10 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W
}
else
{
- float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + src.offset()));
for(int i = 1; i < pool_size; ++i)
{
- const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + src.offset()));
max_data = vmax2q_f32(max_data, data);
}
res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
@@ -1895,16 +1882,16 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W
}
// Store result
- *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ *(reinterpret_cast<float *>(dst.ptr())) = final_res;
},
- input, output);
+ src, dst);
}
-void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_f32_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
{
- pooling2_f32_nhwc_maxpool_indices(window_input, window);
+ pooling2_f32_nhwc_maxpool_indices(window_src, window);
}
else
{
@@ -1915,11 +1902,11 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
Window window_out = window;
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(_input, window_input);
- Iterator output(_output, window_out);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window_out);
- const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
- const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
+ const int pool_size_x = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.width;
+ const int pool_size_y = _pool_info.is_global_pooling ? _src->info()->tensor_shape().z() : _pool_info.pool_size.height;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
@@ -1927,8 +1914,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
float32x4_t vres;
@@ -1939,10 +1926,10 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
const int pool_limit_y = pool_pad_top - idx_height;
const int pool_limit_x = pool_pad_left - idx_width;
- const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
int x_off = window_start_x;
for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
@@ -1961,8 +1948,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
// Get power of 2 in case of l2 pooling and accumulate
if(pooling_type == PoolingType::L2)
@@ -1985,8 +1972,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
vres = vmaxq_f32(vres, data);
}
}
@@ -2004,7 +1991,7 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
}
// Store result
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
+ vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + x_off, vres);
}
// Left-overs loop
@@ -2022,8 +2009,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
// Get power of 2 in case of l2 pooling and accumulate
if(pooling_type == PoolingType::L2)
@@ -2047,8 +2034,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const float data = *(reinterpret_cast<const float *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
res = std::max(res, data);
}
}
@@ -2061,14 +2048,14 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
}
// Store result
- *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
+ *(reinterpret_cast<float *>(dst.ptr()) + x_off) = res;
}
},
- input, output);
+ src, dst);
}
}
-void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
+void CpuPoolingKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_src, const Window &window)
{
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -2077,8 +2064,8 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo
Window window_out = window;
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(_input, window_input);
- Iterator output(_output, window_out);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window_out);
Iterator indices(_indices, window_out);
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
@@ -2091,9 +2078,9 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo
float32x4_t vres;
float res;
- const int pad_right = _input->info()->padding().right;
- const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
- const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
+ const int pad_right = _src->info()->padding().right;
+ const int in_stride_y = static_cast<int>(_src->info()->strides_in_bytes().y());
+ const int in_stride_z = static_cast<int>(_src->info()->strides_in_bytes().z());
execute_window_loop(window_out, [&](const Coordinates & id)
{
@@ -2102,37 +2089,37 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo
const int pool_limit_y = pool_pad_top - idx_height;
const int pool_limit_x = pool_pad_left - idx_width;
- const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z());
+ const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
+ const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
+ const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
+ const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z());
int x_off = window_start_x;
for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
- const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
- const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
- const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
+ const auto in_x0_ptr = reinterpret_cast<const float *>(src.ptr() + in_x0_offset);
+ const auto in_x1_ptr = reinterpret_cast<const float *>(src.ptr() + in_x1_offset);
+ const auto in_x2_ptr = reinterpret_cast<const float *>(src.ptr() + in_x2_offset);
+ const auto in_x3_ptr = reinterpret_cast<const float *>(src.ptr() + in_x3_offset);
const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
// Store result
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
+ vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + x_off, vres);
- const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+ const uint32_t offset_base = offset_no_padding<float>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _src->info()->tensor_shape()[1];
const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
@@ -2149,19 +2136,19 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo
// Left-overs loop
for(; x_off < window_end_x; ++x_off)
{
- const auto x0 = *(reinterpret_cast<const float *>(input.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float *>(input.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float *>(input.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float *>(input.ptr() + in_x3_offset) + x_off);
+ const auto x0 = *(reinterpret_cast<const float *>(src.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float *>(src.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float *>(src.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float *>(src.ptr() + in_x3_offset) + x_off);
res = std::max(std::max(x2, x3), std::max(x0, x1));
// Store result
- *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
+ *(reinterpret_cast<float *>(dst.ptr()) + x_off) = res;
- const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+ const uint32_t offset_base = offset_no_padding<float>(src.offset(), id, *_src->info(), pool_stride_x, pool_stride_y);
const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _src->info()->tensor_shape()[1];
const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
@@ -2171,14 +2158,14 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo
*(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
}
},
- input, output, indices);
+ src, dst, indices);
}
template <typename T>
-void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window);
/** NEON vector types */
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -2187,8 +2174,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const
using q32_t = typename wrapper::traits::promote_t<q16_t>;
using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
- const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
- const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
+ const int pool_size_x = _pool_info.is_global_pooling ? _src->info()->tensor_shape().x() : _pool_info.pool_size.width;
+ const int pool_size_y = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.height;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
@@ -2196,11 +2183,11 @@ void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
- const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &src_qinfo = _src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = _dst->info()->quantization_info().uniform();
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -2220,8 +2207,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const
int x = 0;
for(; x <= (pool_size_x - 8); x += 8)
{
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
const q16x8_t data_q16 = wrapper::vmovl(data);
vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
@@ -2230,8 +2217,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const
// Leftover for loop
for(; x < pool_size_x; ++x)
{
- T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
sres += data;
}
}
@@ -2252,15 +2239,15 @@ void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const
int x = 0;
for(; x <= (pool_size_x - 8); x += 8)
{
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
vres = wrapper::vmax(vres, data);
}
// Leftover for loop
for(; x < pool_size_x; ++x)
{
- const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().y())));
+ const T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().y())));
res = std::max(res, data);
}
}
@@ -2274,14 +2261,14 @@ void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const
res = std::max(res, wrapper::vgetlane(vres, 0));
}
// Store result
- res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
+ *(reinterpret_cast<T *>(dst.ptr())) = res;
},
- input, output);
+ src, dst);
}
template <typename T>
-void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void CpuPoolingKernel::poolingMxN_q8_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -2291,8 +2278,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
Window window_out = window;
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(_input, window_input);
- Iterator output(_output, window_out);
+ Iterator src(_src, window_src);
+ Iterator dst(_dst, window_out);
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
@@ -2301,8 +2288,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
using q32_t = typename wrapper::traits::promote_t<q16_t>;
using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
- const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
- const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
+ const int pool_size_x = _pool_info.is_global_pooling ? _src->info()->tensor_shape().y() : _pool_info.pool_size.width;
+ const int pool_size_y = _pool_info.is_global_pooling ? _src->info()->tensor_shape().z() : _pool_info.pool_size.height;
const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
@@ -2311,20 +2298,20 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
int pool_stride_x = 0;
int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_w = _src->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _src->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
- const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
+ const UniformQuantizationInfo src_qinfo = _src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = _dst->info()->quantization_info().uniform();
- const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
+ const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
// "new_offset" doesn't have to consider the "half_scale_v" in its computation
// With a requantization performed in a single step there won't be uncertainties introduced
- const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
+ const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
- const float requant_scale = output_qinfo.scale / input_qinfo.scale;
- const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
execute_window_loop(window_out, [&](const Coordinates & id)
@@ -2334,10 +2321,10 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
const int pool_limit_y = pool_pad_top - idx_height;
const int pool_limit_x = pool_pad_left - idx_width;
- const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
int x_off = window_start_x;
for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
@@ -2358,8 +2345,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
@@ -2370,7 +2357,7 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
}
}
- if(input_qinfo != output_qinfo)
+ if(src_qinfo != dst_qinfo)
{
const float32x4x4_t vres =
{
@@ -2381,10 +2368,10 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
vcvtq_f32_q32(vres4),
}
};
- const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
// Store result
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, wrapper::vgetlow(requantized_output));
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, wrapper::vgethigh(requantized_output));
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
}
else
{
@@ -2398,8 +2385,8 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
// Store result
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, res1);
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, res2);
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off + 8, res2);
}
}
else
@@ -2410,14 +2397,14 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
vres = wrapper::vmax(vres, data);
}
}
// Store result
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
requant_qinfo) :
vres);
}
@@ -2432,15 +2419,15 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
vres = wrapper::vmax(vres, data);
}
}
// Store result
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off,
- (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ wrapper::vstore(reinterpret_cast<T *>(dst.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
}
}
@@ -2460,20 +2447,20 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
res += data;
}
}
- if(input_qinfo != output_qinfo)
+ if(src_qinfo != dst_qinfo)
{
- const float res_f = static_cast<float>(res);
- const float new_scale = quant_rescale / scale;
- const auto requantized_output = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
// Store result
- *(reinterpret_cast<T *>(output.ptr()) + x_off) = requantized_output;
+ *(reinterpret_cast<T *>(dst.ptr()) + x_off) = requantized_dst;
}
else
{
@@ -2481,7 +2468,7 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
// Store result
- *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
+ *(reinterpret_cast<T *>(dst.ptr()) + x_off) = res;
}
}
else
@@ -2492,32 +2479,32 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
{
for(int x = pool_start_x; x < pool_end_x; ++x)
{
- const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (_input->info()->strides_in_bytes().z())) + x_off);
+ const T data = *(reinterpret_cast<const T *>(src.ptr() + (x - pool_pad_left) * static_cast<int>(_src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
+ (_src->info()->strides_in_bytes().z())) + x_off);
res = std::max(res, data);
}
}
// Store result
- if(input_qinfo != output_qinfo)
+ if(src_qinfo != dst_qinfo)
{
- const float res_f = static_cast<float>(res);
- *(reinterpret_cast<T *>(output.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(dst.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
}
else
{
- *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
+ *(reinterpret_cast<T *>(dst.ptr()) + x_off) = res;
}
}
}
},
- input, output);
+ src, dst);
}
-Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
unsigned int pooled_w = 0;
unsigned int pooled_h = 0;
@@ -2529,25 +2516,25 @@ Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInf
unsigned int pool_size_y = 0;
// Get data layout
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
- pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
+ pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
// Validate pool info before calling scaled_dimensions
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
- // Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
- input->dimension(idx_height),
+ // Check dst dimensions
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
pool_size_x,
pool_size_y,
pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
(indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
pool_size_x, pool_size_y)
.first);
@@ -2555,24 +2542,28 @@ Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInf
return Status{};
}
-void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
+void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ _src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = tensors.get_tensor(TensorType::ACL_DST_0);
+ _indices = tensors.get_tensor(TensorType::ACL_DST_1);
+
const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
const unsigned int pool_size = _pool_info.pool_size.width;
const bool exclude_padding = _pool_info.exclude_padding;
- Window window_input(window);
+ Window window_src(window);
if(_data_layout == DataLayout::NCHW)
{
- // Set step for input in x and y direction for the input
+ // Set step for src in x and y direction for the src
unsigned int window_x_inc = 0;
- switch(_input->info()->data_type())
+ switch(_src->info()->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -2596,17 +2587,19 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR("Not supported");
}
}
- window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
- window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+ window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
+ window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
}
else
{
- window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
- window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+ window_src.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_src.set(Window::DimY, Window::Dimension(0, _src->info()->dimension(1), pool_stride_x));
+ window_src.set(Window::DimZ, Window::Dimension(0, _src->info()->dimension(2), pool_stride_y));
}
// Run function
- (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
+ (this->*_func)(window_src, window, _pool_info.pool_type, exclude_padding);
}
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.h b/src/core/cpu/kernels/CpuPoolingKernel.h
index aa3d2f3f0..036e43650 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/src/core/cpu/kernels/CpuPoolingKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,209 +21,206 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOLING_KERNEL_H
+#define ARM_COMPUTE_CPU_POOLING_KERNEL_H
-#include "src/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
namespace arm_compute
{
-class ITensor;
-
+namespace cpu
+{
+namespace kernels
+{
/** Interface for the pooling layer kernel */
-class NEPoolingLayerKernel : public INEKernel
+class CpuPoolingKernel : public ICpuKernel
{
public:
const char *name() const override
{
- return "NEPoolingLayerKernel";
+ return "CpuPoolingKernel";
}
/** Default constructor */
- NEPoolingLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
- /** Default destructor */
- ~NEPoolingLayerKernel() = default;
- /** Set the input and output tensors.
+ CpuPoolingKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPoolingKernel);
+ /** Configure kernel for a given list of arguments
*
* @note F16 are supported for pool sizes 2 and 3 only
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
+ void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuPoolingKernel
*
* @note F16 are supported for pool sizes 2 and 3 only
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[in] indices (optional) The indices of the maximal values. Data type supported: U32.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
BorderSize border_size() const override;
private:
/** Function to perform 2x2 pooling.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void pooling2_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
*/
- void pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window);
+ void pooling2_f32_nhwc_maxpool_indices(const Window &window_src, const Window &window);
/** Function to perform MxN pooling for 32-bit floating point values.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void poolingMxN_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform MxN pooling for 32-bit floating point values (NHWC).
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void poolingMxN_f32_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform 7x7 pooling.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void pooling7_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform 3x3 pooling.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void pooling3_f32_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform 2x2 pooling for float16_t.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void pooling2_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform 2x2 pooling and compute the pooling indices for FP32/FP16. The indices can be used for max unpool.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
*/
template <typename T>
- void pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window);
+ void pooling2_nchw_maxpool_indices(const Window &window_src, const Window &window);
/** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
*/
- void pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window);
+ void pooling2_f16_nhwc_maxpool_indices(const Window &window_src, const Window &window);
/** Function to perform 3x3 pooling.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void pooling3_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform MxN pooling for 16-bit floating point values.
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void poolingMxN_f16_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Function to perform MxN pooling for 16-bit floating point values. (NHWC)
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- void poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void poolingMxN_f16_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Template function to perform 2x2 pooling for 8bit quantized fixed point. (NCHW)
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
template <typename T>
- void pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void pooling2_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Template function to perform 3x3 pooling for 8bit quantized fixed point. (NCHW)
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
template <typename T>
- void pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void pooling3_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Template function to perform MxN pooling for 8-bit quantized. (NCHW)
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
template <typename T>
- void poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void poolingMxN_q8_nchw(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Template function to perform MxN pooling for 8-bit quantized. (NHWC)
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
template <typename T>
- void poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+ void poolingMxN_q8_nhwc(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
/** Common signature for all the specialised Pooling functions
*
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
+ * @param[in] window_src src region on which to execute the kernel.
+ * @param[in] window dst region on which to execute the kernel.
* @param[in] pooling_type Pooling operation to be computed.
* @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
*/
- using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding);
+ using PoolingFunction = void (CpuPoolingKernel::*)(const Window &window_src, const Window &window, PoolingType pooling_type, bool exclude_padding);
private:
- PoolingFunction _func;
- const ITensor *_input;
- ITensor *_output;
- ITensor *_indices;
- PoolingLayerInfo _pool_info;
- DataLayout _data_layout;
- unsigned int _num_elems_processed_per_iteration;
- BorderSize _border_size;
- bool _is_square;
+ PoolingFunction _func{ nullptr };
+ const ITensor *_src{ nullptr };
+ ITensor *_dst{ nullptr };
+ ITensor *_indices{ nullptr };
+ PoolingLayerInfo _pool_info{};
+ DataLayout _data_layout{ DataLayout::UNKNOWN };
+ unsigned int _num_elems_processed_per_iteration{ 0 };
+ BorderSize _border_size{ 0 };
+ bool _is_square{ false };
};
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H */
+#endif /*ARM_COMPUTE_CPU_POOLING_KERNEL_H */
diff --git a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp
deleted file mode 100644
index 427cd2eb7..000000000
--- a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/assembly/NEPoolingAssemblyWrapperKernel.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-NEPoolingAssemblyDispatch::~NEPoolingAssemblyDispatch() = default;
-
-void NEPoolingAssemblyDispatch::configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info)
-{
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
-
- // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!NEPoolingAssemblyDispatch::validate(input, output, info))
- {
- return;
- }
-
- auto pooling_wrapper = std::make_unique<NEPoolingAssemblyWrapperKernel>();
- ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
- pooling_wrapper->configure(input, output, info, ci);
-
- // Check if we have Global Pooling Layer
- _is_global_pooling_layer = (input->dimension(2) == info.pool_size.width) && (input->dimension(1) == info.pool_size.height);
-
- // Set workspace requirements
- const unsigned int alignment = 4096;
- _workspace.push_back(MemoryInfo(TensorType::ACL_DST_1, pooling_wrapper->get_working_size(num_threads), alignment));
-
- _kernel = std::move(pooling_wrapper);
-}
-
-Status NEPoolingAssemblyDispatch::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info)
-{
- return NEPoolingAssemblyWrapperKernel::validate(input, output, info);
-}
-
-bool NEPoolingAssemblyDispatch::is_configured() const
-{
- return _kernel != nullptr;
-}
-
-void NEPoolingAssemblyDispatch::run(ITensorPack &tensors)
-{
- if(tensors.empty())
- {
- ARM_COMPUTE_ERROR("No inputs provided");
- }
-
- if(_is_global_pooling_layer)
- {
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimX, _kernel->window(), tensors);
- }
- else
- {
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
- }
-}
-} // namespace experimental
-
-struct NEPoolingAssemblyDispatch::Impl
-{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- ITensor *workspace{ nullptr };
- std::unique_ptr<experimental::NEPoolingAssemblyDispatch> op{ nullptr };
-};
-
-NEPoolingAssemblyDispatch::NEPoolingAssemblyDispatch(NEPoolingAssemblyDispatch &&) = default;
-
-NEPoolingAssemblyDispatch &NEPoolingAssemblyDispatch::operator=(NEPoolingAssemblyDispatch &&) = default;
-
-NEPoolingAssemblyDispatch::~NEPoolingAssemblyDispatch() = default;
-
-NEPoolingAssemblyDispatch::NEPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>()),
- _memory_group(std::move(memory_manager)),
- _workspace()
-{
-}
-
-void NEPoolingAssemblyDispatch::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _impl->src = input;
- _impl->dst = output;
- _impl->workspace = &_workspace;
-
- _impl->op = std::make_unique<experimental::NEPoolingAssemblyDispatch>();
- _impl->op->configure(input->info(), output->info(), info);
-
- const auto workspace = _impl->op->workspace().at(0);
- if(workspace.size > 0)
- {
- // Allocate workspace
- allocate_workspace(workspace.size, workspace.alignment);
- }
-}
-
-Status NEPoolingAssemblyDispatch::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info)
-{
- return experimental::NEPoolingAssemblyDispatch::validate(input, output, info);
-}
-
-bool NEPoolingAssemblyDispatch::is_configured() const
-{
- return _impl->op->is_configured();
-}
-
-void NEPoolingAssemblyDispatch::run()
-{
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, _impl->src);
- pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
- pack.add_tensor(TensorType::ACL_DST_1, _impl->workspace);
- _impl->op->run(pack);
-}
-
-void NEPoolingAssemblyDispatch::allocate_workspace(size_t workspace_size, size_t alignment)
-{
- ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
- _memory_group.manage(&_workspace);
- _workspace.allocator()->allocate();
-}
-} //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h b/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h
deleted file mode 100644
index f6d232b93..000000000
--- a/src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPOOLINGASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_NEPOOLINGASSEMBLYDISPATCH_H
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/INEOperator.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward Declarations
-class ITensor;
-struct PoolingLayerInfo;
-
-/** Assembly kernel glue */
-class NEPoolingAssemblyDispatch : public IFunction
-{
-public:
- /** Constructor */
- NEPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPoolingAssemblyDispatch(const NEPoolingAssemblyDispatch &) = delete;
- /** Default move constructor */
- NEPoolingAssemblyDispatch(NEPoolingAssemblyDispatch &&);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPoolingAssemblyDispatch &operator=(const NEPoolingAssemblyDispatch &) = delete;
- /** Default move assignment operator */
- NEPoolingAssemblyDispatch &operator=(NEPoolingAssemblyDispatch &&);
- /** Destructor */
- ~NEPoolingAssemblyDispatch();
-
- /** If supported create an assembly routine, else fallback to Compute Library function.
- *
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Output tensor to store the result of pooling. Data types supported: same as @p input.
- * @param[in] info Pooling meta-data
- */
- void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &info);
-
- /** Indicates whether or not this function can be used to process the given parameters.
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Output tensor to store the result of pooling. Data types supported: same as @p input.
- * @param[in] info Pooling meta-data
- *
- * @return a status.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info);
-
- /** Was the function successfully configured ?
- *
- * @return True if the function is configured and ready to run
- */
- bool is_configured() const;
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- /** Helper function to allocate memory for the workspace needed by the
- * assembly kernels
- *
- * @param[in] workspace_size Total size of the workspace.
- * @param[in] alignment Alignment requirement in bytes.
- */
- void allocate_workspace(size_t workspace_size, size_t alignment);
-
- struct Impl;
- std::unique_ptr<Impl> _impl;
-
- MemoryGroup _memory_group{};
- Tensor _workspace{};
-};
-
-namespace experimental
-{
-/** Basic function to run pooling assembly kernels */
-class NEPoolingAssemblyDispatch : public INEOperator
-{
-public:
- /** Constructor */
- NEPoolingAssemblyDispatch() = default;
- /** Prevent instances of this class from being copied */
- NEPoolingAssemblyDispatch(const NEPoolingAssemblyDispatch &) = delete;
- /** Default move constructor */
- NEPoolingAssemblyDispatch(NEPoolingAssemblyDispatch &&) = default;
- /** Prevent instances of this class from being copied */
- NEPoolingAssemblyDispatch &operator=(const NEPoolingAssemblyDispatch &) = delete;
- /** Default move assignment operator */
- NEPoolingAssemblyDispatch &operator=(NEPoolingAssemblyDispatch &&) = default;
- /** Destructor */
- ~NEPoolingAssemblyDispatch();
-
- /** If supported create an assembly routine, else fallback to Compute Library function.
- *
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Output tensor to store the result of pooling. Data types supported: same as @p input.
- * @param[in] info Pooling meta-data
- */
- void configure(const ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &info);
-
- /** Indicates whether or not this function can be used to process the given parameters.
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Output tensor to store the result of pooling. Data types supported: same as @p input.
- * @param[in] info Pooling meta-data
- *
- * @return a status.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &info);
- /** Was the function successfully configured ?
- *
- * @return True if the function is configured and ready to run
- */
- bool is_configured() const;
- // Run method overriden
- void run(ITensorPack &tensors) override;
-
-private:
- bool _is_global_pooling_layer{ false };
-};
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEPOOLINGASSEMBLYDISPATCH_H */
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 0c857b54d..dd7a3a337 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -23,103 +23,48 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
-#include "src/runtime/NEON/functions/NEPoolingAssemblyDispatch.h"
+#include "arm_compute/core/Validate.h"
+#include "src/runtime/cpu/operators/CpuPooling.h"
namespace arm_compute
{
+struct NEPoolingLayer::Impl
+{
+ ITensor *src{ nullptr };
+ ITensor *dst{ nullptr };
+ ITensor *indices{ nullptr };
+ std::shared_ptr<IMemoryManager> memory_manager{ nullptr };
+ std::unique_ptr<cpu::CpuPooling> op{ nullptr };
+};
+
NEPoolingLayer::~NEPoolingLayer() = default;
NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_manager(std::move(memory_manager)), _pooling_layer_kernel(), _border_handler(), _asm_glue(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_manager = std::move(memory_manager);
}
void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
{
- // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
- const bool run_optimised = bool(NEPoolingAssemblyDispatch::validate(input->info(), output->info(), pool_info)) && (indices == nullptr);
-
- if(run_optimised)
- {
- _asm_glue = std::make_unique<NEPoolingAssemblyDispatch>(_memory_manager);
- _asm_glue->configure(input, output, pool_info);
- ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
- }
- else
- {
- // Check if we have Global Pooling Layer
- _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size.width) && (input->info()->dimension(1) == pool_info.pool_size.height);
-
- // Get data layout
- _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
-
- // Configure pooling kernel
- _pooling_layer_kernel = std::make_unique<NEPoolingLayerKernel>();
- _pooling_layer_kernel->configure(input, output, pool_info, indices);
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- // Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding)
- {
- zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
- }
- _border_handler = std::make_unique<NEFillBorderKernel>();
- _border_handler->configure(input, _pooling_layer_kernel->border_size(), border_mode, zero_value);
- break;
- }
- case DataLayout::NHWC:
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- }
+ _impl->src = input;
+ _impl->dst = output;
+ _impl->indices = indices;
+ _impl->op = std::make_unique<cpu::CpuPooling>(_impl->memory_manager);
+ _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
}
Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
- const bool run_optimised = bool(NEPoolingAssemblyDispatch::validate(input, output, pool_info)) && (indices == nullptr);
-
- if(run_optimised)
- {
- return Status{};
- }
-
- return NEPoolingLayerKernel::validate(input, output, pool_info, indices);
+ return cpu::CpuPooling::validate(input, output, pool_info, indices);
}
void NEPoolingLayer::run()
{
- if(_asm_glue && _asm_glue->is_configured())
- {
- _asm_glue->run();
- }
- else
- {
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- // Fill border
- NEScheduler::get().schedule(_border_handler.get(), Window::DimY);
-
- // Run pooling layer
- NEScheduler::get().schedule(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY);
- break;
- case DataLayout::NHWC:
- // Run pooling layer
- NEScheduler::get().schedule(_pooling_layer_kernel.get(), Window::DimX);
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- }
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+ pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+ pack.add_tensor(TensorType::ACL_DST_1, _impl->indices);
+ _impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPooling.cpp b/src/runtime/cpu/operators/CpuPooling.cpp
new file mode 100644
index 000000000..0b9b38d07
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPooling.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuPooling.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/cpu/kernels/CpuPoolingKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPooling::CpuPooling(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_manager(std::move(memory_manager)), _pooling_layer_kernel(), _border_handler(), _asm_glue(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
+{
+}
+
+CpuPooling::~CpuPooling() = default;
+
+void CpuPooling::configure(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+{
+ // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
+ const bool run_optimised = bool(CpuPoolingAssemblyDispatch::validate(input, output, pool_info)) && (indices == nullptr);
+
+ if(run_optimised)
+ {
+ _asm_glue = std::make_unique<CpuPoolingAssemblyDispatch>(_memory_manager);
+ _asm_glue->configure(input, output, pool_info);
+ ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
+ }
+ else
+ {
+ // Check if we have Global Pooling Layer
+ _is_global_pooling_layer = (input->dimension(0) == pool_info.pool_size.width) && (input->dimension(1) == pool_info.pool_size.height);
+
+ // Get data layout
+ _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
+
+ // Configure pooling kernel
+ auto k = std::make_unique<kernels::CpuPoolingKernel>();
+ k->configure(input, output, pool_info, indices);
+ _pooling_layer_kernel = std::move(k);
+
+ switch(_data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+ BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
+ if(is_data_type_quantized_asymmetric(input->data_type()) && !pool_info.exclude_padding)
+ {
+ zero_value = PixelValue(0, input->data_type(), input->quantization_info());
+ }
+ auto b = std::make_unique<NEFillBorderKernel>();
+ b->configure(input, _pooling_layer_kernel->border_size(), border_mode, zero_value);
+ _border_handler = std::move(b);
+ break;
+ }
+ case DataLayout::NHWC:
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+ }
+}
+
+Status CpuPooling::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+{
+ const bool run_optimised = bool(CpuPoolingAssemblyDispatch::validate(input, output, pool_info)) && (indices == nullptr);
+
+ if(run_optimised)
+ {
+ return Status{};
+ }
+
+ return kernels::CpuPoolingKernel::validate(input, output, pool_info, indices);
+}
+
+void CpuPooling::run(ITensorPack &tensors)
+{
+ if(_asm_glue && _asm_glue->is_configured())
+ {
+ _asm_glue->run(tensors);
+ }
+ else
+ {
+ switch(_data_layout)
+ {
+ case DataLayout::NCHW:
+ // Fill border
+ NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors);
+
+ // Run pooling layer
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
+ break;
+ case DataLayout::NHWC:
+ // Run pooling layer
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPooling.h b/src/runtime/cpu/operators/CpuPooling.h
new file mode 100644
index 000000000..aa607b4b4
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPooling.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOLING_H
+#define ARM_COMPUTE_CPU_POOLING_H
+
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward Declarations
+struct PoolingLayerInfo;
+
+namespace cpu
+{
+// Forward Declarations
+class CpuPoolingAssemblyDispatch;
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::CpuPoolingKernel
+ * -# @ref CpuPoolingAssemblyDispatch
+ */
+class CpuPooling : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuPooling(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuPooling(const CpuPooling &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuPooling &operator=(const CpuPooling &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    CpuPooling(CpuPooling &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    CpuPooling &operator=(CpuPooling &&) = delete;
+    /** Default destructor */
+    ~CpuPooling();
+    /** Set the src and dst tensors.
+     *
+     * @note F16 is supported for pool sizes 2 and 3 only
+     *
+     * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
+     */
+    void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuPooling
+     *
+     * @note F16 is supported for pool sizes 2 and 3 only
+     *
+     * @param[in] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst Destination tensor info. Data types supported: same as @p src.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices (optional) Tensor info of the indices of the maximal values. Data type supported: U32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::shared_ptr<IMemoryManager> _memory_manager; // NOTE(review): presumably forwarded to the assembly dispatch's constructor — confirm in configure() (not visible here)
+
+    std::unique_ptr<INEKernel> _pooling_layer_kernel; // portable pooling kernel, used when the assembly path is not taken
+    std::unique_ptr<INEKernel> _border_handler;       // NEFillBorderKernel, scheduled before the pooling kernel on the NCHW path
+    std::unique_ptr<CpuPoolingAssemblyDispatch> _asm_glue; // optimised assembly path; used in run() when is_configured()
+
+    bool       _is_global_pooling_layer; // selects DimZ (global) vs DimY scheduling for the NCHW kernel in run()
+    DataLayout _data_layout;             // cached layout driving the dispatch in run()
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOLING_H */
diff --git a/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp
new file mode 100644
index 000000000..4a5623394
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPoolingAssemblyDispatch::CpuPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), // memory manager (may be nullptr) backs the workspace allocation done in configure()
+      _workspace(),
+      _is_global_pooling_layer(false)
+{
+}
+
+CpuPoolingAssemblyDispatch::~CpuPoolingAssemblyDispatch() = default; // defined here so unique_ptr members see complete kernel types
+
+void CpuPoolingAssemblyDispatch::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info)
+{
+    const CPUInfo &ci = NEScheduler::get().cpu_info();
+    const unsigned int num_threads = NEScheduler::get().num_threads();
+
+    // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
+    if(!CpuPoolingAssemblyDispatch::validate(src, dst, info))
+    {
+        return;
+    }
+
+    auto pooling_wrapper = std::make_unique<kernels::CpuPoolingAssemblyWrapperKernel>();
+    ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); // defensive only: make_unique never returns nullptr
+    pooling_wrapper->configure(src, dst, info, ci);
+
+    // Check if we have Global Pooling Layer. NOTE(review): dimension(2)/width and dimension(1)/height pairing looks swapped for NHWC (dim1 = W, dim2 = H); only equivalent for square pools — confirm
+    _is_global_pooling_layer = (src->dimension(2) == info.pool_size.width) && (src->dimension(1) == info.pool_size.height);
+
+    // Allocate workspace based on kernel's memory requirements
+    constexpr size_t alignment = 4096;
+    const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+    _memory_group.manage(&_workspace);
+    _workspace.allocator()->allocate();
+
+    _kernel = std::move(pooling_wrapper); // publishing _kernel is what makes is_configured() return true
+}
+
+Status CpuPoolingAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+{
+    return kernels::CpuPoolingAssemblyWrapperKernel::validate(src, dst, info); // thin forwarder: support is decided entirely by the wrapper kernel
+}
+
+bool CpuPoolingAssemblyDispatch::is_configured() const
+{
+    return _kernel != nullptr; // configure() only sets _kernel after validation succeeds
+}
+
+void CpuPoolingAssemblyDispatch::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No srcs provided");
+
+    tensors.add_tensor(TensorType::ACL_DST_1, &_workspace); // expose the scratch buffer to the kernel as a second destination
+
+    if(_is_global_pooling_layer)
+    {
+        NEScheduler::get().schedule_op(_kernel.get(), Window::DimX, _kernel->window(), tensors); // global pooling: split work along X
+    }
+    else
+    {
+        NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); // otherwise split along Y
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h
new file mode 100644
index 000000000..353bbe1a7
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_DISPATCH_H
+#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_DISPATCH_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class ITensor;
+
+/** Basic function to run pooling assembly kernels */
+class CpuPoolingAssemblyDispatch : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuPoolingAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CpuPoolingAssemblyDispatch(const CpuPoolingAssemblyDispatch &) = delete;
+    /** Default move constructor */
+    CpuPoolingAssemblyDispatch(CpuPoolingAssemblyDispatch &&) = default;
+    /** Prevent instances of this class from being copied */
+    CpuPoolingAssemblyDispatch &operator=(const CpuPoolingAssemblyDispatch &) = delete;
+    /** Default move assignment operator */
+    CpuPoolingAssemblyDispatch &operator=(CpuPoolingAssemblyDispatch &&) = default;
+    /** Destructor */
+    ~CpuPoolingAssemblyDispatch();
+
+    /** If supported create an assembly routine, else fallback to Compute Library function.
+     *
+     * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src.
+     * @param[in] info Pooling meta-data
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info);
+
+    /** Indicates whether or not this function can be used to process the given parameters.
+     *
+     * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src.
+     * @param[in] info Pooling meta-data
+     *
+     * @return a status.
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
+    /** Was the function successfully configured?
+     *
+     * @return True if the function is configured and ready to run
+     */
+    bool is_configured() const;
+    // Run method overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    arm_compute::MemoryGroup _memory_group; // manages the lifetime of _workspace (see configure())
+
+    arm_compute::Tensor _workspace; // scratch buffer sized via the kernel's get_working_size(); passed to the kernel as ACL_DST_1 in run()
+    bool _is_global_pooling_layer;  // selects DimX (global) vs DimY scheduling in run()
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_DISPATCH_H */