Diffstat (limited to 'runtimes/nn/common/operations/internal')
13 files changed, 0 insertions, 6211 deletions
diff --git a/runtimes/nn/common/operations/internal/common.h b/runtimes/nn/common/operations/internal/common.h deleted file mode 100644 index 1bf1050fd..000000000 --- a/runtimes/nn/common/operations/internal/common.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (C) 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_RT_COMMON_H__ -#define __NNFW_RT_COMMON_H__ - -#ifndef USE_NEON -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#define USE_NEON -#include <arm_neon.h> -#endif -#endif - -#include "gemmlowp.h" -#include "types.h" - -namespace nnfw { -namespace rt { - -template <FusedActivationFunctionType Ac> -struct ActivationFunctionImpl {}; - -template <> -struct ActivationFunctionImpl<FusedActivationFunctionType::kNone> { - static float Eval(float x) { return x; } -}; - -template <> -struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu> { - static float Eval(float x) { return x < 0.f ? 0.f : x; } -}; - -template <> -struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu1> { - static float Eval(float x) { return x > 1.f ? 1.f : x < -1.f ? -1.f : x; } -}; - -template <> -struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu6> { - static float Eval(float x) { return x > 6.f ? 6.f : x < 0.f ? 0.f : x; } -}; - -template <FusedActivationFunctionType Ac> -float ActivationFunction(float x) { - return ActivationFunctionImpl<Ac>::Eval(x); -} - -inline int32 MultiplyByQuantizedMultiplierSmallerThanOne( - int32 x, int32 quantized_multiplier, int right_shift) { - using gemmlowp::RoundingDivideByPOT; - using gemmlowp::SaturatingRoundingDoublingHighMul; - return RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift); -} - -inline int32 MultiplyByQuantizedMultiplierGreaterThanOne( - int32 x, int32 quantized_multiplier, int left_shift) { - using gemmlowp::SaturatingRoundingDoublingHighMul; - return SaturatingRoundingDoublingHighMul(x * (1 << left_shift), - quantized_multiplier); -} - -} // namespace rt -} // namespace nnfw - -#endif // __NNFW_RT_COMMON_H__ diff --git a/runtimes/nn/common/operations/internal/compatibility.h b/runtimes/nn/common/operations/internal/compatibility.h deleted file mode 100644 index fd33cbd97..000000000 --- a/runtimes/nn/common/operations/internal/compatibility.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (C) 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
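The two gemmlowp helpers that common.h wraps above are a saturating rounding-doubling high multiply followed by a rounding right shift. A minimal portable sketch of their semantics, assuming gemmlowp's documented rounding behavior (illustration only, not part of the deleted file):

#include <cstdint>
#include <limits>

// Returns (a * b * 2) >> 32, rounded to nearest; saturates the single
// overflow case a == b == INT32_MIN.
inline std::int32_t SaturatingRoundingDoublingHighMulSketch(std::int32_t a,
                                                            std::int32_t b) {
  const bool overflow =
      a == b && a == std::numeric_limits<std::int32_t>::min();
  const std::int64_t ab = static_cast<std::int64_t>(a) * b;
  const std::int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const std::int32_t result =
      static_cast<std::int32_t>((ab + nudge) / (1ll << 31));
  return overflow ? std::numeric_limits<std::int32_t>::max() : result;
}

// Returns x >> exponent, rounded to nearest.
inline std::int32_t RoundingDivideByPOTSketch(std::int32_t x, int exponent) {
  const std::int32_t mask = static_cast<std::int32_t>((1ll << exponent) - 1);
  const std::int32_t remainder = x & mask;
  const std::int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

MultiplyByQuantizedMultiplierSmallerThanOne(x, m, s) above is then the composition RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(x, m), s), i.e. x * (m / 2^31) / 2^s with round-to-nearest at both stages.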
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_RT_COMPATIBILITY_H__ -#define __NNFW_RT_COMPATIBILITY_H__ - -#include <cassert> -#include <cstdint> - -#ifndef DCHECK -#define DCHECK(condition) (condition) ? (void)0 : assert(false) -#endif - -#ifndef DCHECK_EQ -#define DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) -#endif - -#ifndef DCHECK_GE -#define DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false) -#endif - -#ifndef DCHECK_GT -#define DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false) -#endif - -#ifndef DCHECK_LE -#define DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false) -#endif - -#ifndef DCHECK_LT -#define DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false) -#endif - -#ifndef CHECK_EQ -#define CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) -#endif - -using uint8 = std::uint8_t; -using int16 = std::int16_t; -using uint16 = std::uint16_t; -using int32 = std::int32_t; -using uint32 = std::uint32_t; - -#endif // __NNFW_RT_COMPATIBILITY_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/cpu_check.h b/runtimes/nn/common/operations/internal/optimized/cpu_check.h deleted file mode 100644 index 02f42fd42..000000000 --- a/runtimes/nn/common/operations/internal/optimized/cpu_check.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ -#define FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ - -// NEON_OR_PORTABLE(SomeFunc, arcs) calls NeonSomeFunc(args) if NEON is -// enabled at build time, or PortableSomeFunc(args) otherwise. -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__) -#else -#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__) -#endif - -#endif // FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h deleted file mode 100644 index 5c05bf20f..000000000 --- a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h +++ /dev/null @@ -1,792 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (C) 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
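The NEON_OR_PORTABLE macro in cpu_check.h above dispatches purely by name prefix at preprocessing time. A usage sketch with hypothetical function names (the Neon*/Portable* pair below is assumed for illustration):

// Two implementations of the same routine, differing only in prefix.
void NeonApplyBias(const float* bias, int n, float* out);
void PortableApplyBias(const float* bias, int n, float* out);

void ApplyBias(const float* bias, int n, float* out) {
  // Expands to NeonApplyBias(bias, n, out) on ARM NEON builds and to
  // PortableApplyBias(bias, n, out) everywhere else.
  NEON_OR_PORTABLE(ApplyBias, bias, n, out);
}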
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ -#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ - -#include "gemmlowp.h" -#include "../common.h" -#include "../types.h" - -namespace nnfw { -namespace rt { -namespace optimized_ops { - -// Implementation of float DepthwiseConv - -template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> -struct FloatDepthwiseConvKernel {}; - -#ifdef USE_NEON - -template <> -struct FloatDepthwiseConvKernel<false, 8, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const float* input_ptr, int input_ptr_increment, - const float* filter_ptr, float* acc_buffer_ptr) { - // Load the filters - float32x4_t filter[2]; - for (int i = 0; i < 2; i++) { - filter[i] = vld1q_f32(filter_ptr + 4 * i); - } - int outp = 0; - // Handle 2 output pixels at a time. - for (; outp <= num_output_pixels - 2; outp += 2) { - // Load the inputs - float32x4_t input[4]; - for (int i = 0; i < 4; i++) { - input[i] = vld1q_f32(input_ptr + 4 * i); - } - input_ptr += 16; - // Load the accumulators from acc_buffer - float32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); - acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); - acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); - acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the inputs - float32x4_t input[2]; - for (int i = 0; i < 2; i++) { - input[i] = vld1q_f32(input_ptr + 4 * i); - } - input_ptr += 8; - // Load the accumulators from acc_buffer - float32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - } -}; - -template <> -struct FloatDepthwiseConvKernel<false, 2, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const float* input_ptr, int input_ptr_increment, - const float* filter_ptr, float* acc_buffer_ptr) { - const float32x2_t filters = vld1_f32(filter_ptr); - const float32x4_t filters_dup2 = vcombine_f32(filters, filters); - int outp = 0; - // Handle 8 output pixels at a time. - for (; outp <= num_output_pixels - 8; outp += 8) { - // Load the inputs - float32x4_t input[4]; - for (int i = 0; i < 4; i++) { - input[i] = vld1q_f32(input_ptr + 4 * i); - } - input_ptr += 16; - // Load the accumulators from acc_buffer - float32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 4; i++) { - acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle 4 output pixels at a time. 
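      // (Annotation, not original code: vmlaq_f32(a, b, c) computes the
      //  per-lane multiply-accumulate a[i] + b[i] * c[i] on four floats;
      //  the float kernels here are built from it and its variants, plus
      //  vld1q/vst1q loads and stores.)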
- for (; outp <= num_output_pixels - 4; outp += 4) { - // Load the inputs - float32x4_t input[2]; - for (int i = 0; i < 2; i++) { - input[i] = vld1q_f32(input_ptr + 4 * i); - } - input_ptr += 8; - // Load the accumulators from acc_buffer - float32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - // Handle 2 output pixels at a time. - for (; outp <= num_output_pixels - 2; outp += 2) { - // Load the inputs - const float32x4_t input = vld1q_f32(input_ptr); - input_ptr += 4; - // Load the accumulators from acc_buffer - float32x4_t acc = vld1q_f32(acc_buffer_ptr); - // Multiply-accumulate - acc = vmlaq_f32(acc, input, filters_dup2); - // Store the accumulators back to acc_buffer - vst1q_f32(acc_buffer_ptr, acc); - acc_buffer_ptr += 4; - } - // Handle 1 output pixel at a time - for (; outp < num_output_pixels; outp++) { - // Load the inputs - const float32x2_t input = vld1_f32(input_ptr); - input_ptr += 2; - // Load the accumulators from acc_buffer - float32x2_t acc = vld1_f32(acc_buffer_ptr); - // Multiply-accumulate - acc = vmla_f32(acc, input, filters); - // Store the accumulators back to acc_buffer - vst1_f32(acc_buffer_ptr, acc); - acc_buffer_ptr += 2; - } - } -}; - -template <> -struct FloatDepthwiseConvKernel<true, 0, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const float* input_ptr, int input_ptr_increment, - const float* filter_ptr, float* acc_buffer_ptr) { - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - const float* local_filter_ptr = filter_ptr; - const float* local_input_ptr = input_ptr; - int ic = 0; - // Handle 16 input channels at a time. - for (; ic <= input_depth - 16; ic += 16) { - // Load the filters - float32x4_t filter[4]; - for (int i = 0; i < 4; i++) { - filter[i] = vld1q_f32(local_filter_ptr + 4 * i); - } - local_filter_ptr += 16; - // Load the inputs - float32x4_t input[4]; - for (int i = 0; i < 4; i++) { - input[i] = vld1q_f32(local_input_ptr + 4 * i); - } - local_input_ptr += 16; - // Load the accumulators from acc_buffer - float32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 4; i++) { - acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle 4 input channels at a time. - for (; ic <= input_depth - 4; ic += 4) { - // Load the filters - float32x4_t filter; - filter = vld1q_f32(local_filter_ptr); - local_filter_ptr += 4; - // Load the inputs - float32x4_t input; - input = vld1q_f32(local_input_ptr); - local_input_ptr += 4; - // Load the accumulators from acc_buffer - float32x4_t acc; - acc = vld1q_f32(acc_buffer_ptr); - // Multiply-accumulate - acc = vmlaq_f32(acc, input, filter); - // Store the accumulators back to acc_buffer - vst1q_f32(acc_buffer_ptr, acc); - acc_buffer_ptr += 4; - } - // Handle one input channel at a time. 
- for (; ic < input_depth; ic++) { - const float input_val = *local_input_ptr++; - const float filter_val = *local_filter_ptr++; - *acc_buffer_ptr++ += filter_val * input_val; - } - input_ptr += input_ptr_increment; - } - } -}; - -template <> -struct FloatDepthwiseConvKernel<true, 0, 8> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const float* input_ptr, int input_ptr_increment, - const float* filter_ptr, float* acc_buffer_ptr) { - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - const float* local_filter_ptr = filter_ptr; - const float* local_input_ptr = input_ptr; - int ic = 0; - // Handle 2 input channels at a time. - for (; ic <= input_depth - 2; ic += 2) { - // Load the filters - float32x4_t filter[4]; - for (int i = 0; i < 4; i++) { - filter[i] = vld1q_f32(local_filter_ptr + 4 * i); - } - local_filter_ptr += 16; - // Load the inputs - const float32x2_t input = vld1_f32(local_input_ptr); - local_input_ptr += 2; - // Load the accumulators from acc_buffer - float32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); - acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); - acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); - acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle one input channel at a time. - for (; ic < input_depth; ic++) { - // Load the filters - float32x4_t filter[2]; - for (int i = 0; i < 2; i++) { - filter[i] = vld1q_f32(local_filter_ptr + 4 * i); - } - local_filter_ptr += 8; - // Load the inputs - const float input_val = *local_input_ptr++; - // Load the accumulators from acc_buffer - float32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - input_ptr += input_ptr_increment; - } - } -}; - -template <> -struct FloatDepthwiseConvKernel<true, 0, 2> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const float* input_ptr, int input_ptr_increment, - const float* filter_ptr, float* acc_buffer_ptr) { - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - const float* local_filter_ptr = filter_ptr; - const float* local_input_ptr = input_ptr; - int ic = 0; - // Handle 8 input channels at a time. 
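      // (Annotation, not original code: every FloatDepthwiseConvKernel
      //  specialization computes the same scalar reference loop, differing
      //  only in vector width and unrolling:
      //    for each output pixel p:
      //      for each input channel ic:
      //        for each m in [0, depth_multiplier):
      //          acc_buffer[p * output_depth + ic * depth_multiplier + m] +=
      //              input[p][ic] * filter[ic * depth_multiplier + m]; )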
- for (; ic <= input_depth - 8; ic += 8) { - // Load the filters - float32x4_t filter[4]; - for (int i = 0; i < 4; i++) { - filter[i] = vld1q_f32(local_filter_ptr + 4 * i); - } - local_filter_ptr += 16; - // Load the inputs - float32x4x2_t input_dup2[2]; - for (int i = 0; i < 2; i++) { - const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); - input_dup2[i] = vzipq_f32(input, input); - } - local_input_ptr += 8; - // Load the accumulators from acc_buffer - float32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); - acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); - acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); - acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle 4 input channels at a time. - for (; ic <= input_depth - 4; ic += 4) { - // Load the filters - float32x2_t filter[4]; - for (int i = 0; i < 4; i++) { - filter[i] = vld1_f32(local_filter_ptr + 2 * i); - } - local_filter_ptr += 8; - // Load the inputs - const float32x4_t input = vld1q_f32(local_input_ptr); - local_input_ptr += 4; - // Load the accumulators from acc_buffer - float32x2_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); - } - // Multiply-accumulate - acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); - acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); - acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); - acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - // Handle 2 input channels at a time. - for (; ic <= input_depth - 2; ic += 2) { - // Load the filters - const float32x4_t filter = vld1q_f32(local_filter_ptr); - local_filter_ptr += 4; - // Load the inputs - const float32x2_t input = vld1_f32(local_input_ptr); - local_input_ptr += 2; - // Load the accumulators from acc_buffer - float32x2_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); - } - // Multiply-accumulate - acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); - acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); - } - acc_buffer_ptr += 4; - } - // Handle one input channel at a time. - for (; ic < input_depth; ic++) { - // Load the inputs - const float input_val = *local_input_ptr++; - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; - } - local_filter_ptr += 2; - acc_buffer_ptr += 2; - } - input_ptr += input_ptr_increment; - } - } -}; - -template <> -struct FloatDepthwiseConvKernel<true, 1, 8> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const float* input_ptr, int input_ptr_increment, - const float* filter_ptr, float* acc_buffer_ptr) { - // Handle one output pixel at a time. 
- for (int outp = 0; outp < num_output_pixels; outp++) { - // Load the filters - float32x4_t filter[2]; - for (int i = 0; i < 2; i++) { - filter[i] = vld1q_f32(filter_ptr + 4 * i); - } - // Load the inputs - const float input_val = *input_ptr; - input_ptr += input_ptr_increment; - // Load the accumulators from acc_buffer - float32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - } -}; - -template <> -struct FloatDepthwiseConvKernel<true, 0, 16> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const float* input_ptr, int input_ptr_increment, - const float* filter_ptr, float* acc_buffer_ptr) { - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - const float* local_filter_ptr = filter_ptr; - const float* local_input_ptr = input_ptr; - for (int ic = 0; ic < input_depth; ic++) { - // Load the filters - float32x4_t filter[4]; - for (int i = 0; i < 4; i++) { - filter[i] = vld1q_f32(local_filter_ptr + 4 * i); - } - local_filter_ptr += 16; - // Load the inputs - const float input_val = *local_input_ptr++; - // Load the accumulators from acc_buffer - float32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 4; i++) { - acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - input_ptr += input_ptr_increment; - } - } -}; -#endif - -// Accumulates the effect of one row of the filter, on a segment of one row -// of the output, accessing the corresponding one row of the input. -template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> -void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width, - const float* input_data, int pad_width, - int depth_multiplier, int filter_width, - const float* filter_data, - int out_x_buffer_start, int out_x_buffer_end, - int output_depth, float* acc_buffer) { -#ifdef GEMMLOWP_PROFILING - gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); -#endif - // Sanity check parameters. This is important in particular to ensure - // that we keep the number of template instantiations minimal, so we don't - // increase binary size unnecessarily. - static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); - static_assert(kFixedInputDepth || kAllowStrided, ""); - DCHECK(stride == 1 || kAllowStrided); - if (kFixedInputDepth) { - DCHECK_EQ(input_depth, kFixedInputDepth); - } - if (kFixedDepthMultiplier) { - DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); - } - DCHECK_EQ(output_depth, input_depth * depth_multiplier); - const int input_ptr_increment = stride * input_depth; - const float* filter_base_ptr = filter_data; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - // For the current (filter_x, filter_y) point in the filter, - // compute the boundaries of the corresponding output row segment. 
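  // (Annotation, not original code: the input column touched is
  //    in_x = out_x * stride - pad_width + filter_x,
  //  so requiring 0 <= in_x < input_width gives
  //    ceil((pad_width - filter_x) / stride) <= out_x <
  //    ceil((pad_width + input_width - filter_x) / stride),
  //  which is exactly what the (... + stride - 1) / stride expressions
  //  below evaluate, with the stride 2 and 4 cases special-cased.)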
- int out_x_loop_start_unclampled = 0; - int out_x_loop_end_unclampled = 0; - if (kAllowStrided) { - if (stride == 2) { - out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; - out_x_loop_end_unclampled = - (pad_width + input_width - filter_x + 1) / 2; - } else if (stride == 4) { - out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; - out_x_loop_end_unclampled = - (pad_width + input_width - filter_x + 3) / 4; - } else { - out_x_loop_start_unclampled = - (pad_width - filter_x + stride - 1) / stride; - out_x_loop_end_unclampled = - (pad_width + input_width - filter_x + stride - 1) / stride; - } - } else { - out_x_loop_start_unclampled = pad_width - filter_x; - out_x_loop_end_unclampled = pad_width + input_width - filter_x; - } - // The kernel will have to iterate on the segment of the - // output row that starts at out_x_loop_start and out_x_loop_end. - const int out_x_loop_start = - std::max(out_x_buffer_start, out_x_loop_start_unclampled); - const int out_x_loop_end = - std::min(out_x_buffer_end, out_x_loop_end_unclampled); - - float* acc_buffer_ptr = - acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; - const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; - const float* input_ptr = input_data + in_x_origin * input_depth; - const int num_output_pixels = out_x_loop_end - out_x_loop_start; - FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, - kFixedDepthMultiplier>::Run(num_output_pixels, - input_depth, - depth_multiplier, - input_ptr, - input_ptr_increment, - filter_base_ptr, - acc_buffer_ptr); - filter_base_ptr += output_depth; - } -} - -// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. -inline void FloatDepthwiseConvAccumRowGeneric( - int stride, int input_depth, int input_width, const float* input_data, - int pad_width, int depth_multiplier, int filter_width, - const float* filter_data, int out_x_buffer_start, int out_x_buffer_end, - int output_depth, float* acc_buffer) { - gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); - const float* filter_base_ptr = filter_data; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); - const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - filter_x + stride - 1) / stride); - - float* acc_buffer_ptr = - acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; - const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; - const float* input_ptr = input_data + in_x_origin * input_depth; - const int input_ptr_increment = (stride - 1) * input_depth; - for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { - const float* filter_ptr = filter_base_ptr; - for (int ic = 0; ic < input_depth; ++ic) { - const float input_val = *input_ptr++; - for (int m = 0; m < depth_multiplier; m++) { - const float filter_val = *filter_ptr++; - *acc_buffer_ptr++ += filter_val * input_val; - } - } - input_ptr += input_ptr_increment; - } - filter_base_ptr += output_depth; - } -} - -// Initializes the accumulator buffer with bias values. 
-inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, - const float* bias_data, - float* acc_buffer) { - for (int i = 0; i < num_output_pixels; i++) { - memcpy(acc_buffer + i * output_depth, bias_data, - sizeof(acc_buffer[0]) * output_depth); - } -} - -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, int depth_multiplier, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("DepthwiseConv"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); -#if 0 // TODO-NNRT : Check if assertion is needed, output depth some times not equal to input * depthmultiplier - DCHECK(output_depth == input_depth * depth_multiplier); -#endif - - static const int kAccBufferMaxSize = 1024; - float acc_buffer[kAccBufferMaxSize]; - DCHECK_GE(kAccBufferMaxSize, output_depth); - const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; - const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; - DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize); - DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); - DCHECK_GE(kOutputPixelsInAccBuffer, 1); - - // row_accum_func will point to the core accumulation function to be used - // for this DepthwiseConv op. - auto* row_accum_func = FloatDepthwiseConvAccumRowGeneric; - - const int kMaxFixedDepthMultiplier = 16; - int fixed_depth_multiplier = 0; - if (depth_multiplier <= kMaxFixedDepthMultiplier) { - fixed_depth_multiplier = depth_multiplier; - } - // kMaxUnrolling is the max number of output values that we aim to handle - // in one unrolled iteration of the inner loop. For practical performance - // reasons, it is limited by the number of available registers. We could - // fine-tune it depending on the architecture, but that's not worth doing - // since this whole code is not very optimized to begin with. The - // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit - // vector registers. 
- const int kMaxUnrolling = 8; - int fixed_input_depth = 0; - if (fixed_depth_multiplier && - input_depth * fixed_depth_multiplier <= kMaxUnrolling) { - fixed_input_depth = input_depth; - } -#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ - FIXED_DEPTH_MULTIPLIER) \ - if ((stride_width == 1 || ALLOW_STRIDED) && \ - fixed_input_depth == FIXED_INPUT_DEPTH && \ - fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ - row_accum_func = \ - FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ - FIXED_DEPTH_MULTIPLIER>; \ - } - -#ifdef USE_NEON - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) -#endif // USE_NEON - -#undef TFMINI_USE_DEPTHWISECONV_KERNEL - - // Now that we have determined row_accum_func, we can start work. - float* output_ptr = output_data; - for (int b = 0; b < batches; ++b) { - for (int out_y = 0; out_y < output_height; ++out_y) { - const int in_y_origin = (out_y * stride_height) - pad_height; - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); - for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; - out_x_buffer_start += kOutputPixelsInAccBuffer) { - const int out_x_buffer_end = std::min( - output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); - // We call a 'pixel' a group of activation that share all but the - // 'depth'/'channel' coordinate. num_output_pixels is the number of - // output pixels that we will accumulate in this loop iteration. - const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; - // Initialize our local accumulator with the bias values, so we don't - // have to add them later. - DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, - acc_buffer); - // Accumulation loop. Most of the time should be spent in here. - for (int filter_y = filter_y_start; filter_y < filter_y_end; - ++filter_y) { - const int in_y = in_y_origin + filter_y; - row_accum_func(stride_width, input_depth, input_width, - input_data + in_y * input_dims.strides[2] + - b * input_dims.strides[3], - pad_width, depth_multiplier, filter_width, - filter_data + filter_y * filter_dims.strides[2], - out_x_buffer_start, out_x_buffer_end, output_depth, - acc_buffer); - } - // Finished accumulating. Now store to destination. 
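          // (Annotation, not original code: the store stage below also folds
          //  in the fused activation, clamping with vminq_f32/vmaxq_f32 to
          //  [0, inf) for Relu, [0, 6] for Relu6 and [-1, 1] for Relu1, in
          //  blocks of 16, then 4, then 1 value.)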
- const int num_output_values = output_depth * num_output_pixels; - int i = 0; -#ifdef USE_NEON - // Handle 16 values at a time - for (; i <= num_output_values - 16; i += 16) { - float32x4_t acc[4]; - for (int k = 0; k < 4; k++) { - acc[k] = vld1q_f32(acc_buffer + i + 4 * k); - } - if (Ac == FusedActivationFunctionType::kRelu) { - for (int k = 0; k < 4; k++) { - acc[k] = vmaxq_f32(vdupq_n_f32(0.f), acc[k]); - } - } else if (Ac == FusedActivationFunctionType::kRelu6) { - for (int k = 0; k < 4; k++) { - acc[k] = vmaxq_f32(vdupq_n_f32(0.f), - vminq_f32(vdupq_n_f32(6.f), acc[k])); - } - } else if (Ac == FusedActivationFunctionType::kRelu1) { - for (int k = 0; k < 4; k++) { - acc[k] = vmaxq_f32(vdupq_n_f32(-1.f), - vminq_f32(vdupq_n_f32(1.f), acc[k])); - } - } - for (int k = 0; k < 4; k++) { - vst1q_f32(output_ptr + 4 * k, acc[k]); - } - output_ptr += 16; - } - // Handle 4 values at a time - for (; i <= num_output_values - 4; i += 4) { - float32x4_t acc = vld1q_f32(acc_buffer + i); - if (Ac == FusedActivationFunctionType::kRelu) { - acc = vmaxq_f32(vdupq_n_f32(0.f), acc); - } else if (Ac == FusedActivationFunctionType::kRelu6) { - acc = vmaxq_f32(vdupq_n_f32(0.f), vminq_f32(vdupq_n_f32(6.f), acc)); - } else if (Ac == FusedActivationFunctionType::kRelu1) { - acc = - vmaxq_f32(vdupq_n_f32(-1.f), vminq_f32(vdupq_n_f32(1.f), acc)); - } - vst1q_f32(output_ptr, acc); - output_ptr += 4; - } -#endif - // Handle leftover values, one by one. This is very slow. - for (; i < num_output_values; i++) { - float acc = acc_buffer[i]; - if (Ac == FusedActivationFunctionType::kRelu) { - acc = std::max(0.f, acc); - } else if (Ac == FusedActivationFunctionType::kRelu6) { - acc = std::max(0.f, std::min(6.f, acc)); - } else if (Ac == FusedActivationFunctionType::kRelu1) { - acc = std::max(-1.f, std::min(1.f, acc)); - } - *output_ptr++ = acc; - } - } - } - } -} - -} // namespace optimized_ops -} // namespace rt -} // namespace nnfw - - -#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h deleted file mode 100644 index 220f8793e..000000000 --- a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h +++ /dev/null @@ -1,1606 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (C) 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
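The quantized kernels that follow mirror the float ones above, but widen offset-corrected uint8 operands and accumulate into int32. A scalar sketch of the semantics every specialization shares (illustration only, not part of the deleted file):

#include <cstdint>

inline void QuantizedReferenceKernelSketch(
    int num_output_pixels, int input_depth, int depth_multiplier,
    const std::uint8_t* input_ptr, std::int16_t input_offset,
    int input_ptr_increment, const std::uint8_t* filter_ptr,
    std::int16_t filter_offset, std::int32_t* acc_buffer_ptr) {
  for (int outp = 0; outp < num_output_pixels; ++outp) {
    const std::uint8_t* filter = filter_ptr;
    for (int ic = 0; ic < input_depth; ++ic) {
      // Zero-point correction happens in 16/32 bits, never in uint8.
      const std::int32_t input_val = input_ptr[ic] + input_offset;
      for (int m = 0; m < depth_multiplier; ++m) {
        *acc_buffer_ptr++ += (*filter++ + filter_offset) * input_val;
      }
    }
    input_ptr += input_ptr_increment;
  }
}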
- */ - -#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ -#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ - -#include "fixedpoint.h" -#include "gemmlowp.h" -#include "../common.h" -#include "../types.h" - -namespace nnfw { -namespace rt { -namespace optimized_ops { - -// Implementation of quantized DepthwiseConv - -template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> -struct QuantizedDepthwiseConvKernel {}; - -#ifdef USE_NEON -template <> -struct QuantizedDepthwiseConvKernel<true, 8, 2> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - uint8x8x2_t filter_u8; - filter_u8.val[0] = vld1_u8(filter_ptr); - filter_u8.val[1] = vld1_u8(filter_ptr + 8); - int16x8_t filter[2]; - for (int i = 0; i < 2; i++) { - filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), - vdupq_n_s16(filter_offset)); - } - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer - int32x4x2_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); - acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); - } - // Load the inputs, add input_offset. - const uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += input_ptr_increment; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - // Duplicate the input values, 2-fold - const int16x8x2_t input_dup2 = vzipq_s16(input, input); - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), - vget_low_s16(input_dup2.val[i])); - acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), - vget_high_s16(input_dup2.val[i])); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); - vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); - } - acc_buffer_ptr += 16; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 8, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - const uint8x8_t filter_u8 = vld1_u8(filter_ptr); - const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); - const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); - - int outp = 0; - // Handle 2 output pixels at a time. - for (; outp <= num_output_pixels - 2; outp += 2) { - // Load the accumulators from acc_buffer. - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. - uint8x8_t input_u8[2]; - for (int i = 0; i < 2; i++) { - input_u8[i] = vld1_u8(input_ptr + 8 * i); - } - input_ptr += 16; - int16x8_t input[2]; - for (int i = 0; i < 2; i++) { - input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); - } - for (int i = 0; i < 2; i++) { - input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); - } - // Multiply-accumulate. 
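      // (Annotation, not original code: vmlal_s16(acc, b, c) is the widening
      //  multiply-accumulate acc[i] += (int32)b[i] * (int32)c[i] over four
      //  lanes, which is why the uint8 data is first widened to int16 and
      //  offset-corrected above.)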
- acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); - acc[1] = - vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); - acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); - acc[3] = - vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle 1 output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer. - int32x4_t acc[2]; - acc[0] = vld1q_s32(acc_buffer_ptr); - acc[1] = vld1q_s32(acc_buffer_ptr + 4); - - // Load the inputs, add input_offset. - const uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - // Multiply-accumulate. - acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); - acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); - // Store the accumulators back to acc_buffer - vst1q_s32(acc_buffer_ptr, acc[0]); - vst1q_s32(acc_buffer_ptr + 4, acc[1]); - acc_buffer_ptr += 8; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 4, 2> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - const uint8x8_t filter_u8 = vld1_u8(filter_ptr); - const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); - const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); - - int outp = 0; - // Handle 2 output pixels at a time. - for (; outp <= num_output_pixels - 2; outp += 2) { - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. - const uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - // Duplicate the input values, 2-fold - const int16x8x2_t input_dup2 = vzipq_s16(input, input); - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), - vget_low_s16(input_dup2.val[i])); - acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), - vget_high_s16(input_dup2.val[i])); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer - int32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. 
- uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); - input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); - input_ptr += 4; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - // Duplicate the input values, 2-fold - const int16x4x2_t input_dup2 = vzip_s16(input, input); - // Multiply-accumulate - acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); - acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 2, 8> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - int16x8_t filter[2]; - for (int i = 0; i < 2; i++) { - const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); - const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); - filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); - } - int outp = 0; - // Handle two output pixels at a time. - for (; outp <= num_output_pixels - 2; outp += 2) { - // Load the accumulators from acc_buffer. - int32x4_t acc[8]; - for (int i = 0; i < 8; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); - input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); - input_ptr += 4; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - // Multiply-accumulate. - acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); - acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); - acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); - acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); - acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); - acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); - acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); - acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); - // Store the accumulators back to acc_buffer. - for (int i = 0; i < 8; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 32; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer. - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_ptr += 2; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - - // Multiply-accumulate. 
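      // (Annotation, not original code: vmlal_lane_s16(acc, v, input, k)
      //  computes acc[i] += (int32)v[i] * (int32)input[k]; the lane index k
      //  selects which input channel is broadcast against each filter half.)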
- acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); - acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); - acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); - acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); - - // Store the accumulators back to acc_buffer. - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 2, 2> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - uint8x8_t filter_u8 = vdup_n_u8(0); - filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); - filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); - filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); - filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); - const int16x4_t filter_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); - const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); - - int outp = 0; - // Handle 4 output pixels at a time. - for (; outp <= num_output_pixels - 4; outp += 4) { - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - - // Load the inputs, add input_offset. - const uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - // Duplicate the input values, 2-fold - const int16x8x2_t input_dup2 = vzipq_s16(input, input); - // Multiply-accumulate - acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); - acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); - acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); - acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer - int32x4_t acc = vld1q_s32(acc_buffer_ptr); - - uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_ptr += 2; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - // Duplicate the input values, 2-fold - const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; - // Multiply-accumulate - acc = vmlal_s16(acc, filter, input_dup2); - // Store the accumulators back to acc_buffer - vst1q_s32(acc_buffer_ptr, acc); - acc_buffer_ptr += 4; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 2, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. 
- uint8x8_t filter_u8 = vdup_n_u8(0); - filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); - filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); - filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); - filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); - const int16x4_t filter_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); - const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); - - int outp = 0; - // Handle 8 output pixels at a time. - for (; outp <= num_output_pixels - 8; outp += 8) { - // Load the accumulators from acc_buffer. - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. - uint8x8_t input_u8[2]; - for (int i = 0; i < 2; i++) { - input_u8[i] = vld1_u8(input_ptr + 8 * i); - } - input_ptr += 16; - int16x8_t input[2]; - for (int i = 0; i < 2; i++) { - input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); - } - for (int i = 0; i < 2; i++) { - input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); - } - - // Multiply-accumulate. - acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); - acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); - acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); - acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); - // Store the accumulators back to acc_buffer. - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle 4 output pixels at a time. - for (; outp <= num_output_pixels - 4; outp += 4) { - // Load the accumulators from acc_buffer. - int32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. - const uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - - // Multiply-accumulate. - acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); - acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); - // Store the accumulators back to acc_buffer. - for (int i = 0; i < 2; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - // Handle 2 output pixels at a time. - for (; outp <= num_output_pixels - 2; outp += 2) { - // Load the accumulators from acc_buffer. - int32x4_t acc = vld1q_s32(acc_buffer_ptr); - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); - input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); - input_ptr += 4; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - - // Multiply-accumulate. - acc = vmlal_s16(acc, filter, input); - // Store the accumulators back to acc_buffer. - vst1q_s32(acc_buffer_ptr, acc); - acc_buffer_ptr += 4; - } - // Handle 1 output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer. - int32x2_t acc = vld1_s32(acc_buffer_ptr); - // Load the inputs, add input_offset. 
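      // (Annotation, not original code: the loop tails consume fewer than
      //  8 bytes, so the vector is assembled lane by lane with vset_lane_u8
      //  and widened with vmovl_u8 instead of using a full vld1_u8.)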
- uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_ptr += 2; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - - // Multiply-accumulate. - acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); - // Store the accumulators back to acc_buffer. - vst1_s32(acc_buffer_ptr, acc); - acc_buffer_ptr += 2; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 1, 2> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - uint8x8_t filter_u8 = vdup_n_u8(0); - filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); - filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); - filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); - filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); - const int16x4_t filter_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); - const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); - - int outp = 0; - // Handle 8 output pixels at a time. - for (; outp <= num_output_pixels - 8; outp += 8) { - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - - // Load the inputs, add input_offset. - const uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - // Duplicate the input values, 2-fold - const int16x8x2_t input_dup2 = vzipq_s16(input, input); - // Multiply-accumulate - acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); - acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); - acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); - acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer - int32x2_t acc = vld1_s32(acc_buffer_ptr); - - // Load the inputs, add input_offset. - const uint32 input = *input_ptr++ + input_offset; - - // Multiply-accumulate - acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); - // Store the accumulators back to acc_buffer - vst1_s32(acc_buffer_ptr, acc); - acc_buffer_ptr += 2; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 1, 4> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. 
- uint8x8_t filter_u8 = vdup_n_u8(0); - filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); - filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); - filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); - filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); - const int16x4_t filter_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); - const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); - - int outp = 0; - // Handle 8 output pixels at a time. - for (; outp <= num_output_pixels - 8; outp += 8) { - // Load the accumulators from acc_buffer - int32x4_t acc[8]; - for (int i = 0; i < 8; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - - // Multiply-accumulate - acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); - acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); - acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); - acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); - acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); - acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); - acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); - acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); - - // Store the accumulators back to acc_buffer - for (int i = 0; i < 8; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 32; - } - // Handle 4 output pixels at a time. - for (; outp <= num_output_pixels - 4; outp += 4) { - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); - input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); - input_ptr += 4; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - - // Multiply-accumulate - acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); - acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); - acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); - acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); - - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer - int32x4_t acc = vld1q_s32(acc_buffer_ptr); - - // Load the inputs, add input_offset. 
- const uint32 input = *input_ptr++ + input_offset; - - // Multiply-accumulate - acc = vmlal_n_s16(acc, filter, input); - // Store the accumulators back to acc_buffer - vst1q_s32(acc_buffer_ptr, acc); - acc_buffer_ptr += 4; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 4, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - uint8x8_t filter_u8 = vdup_n_u8(0); - filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); - filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); - filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); - filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); - const int16x4_t filter_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); - const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); - - int outp = 0; - // Handle 4 output pixels at a time. - for (; outp <= num_output_pixels - 4; outp += 4) { - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Load the inputs, add input_offset. - int16x8_t input[2]; - for (int i = 0; i < 2; i++) { - const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i); - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - } - input_ptr += 16; - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[2 * i + 0] = - vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); - acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer - int32x4_t acc; - acc = vld1q_s32(acc_buffer_ptr); - - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); - input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); - input_ptr += 4; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - // Multiply-accumulate - acc = vmlal_s16(acc, filter, input); - // Store the accumulators back to acc_buffer - vst1q_s32(acc_buffer_ptr, acc); - acc_buffer_ptr += 4; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<false, 4, 4> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - int16x8_t filter[2]; - for (int i = 0; i < 2; i++) { - const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); - const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); - filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); - } - - int outp = 0; - // Handle 2 output pixels at a time. 
- for (; outp <= num_output_pixels - 2; outp += 2) { - // Load the accumulators from acc_buffer - int32x4_t acc[8]; - for (int i = 0; i < 8; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vld1_u8(input_ptr); - input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - - // Multiply-accumulate - acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), - vget_low_s16(input), 0); - acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), - vget_low_s16(input), 1); - acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), - vget_low_s16(input), 2); - acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), - vget_low_s16(input), 3); - acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), - vget_high_s16(input), 0); - acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), - vget_high_s16(input), 1); - acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), - vget_high_s16(input), 2); - acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), - vget_high_s16(input), 3); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 8; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 32; - } - // Handle one output pixel at a time. - for (; outp < num_output_pixels; outp++) { - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - - // Load the inputs, add input_offset. - uint8x8_t input_u8 = vdup_n_u8(0); - input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); - input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); - input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); - input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); - input_ptr += 4; - const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); - const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); - - // Multiply-accumulate - acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); - acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); - acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); - acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<true, 0, 3> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // We will have to duplicate bytes in a NEON register, 3-fold. - // We will do that by register-level table-look-up using VTBL instructions. - // Here we prepare the registers containing the table-lookup indices. - static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2}, - {2, 3, 3, 3, 4, 4, 4, 5}, - {5, 5, 6, 6, 6, 7, 7, 7}}; - uint8x8_t dup3_indices[3]; - for (int i = 0; i < 3; i++) { - dup3_indices[i] = vld1_u8(dup3_indices_array[i]); - } - - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - const uint8* local_filter_ptr = filter_ptr; - const uint8* local_input_ptr = input_ptr; - int ic = 0; - // Handle 8 input channels at a time. 
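-      // In the loop below, vtbl1_u8(input_u8, dup3_indices[i]) selects, for
-      // each of its 8 output lanes, the input byte indexed by the matching
-      // lane of dup3_indices[i]. With the three index vectors above, the 8
-      // input bytes d0..d7 become the 24 bytes
-      //   d0 d0 d0 d1 d1 d1 d2 d2 | d2 d3 d3 d3 d4 d4 d4 d5 | d5 d5 d6 d6 d6 d7 d7 d7
-      // i.e. each input channel repeated depth_multiplier == 3 times, lining
-      // up with the interleaved layout of the filters and of acc_buffer.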
- for (; ic <= input_depth - 8; ic += 8) { - // Load the filters, add filter_offset. - int16x8_t filter[3]; - uint8x8x3_t filter_u8; - filter_u8.val[0] = vld1_u8(local_filter_ptr); - filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); - filter_u8.val[2] = vld1_u8(local_filter_ptr + 16); - local_filter_ptr += 24; - for (int i = 0; i < 3; i++) { - const int16x8_t filter_s16 = - vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); - filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); - } - // Load the inputs, duplicate 3-fold, add input_offset. - const uint8x8_t input_u8 = vld1_u8(local_input_ptr); - local_input_ptr += 8; - - uint8x8_t input_u8_dup3[3]; - for (int i = 0; i < 3; i++) { - input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]); - } - int16x8_t input_dup3[3]; - for (int i = 0; i < 3; i++) { - const int16x8_t input_s16_dup3 = - vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i])); - input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); - } - // Load the accumulators from acc_buffer - int32x4x3_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); - acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); - acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); - } - // Multiply-accumulate - for (int j = 0; j < 3; j++) { - acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), - vget_low_s16(filter[j])); - acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), - vget_high_s16(filter[j])); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); - vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); - vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); - } - acc_buffer_ptr += 24; - } - // Handle one input channel at a time. - for (; ic < input_depth; ic++) { - const int16 input_val = *local_input_ptr++ + input_offset; - for (int i = 0; i < 3; i++) { - const int16 filter_val = local_filter_ptr[i] + filter_offset; - *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; - } - local_filter_ptr += 3; - } - input_ptr += input_ptr_increment; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<true, 0, 2> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - const uint8* local_filter_ptr = filter_ptr; - const uint8* local_input_ptr = input_ptr; - int ic = 0; - // Handle 8 input channels at a time. - for (; ic <= input_depth - 8; ic += 8) { - // Load the filters, add filter_offset. - int16x8_t filter[2]; - uint8x8x2_t filter_u8; - filter_u8.val[0] = vld1_u8(local_filter_ptr); - filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); - local_filter_ptr += 16; - for (int i = 0; i < 2; i++) { - const int16x8_t filter_s16 = - vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); - filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); - } - // Load the inputs, add input_offset, duplicate 2-fold. - const uint8x8_t input_u8 = vld1_u8(local_input_ptr); - local_input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - const int16x8x2_t input_dup2 = vzipq_s16(input, input); - // Load the accumulators from acc_buffer. 
- int32x4x2_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); - acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); - } - // Multiply-accumulate. - for (int j = 0; j < 2; j++) { - acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), - vget_low_s16(input_dup2.val[j])); - acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), - vget_high_s16(input_dup2.val[j])); - } - // Store the accumulators back to acc_buffer. - for (int i = 0; i < 2; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); - vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); - } - acc_buffer_ptr += 16; - } - // Handle one input channel at a time. - for (; ic < input_depth; ic++) { - // Load the inputs. - const int16 input_val = *local_input_ptr++ + input_offset; - for (int i = 0; i < 2; i++) { - const int16 filter_val = local_filter_ptr[i] + filter_offset; - *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; - } - local_filter_ptr += 2; - } - input_ptr += input_ptr_increment; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<true, 0, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - const uint8* local_filter_ptr = filter_ptr; - const uint8* local_input_ptr = input_ptr; - int ic = 0; - // Handle 16 input channels at a time. - for (; ic <= input_depth - 16; ic += 16) { - // Load the filters, add filter_offset. - uint8x8_t filter_u8[2]; - for (int i = 0; i < 2; i++) { - filter_u8[i] = vld1_u8(local_filter_ptr + 8 * i); - } - local_filter_ptr += 16; - int16x8_t filter[2]; - for (int i = 0; i < 2; i++) { - filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); - } - for (int i = 0; i < 2; i++) { - filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); - } - // Load the inputs, add input_offset. - uint8x8_t input_u8[2]; - for (int i = 0; i < 2; i++) { - input_u8[i] = vld1_u8(local_input_ptr + 8 * i); - } - local_input_ptr += 16; - int16x8_t input[2]; - for (int i = 0; i < 2; i++) { - input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); - } - for (int i = 0; i < 2; i++) { - input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); - } - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), - vget_low_s16(filter[i])); - acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), - vget_high_s16(filter[i])); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - // Handle 8 input channels at a time. - for (; ic <= input_depth - 8; ic += 8) { - // Load the filters, add filter_offset. - const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr); - local_filter_ptr += 8; - const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); - const int16x8_t filter = - vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); - // Load the inputs, add input_offset. 
- const uint8x8_t input_u8 = vld1_u8(local_input_ptr); - local_input_ptr += 8; - const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); - const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); - // Load the accumulators from acc_buffer - int32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); - acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - // Handle one input channel at a time. - for (; ic < input_depth; ic++) { - const int16 input_val = *local_input_ptr++ + input_offset; - const int16 filter_val = *local_filter_ptr++ + filter_offset; - *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; - } - input_ptr += input_ptr_increment; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<true, 16, 1> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - uint8x8_t filter_u8[2]; - for (int i = 0; i < 2; i++) { - filter_u8[i] = vld1_u8(filter_ptr + 8 * i); - } - int16x8_t filter[2]; - for (int i = 0; i < 2; i++) { - filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); - } - for (int i = 0; i < 2; i++) { - filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); - } - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - // Load the inputs, add input_offset. - uint8x8_t input_u8[2]; - for (int i = 0; i < 2; i++) { - input_u8[i] = vld1_u8(input_ptr + 8 * i); - } - input_ptr += input_ptr_increment; - int16x8_t input[2]; - for (int i = 0; i < 2; i++) { - input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); - } - for (int i = 0; i < 2; i++) { - input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); - } - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), - vget_low_s16(filter[i])); - acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), - vget_high_s16(filter[i])); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<true, 1, 16> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - uint8x8_t filter_u8[2]; - for (int i = 0; i < 2; i++) { - filter_u8[i] = vld1_u8(filter_ptr + 8 * i); - } - int16x8_t filter[2]; - for (int i = 0; i < 2; i++) { - filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); - } - for (int i = 0; i < 2; i++) { - filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); - } - // Handle one output pixel at a time. 
- for (int outp = 0; outp < num_output_pixels; outp++) { - uint8 input_u8 = *input_ptr; - input_ptr += input_ptr_increment; - int16 input = static_cast<int16>(input_u8 + input_offset); - // Load the accumulators from acc_buffer - int32x4_t acc[4]; - for (int i = 0; i < 4; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - for (int i = 0; i < 2; i++) { - acc[2 * i + 0] = - vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); - acc[2 * i + 1] = - vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); - } - // Store the accumulators back to acc_buffer - for (int i = 0; i < 4; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 16; - } - } -}; - -template <> -struct QuantizedDepthwiseConvKernel<true, 1, 8> { - static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8* input_ptr, int16 input_offset, - int input_ptr_increment, const uint8* filter_ptr, - int16 filter_offset, int32* acc_buffer_ptr) { - // Load the filters, add filter_offset. - const uint8x8_t filter_u8 = vld1_u8(filter_ptr); - const int16x8_t filter = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); - // Handle one output pixel at a time. - for (int outp = 0; outp < num_output_pixels; outp++) { - uint8 input_u8 = *input_ptr; - input_ptr += input_ptr_increment; - int16 input = static_cast<int16>(input_u8 + input_offset); - // Load the accumulators from acc_buffer - int32x4_t acc[2]; - for (int i = 0; i < 2; i++) { - acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); - } - // Multiply-accumulate - acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input); - acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input); - // Store the accumulators back to acc_buffer - for (int i = 0; i < 2; i++) { - vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); - } - acc_buffer_ptr += 8; - } - } -}; -#endif - -// Accumulates the effect of one row of the filter, on a segment of one row -// of the output, accessing the corresponding one row of the input. -template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> -void QuantizedDepthwiseConvAccumRow( - int stride, int input_depth, int input_width, const uint8* input_data, - int16 input_offset, int pad_width, int depth_multiplier, int filter_width, - const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, - int out_x_buffer_end, int output_depth, int32* acc_buffer) { -#ifdef GEMMLOWP_PROFILING - gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); -#endif - // Sanity check parameters. This is important in particular to ensure - // that we keep the number of template instantiations minimal, so we don't - // increase binary size unnecessarily. - static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); - static_assert(kFixedInputDepth || kAllowStrided, ""); - DCHECK(stride == 1 || kAllowStrided); - if (kFixedInputDepth) { - DCHECK_EQ(input_depth, kFixedInputDepth); - } - if (kFixedDepthMultiplier) { - DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); - } - DCHECK_EQ(output_depth, input_depth * depth_multiplier); - const int input_ptr_increment = stride * input_depth; - const uint8* filter_base_ptr = filter_data; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - // For the current (filter_x, filter_y) point in the filter, - // compute the boundaries of the corresponding output row segment. 
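-    // Derivation of the bounds computed below: output column out_x reads
-    // input column in_x = out_x * stride - pad_width + filter_x, which is
-    // valid iff 0 <= in_x < input_width. Solving for out_x gives
-    //   out_x_start = ceil((pad_width - filter_x) / stride)
-    //   out_x_end   = ceil((pad_width + input_width - filter_x) / stride)
-    // which is what the (x + stride - 1) / stride expressions compute. The
-    // stride==2 and stride==4 branches are the same formulas with the divisor
-    // a compile-time constant, so the compiler can reduce them to shifts.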
-    int out_x_loop_start_unclamped = 0;
-    int out_x_loop_end_unclamped = 0;
-    if (kAllowStrided) {
-      if (stride == 2) {
-        out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
-        out_x_loop_end_unclamped =
-            (pad_width + input_width - filter_x + 1) / 2;
-      } else if (stride == 4) {
-        out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
-        out_x_loop_end_unclamped =
-            (pad_width + input_width - filter_x + 3) / 4;
-      } else {
-        out_x_loop_start_unclamped =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclamped =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
-      }
-    } else {
-      out_x_loop_start_unclamped = pad_width - filter_x;
-      out_x_loop_end_unclamped = pad_width + input_width - filter_x;
-    }
-    // The kernel will have to iterate on the segment of the output row
-    // that starts at out_x_loop_start and ends at out_x_loop_end.
-    const int out_x_loop_start =
-        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
-    const int out_x_loop_end =
-        std::min(out_x_buffer_end, out_x_loop_end_unclamped);
-
-    int32* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
-    const uint8* input_ptr = input_data + in_x_origin * input_depth;
-    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
-    QuantizedDepthwiseConvKernel<
-        kAllowStrided, kFixedInputDepth,
-        kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
-                                    depth_multiplier, input_ptr, input_offset,
-                                    input_ptr_increment, filter_base_ptr,
-                                    filter_offset, acc_buffer_ptr);
-    filter_base_ptr += output_depth;
-  }
-}
-
-// Generic fallback of QuantizedDepthwiseConvAccumRow: portable,
-// non-templatized.
-inline void QuantizedDepthwiseConvAccumRowGeneric(
-    int stride, int input_depth, int input_width, const uint8* input_data,
-    int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
-    const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
-    int out_x_buffer_end, int output_depth, int32* acc_buffer) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
-  const uint8* filter_base_ptr = filter_data;
-  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-    const int out_x_loop_start = std::max(
-        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
-    const int out_x_loop_end =
-        std::min(out_x_buffer_end,
-                 (pad_width + input_width - filter_x + stride - 1) / stride);
-
-    int32* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
-    const uint8* input_ptr = input_data + in_x_origin * input_depth;
-    const int input_ptr_increment = (stride - 1) * input_depth;
-    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
-      const uint8* filter_ptr = filter_base_ptr;
-      for (int ic = 0; ic < input_depth; ++ic) {
-        const int16 input_val = *input_ptr++ + input_offset;
-        for (int m = 0; m < depth_multiplier; m++) {
-          const int16 filter_val = *filter_ptr++ + filter_offset;
-          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
-        }
-      }
-      input_ptr += input_ptr_increment;
-    }
-    filter_base_ptr += output_depth;
-  }
-}
-
-// Initializes the accumulator buffer with bias values.
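-// Each output pixel must start from bias_data[c] in channel c, i.e. the
-// output_depth bias values replicated across num_output_pixels pixels.
-// Small depths (1, 2, 4, 8, 16) get NEON fast paths below that keep the
-// replicated bias in vector registers; any leftover pixels, and all other
-// depths, go through the per-pixel memcpy at the end.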
-inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, - const int32* bias_data, - int32* acc_buffer) { - int i = 0; -#ifdef USE_NEON - if (output_depth == 1) { - const int32x4_t b = vdupq_n_s32(bias_data[0]); - for (; i <= num_output_pixels - 16; i += 16) { - vst1q_s32(acc_buffer + i + 0, b); - vst1q_s32(acc_buffer + i + 4, b); - vst1q_s32(acc_buffer + i + 8, b); - vst1q_s32(acc_buffer + i + 12, b); - } - for (; i <= num_output_pixels - 4; i += 4) { - vst1q_s32(acc_buffer + i, b); - } - } else if (output_depth == 2) { - int32x4_t b = vdupq_n_s32(bias_data[0]); - b = vsetq_lane_s32(bias_data[1], b, 1); - b = vsetq_lane_s32(bias_data[1], b, 3); - for (; i <= num_output_pixels - 8; i += 8) { - vst1q_s32(acc_buffer + 2 * i + 0, b); - vst1q_s32(acc_buffer + 2 * i + 4, b); - vst1q_s32(acc_buffer + 2 * i + 8, b); - vst1q_s32(acc_buffer + 2 * i + 12, b); - } - for (; i <= num_output_pixels - 2; i += 2) { - vst1q_s32(acc_buffer + 2 * i, b); - } - } else if (output_depth == 4) { - const int32x4_t b = vld1q_s32(bias_data); - for (; i <= num_output_pixels - 4; i += 4) { - vst1q_s32(acc_buffer + 4 * i + 0, b); - vst1q_s32(acc_buffer + 4 * i + 4, b); - vst1q_s32(acc_buffer + 4 * i + 8, b); - vst1q_s32(acc_buffer + 4 * i + 12, b); - } - for (; i < num_output_pixels; i++) { - vst1q_s32(acc_buffer + 4 * i, b); - } - } else if (output_depth == 8) { - const int32x4_t b0 = vld1q_s32(bias_data); - const int32x4_t b1 = vld1q_s32(bias_data + 4); - for (; i <= num_output_pixels - 2; i += 2) { - vst1q_s32(acc_buffer + 8 * i + 0, b0); - vst1q_s32(acc_buffer + 8 * i + 4, b1); - vst1q_s32(acc_buffer + 8 * i + 8, b0); - vst1q_s32(acc_buffer + 8 * i + 12, b1); - } - for (; i < num_output_pixels; i++) { - vst1q_s32(acc_buffer + 8 * i + 0, b0); - vst1q_s32(acc_buffer + 8 * i + 4, b1); - } - } else if (output_depth == 16) { - const int32x4_t b0 = vld1q_s32(bias_data); - const int32x4_t b1 = vld1q_s32(bias_data + 4); - const int32x4_t b2 = vld1q_s32(bias_data + 8); - const int32x4_t b3 = vld1q_s32(bias_data + 12); - for (; i < num_output_pixels; i++) { - vst1q_s32(acc_buffer + 16 * i + 0, b0); - vst1q_s32(acc_buffer + 16 * i + 4, b1); - vst1q_s32(acc_buffer + 16 * i + 8, b2); - vst1q_s32(acc_buffer + 16 * i + 12, b3); - } - } -#endif - for (; i < num_output_pixels; i++) { - memcpy(acc_buffer + i * output_depth, bias_data, - sizeof(acc_buffer[0]) * output_depth); - } -} - -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, int depth_multiplier, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - DCHECK_EQ(output_activation_min, 0); - DCHECK_EQ(output_activation_max, 255); - } - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 
0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - DCHECK(output_depth == input_depth * depth_multiplier); - - static const int kAccBufferMaxSize = 1024; - int32 acc_buffer[kAccBufferMaxSize]; - DCHECK_GE(kAccBufferMaxSize, output_depth); - const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; - const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; - DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize); - DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); - DCHECK_GE(kOutputPixelsInAccBuffer, 1); - - // row_accum_func will point to the core accumulation function to be used - // for this DepthwiseConv op. - auto* row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; - - const int kMaxFixedDepthMultiplier = 16; - int fixed_depth_multiplier = 0; - if (depth_multiplier <= kMaxFixedDepthMultiplier) { - fixed_depth_multiplier = depth_multiplier; - } - // kMaxUnrolling is the max number of output values that we aim to handle - // in one unrolled iteration of the inner loop. For practical performance - // reasons, it is limited by the number of available registers. We could - // fine-tune it depending on the architecture, but that's not worth doing - // since this whole code is not very optimized to begin with. The - // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit - // vector registers. - const int kMaxUnrolling = 16; - int fixed_input_depth = 0; - if (fixed_depth_multiplier && - input_depth * fixed_depth_multiplier <= kMaxUnrolling) { - fixed_input_depth = input_depth; - } -#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ - FIXED_DEPTH_MULTIPLIER) \ - if ((stride_width == 1 || ALLOW_STRIDED) && \ - fixed_input_depth == FIXED_INPUT_DEPTH && \ - fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ - row_accum_func = \ - QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ - FIXED_DEPTH_MULTIPLIER>; \ - } - -#ifdef USE_NEON - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) - TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) - TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) -#endif // USE_NEON - -#undef TFMINI_USE_DEPTHWISECONV_KERNEL - - // Now that we have determined row_accum_func, we can start work. 
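-  // The loop nest below works on panels of up to kOutputPixelsInAccBuffer
-  // output pixels at a time: acc_buffer is seeded with the bias values,
-  // every in-bounds filter row for the current output row is accumulated
-  // into it via row_accum_func, and only then is the whole panel
-  // downquantized from int32 accumulators to uint8 outputs in one pass.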
-  uint8* output_ptr = output_data;
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      const int in_y_origin = (out_y * stride_height) - pad_height;
-      const int filter_y_start = std::max(0, -in_y_origin);
-      const int filter_y_end =
-          std::min(filter_height, input_height - in_y_origin);
-      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
-           out_x_buffer_start += kOutputPixelsInAccBuffer) {
-        const int out_x_buffer_end = std::min(
-            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
-        // We call a 'pixel' a group of activations that share all but the
-        // 'depth'/'channel' coordinate. num_output_pixels is the number of
-        // output pixels that we will accumulate in this loop iteration.
-        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
-        // Initialize our local accumulator with the bias values, so we don't
-        // have to add them later.
-        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
-                                   acc_buffer);
-        // Accumulation loop. Most of the time should be spent in here.
-        for (int filter_y = filter_y_start; filter_y < filter_y_end;
-             ++filter_y) {
-          const int in_y = in_y_origin + filter_y;
-          row_accum_func(
-              stride_width, input_depth, input_width,
-              input_data + in_y * input_dims.strides[2] +
-                  b * input_dims.strides[3],
-              input_offset, pad_width, depth_multiplier, filter_width,
-              filter_data + filter_y * filter_dims.strides[2], filter_offset,
-              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
-        }
-        // Finished accumulating int32 values. Now we need to convert them
-        // to the final 8-bit form and store them.
-        gemmlowp::ScopedProfilingLabel label("downquantize+store");
-        const int num_output_values = output_depth * num_output_pixels;
-        int i = 0;
-#ifdef USE_NEON
-        using gemmlowp::RoundingDivideByPOT;
-        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-        const int32x4_t output_activation_min_vec =
-            vdupq_n_s32(output_activation_min);
-        const int32x4_t output_activation_max_vec =
-            vdupq_n_s32(output_activation_max);
-        // Handle 16 values at once.
-        // This allows us to issue 4 mutually independent int32
-        // multiplications (vqrdmulh), which should alleviate most of their
-        // high latency.
-        for (; i <= num_output_values - 16; i += 16) {
-          int32x4_t acc[4];
-          for (int j = 0; j < 4; j++) {
-            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
-          }
-
-          // Fixed-point multiplication: vqrdmulh is the vector form of
-          // SaturatingRoundingDoublingHighMul.
-          for (int j = 0; j < 4; j++) {
-            acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
-          }
-          // Rounding right shift by output_shift.
-          for (int j = 0; j < 4; j++) {
-            acc[j] = RoundingDivideByPOT(acc[j], output_shift);
-          }
-          // Add the output offset.
-          for (int j = 0; j < 4; j++) {
-            acc[j] = vaddq_s32(acc[j], output_offset_vec);
-          }
-          // Apply the activation function.
-          if (Ac != FusedActivationFunctionType::kNone) {
-            for (int j = 0; j < 4; j++) {
-              acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
-            }
-            for (int j = 0; j < 4; j++) {
-              acc[j] = vminq_s32(acc[j], output_activation_max_vec);
-            }
-          }
-          // Saturating cast to uint8 and store to destination.
-          int16x4_t acc_s16[4];
-          for (int j = 0; j < 4; j++) {
-            acc_s16[j] = vqmovn_s32(acc[j]);
-          }
-          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
-          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
-          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
-          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
-          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
-          output_ptr += 16;
-        }
-        // Handle 8 values at once.
- // Not as good as 16 (now we're only issuing 2 mutually independent - // vqrdmulh instructions, so we're probably paying for their high - // latency). - for (; i <= num_output_values - 8; i += 8) { - int32x4_t acc0 = vld1q_s32(acc_buffer + i); - int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); - // Fixed-point multiplication. - acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); - acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); - // Rounding right shift. - acc0 = RoundingDivideByPOT(acc0, output_shift); - acc1 = RoundingDivideByPOT(acc1, output_shift); - // Add the output offset. - acc0 = vaddq_s32(acc0, output_offset_vec); - acc1 = vaddq_s32(acc1, output_offset_vec); - // Apply the activation function. - if (Ac != FusedActivationFunctionType::kNone) { - acc0 = vmaxq_s32(acc0, output_activation_min_vec); - acc1 = vmaxq_s32(acc1, output_activation_min_vec); - acc0 = vminq_s32(acc0, output_activation_max_vec); - acc1 = vminq_s32(acc1, output_activation_max_vec); - } - // Saturating cast to uint8 and store to destination. - const int16x4_t acc0_s16 = vqmovn_s32(acc0); - const int16x4_t acc1_s16 = vqmovn_s32(acc1); - const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16); - const uint8x8_t res_u8 = vqmovun_s16(res_s16); - vst1_u8(output_ptr, res_u8); - output_ptr += 8; - } - // Handle 4 values at once. Now we're paying the full price of the - // high latency of vqrdmulh. Also, storing only 4 bytes at the end - // (without any alignment) can only be done 1 byte at a time. - // Yet, that is still worth doing to minimize the amount of leftover - // that will have to go through the very slow scalar code. - for (; i <= num_output_values - 4; i += 4) { - int32x4_t acc = vld1q_s32(acc_buffer + i); - // Fixed-point multiplication. - acc = vqrdmulhq_n_s32(acc, output_multiplier); - // Rounding right shift. - acc = RoundingDivideByPOT(acc, output_shift); - // Add the output offset. - acc = vaddq_s32(acc, output_offset_vec); - // Apply the activation function. - if (Ac != FusedActivationFunctionType::kNone) { - acc = vmaxq_s32(acc, output_activation_min_vec); - acc = vminq_s32(acc, output_activation_max_vec); - } - // Saturating cast to uint8 and store to destination. - const int16x4_t acc_s16 = vqmovn_s32(acc); - const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); - const uint8x8_t res_u8 = vqmovun_s16(res_s16); - vst1_lane_u8(output_ptr + 0, res_u8, 0); - vst1_lane_u8(output_ptr + 1, res_u8, 1); - vst1_lane_u8(output_ptr + 2, res_u8, 2); - vst1_lane_u8(output_ptr + 3, res_u8, 3); - output_ptr += 4; - } -#endif // USE_NEON - - // Handle leftover values, one by one. This is very slow. - for (; i < num_output_values; i++) { - int32 acc = acc_buffer[i]; - acc = MultiplyByQuantizedMultiplierSmallerThanOne( - acc, output_multiplier, output_shift); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - *output_ptr++ = static_cast<uint8>(acc); - } - } - } - } -} - -} // namespace optimized_ops -} // namespace rt -} // namespace nnfw - -#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc deleted file mode 100644 index 7af122517..000000000 --- a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved
- * Copyright 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <string.h>
-
-#include "ActivationFunctor.h"
-#include "tensor_utils_impl.h"
-
-#ifdef USE_NEON
-
-#include <arm_neon.h>
-#define kFloatWeightsPerNeonLane 4
-
-namespace nnfw {
-namespace rt {
-namespace tensor_utils {
-
-void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
-                                             int m_cols, const float* vector,
-                                             int n_batch, float* result,
-                                             int result_stride) {
-  // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the last columns, and we need to process them
-  // sequentially. postamble_start is the index where this begins.
-  const int postamble_start =
-      m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
-
-  // The array used to cache the input vector: one float32x4_t register per
-  // kFloatWeightsPerNeonLane columns.
-  float32x4_t* vector_cache_float32x4 =
-      new float32x4_t[m_cols / kFloatWeightsPerNeonLane];
-
-  for (int b = 0; b < n_batch; b++) {
-    float* result_in_batch = result + b * m_rows;
-    const float* vector_in_batch = vector + b * m_cols;
-    const float* matrix_ptr = matrix;
-    for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
-      vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
-    }
-    for (int r = 0; r < m_rows; r++) {
-      float32x4_t acc_32x4 = vmovq_n_f32(0.0);
-      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
-        float32x4_t temp = vector_cache_float32x4[c >> 2];
-        // Load 4 float values from the current matrix row.
-        float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr + c);
-        // Vector multiply-accumulate 4 floats.
-        acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, temp);
-      }
-      // Add the 4 intermediate sum values to get the final dot-prod value for
-      // this row.
-      *result_in_batch +=
-          (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
-           vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
-      for (int c = postamble_start; c < m_cols; c++) {
-        *result_in_batch += matrix_ptr[c] * vector_in_batch[c];
-      }
-      matrix_ptr += m_cols;
-      result_in_batch += result_stride;
-    }
-  }
-  delete[] vector_cache_float32x4;
-}
-
-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                                  int v_size, float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the last elements, and we need to process them
-  // sequentially. postamble_start is the index where this begins.
-  const int postamble_start =
-      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
-    // Load 4 float values from vector1 and vector2.
-    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
-    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
-    // Vector multiply 4 floats.
-    float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
-    // Save to result array.
-    vst1q_f32(&result[v], mul_32x4);
-  }
-  for (int v = postamble_start; v < v_size; v++) {
-    result[v] = vector1[v] * vector2[v];
-  }
-}
-
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
-                                            const float* vector2, int v_size,
-                                            float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the last elements, and we need to process them
-  // sequentially. postamble_start is the index where this begins.
-  const int postamble_start =
-      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
-    // Load 4 float values from vector1 and vector2 and accumulator.
-    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
-    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
-    float32x4_t acc_32x4 = vld1q_f32(result + v);
-    // Vector multiply-accumulate 4 floats.
-    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
-    // Save to result array.
-    vst1q_f32(&result[v], acc_32x4);
-  }
-  for (int v = postamble_start; v < v_size; v++) {
-    result[v] += vector1[v] * vector2[v];
-  }
-}
-
-void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
-                                                 int v_size,
-                                                 const float* batch_vector,
-                                                 int n_batch, float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the last elements, and we need to process them
-  // sequentially. postamble_start is the index where this begins.
-  const int postamble_start =
-      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-
-  // The array used to cache the vector: one float32x4_t register per
-  // kFloatWeightsPerNeonLane elements.
-  float32x4_t* vector_cache_float32x4 =
-      new float32x4_t[v_size / kFloatWeightsPerNeonLane];
-  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
-    vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
-  }
-
-  float* result_ptr = result;
-  const float* batch_vector_ptr = batch_vector;
-  for (int b = 0; b < n_batch; b++) {
-    for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
-      // Load from memory to vectors.
-      float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
-      float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
-      // Multiply-accumulate.
-      result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
-                               vector_cache_float32x4[v >> 2]);
-      // Store.
-      vst1q_f32(result_ptr + v, result_f32x4);
-    }
-    // Postamble loop.
-    for (int v = postamble_start; v < v_size; v++) {
-      result_ptr[v] += vector[v] * batch_vector_ptr[v];
-    }
-    // Update the pointers.
-    result_ptr += v_size;
-    batch_vector_ptr += v_size;
-  }
-  delete[] vector_cache_float32x4;
-}
-
-void NeonSub1Vector(const float* vector, int v_size, float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the last elements, and we need to process them
-  // sequentially. postamble_start is the index where this begins.
-  const int postamble_start =
-      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-
-  float32x4_t one_f32x4 = vmovq_n_f32(1.0);
-  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
-    // Load 4 float values from the current pointers of the input column and
-    // subtract from 1.
-    float32x4_t v_f32x4 = vld1q_f32(vector + v);
-    float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
-    // Save to output.
-    vst1q_f32(result + v, result_f32x4);
-  }
-  for (int v = postamble_start; v < v_size; v++) {
-    result[v] = 1.0f - vector[v];
-  }
-}
-
-void NeonClipVector(const float* vector, int v_size, float abs_limit,
-                    float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the last elements, and we need to process them
-  // sequentially. postamble_start is the index where this begins.
-  const int postamble_start =
-      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-
-  // Replicate abs_limit and -abs_limit in two vectors.
-  const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
-  const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);
-
-  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
-    // Load from memory to vector.
-    float32x4_t v_f32x4 = vld1q_f32(vector + v);
-    // Clip between abs_limit and -abs_limit.
-    float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
-    result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
-    // Save to output.
-    vst1q_f32(result + v, result_f32x4);
-  }
-  // Postamble loop.
-  for (int v = postamble_start; v < v_size; v++) {
-    result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
-    result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
-  }
-}
-
-} // namespace tensor_utils
-} // namespace rt
-} // namespace nnfw
-
-#endif // USE_NEON
diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h
deleted file mode 100644
index 2a6f31572..000000000
--- a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef __NNFW_RT_NEON_TENSOR_UTILS_H__ -#define __NNFW_RT_NEON_TENSOR_UTILS_H__ - -#include "ActivationFunctor.h" -#include "cpu_check.h" -#include "tensor_utils_impl.h" - -namespace nnfw { -namespace rt { -namespace tensor_utils { - -void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, - int m_cols, const float* vector, - int n_batch, float* result, - int result_stride) { - NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, - vector, n_batch, result, result_stride); -} - -void VectorVectorCwiseProduct(const float* vector1, const float* vector2, - int v_size, float* result) { - NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result); -} - -void VectorVectorCwiseProductAccumulate(const float* vector1, - const float* vector2, int v_size, - float* result) { - NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size, - result); -} - -void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, - const float* batch_vector, - int n_batch, float* result) { - NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size, - batch_vector, n_batch, result); -} - -float VectorVectorDotProduct(const float* vector1, const float* vector2, - int v_size) { - return PortableVectorVectorDotProduct(vector1, vector2, v_size); -} - -void BatchVectorBatchVectorDotProduct(const float* vector1, - const float* vector2, int v_size, - int n_batch, float* result, - int result_stride) { - PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, - result, result_stride); -} - -void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, - float* batch_vector) { - PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); -} - -void ApplySigmoidToVector(const float* vector, int v_size, float* result) { - PortableApplySigmoidToVector(vector, v_size, result); -} - -void ApplyActivationToVector(const float* vector, int v_size, - ActivationFn activation, float* result) { - PortableApplyActivationToVector(vector, v_size, activation, result); -} - -void CopyVector(const float* vector, int v_size, float* result) { - PortableCopyVector(vector, v_size, result); -} - -void Sub1Vector(const float* vector, int v_size, float* result) { - NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result); -} - -void ZeroVector(float* vector, int v_size) { - PortableZeroVector(vector, v_size); -} - -float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); } - -void ClipVector(const float* vector, int v_size, float abs_limit, - float* result) { - NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result); -} - -// TODO(ghodrat): Implement Neon version. -void VectorShiftLeft(float* vector, int v_size, float shift_value) { - PortableVectorShiftLeft(vector, v_size, shift_value); -} - -// TODO(ghodrat): Implement Neon version. 
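-// (A NEON version would presumably follow the pattern used in
-// neon_tensor_utils.cc: assuming input_stride == 1, accumulate float32x4_t
-// partial sums over the reduction axis with vld1q_f32/vaddq_f32, combine the
-// four lanes with vgetq_lane_f32, and handle a tail that is not a multiple
-// of kFloatWeightsPerNeonLane in scalar code. Until then, this delegates to
-// the portable version.)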
-void ReductionSumVector(const float* input_vector, int input_stride, - float* output_vector, int output_size, - int reduction_size) { - PortableReductionSumVector(input_vector, input_stride, output_vector, - output_size, reduction_size); -} - -} // namespace tensor_utils -} // namespace rt -} // namespace nnfw - -#endif // __NNFW_RT_NEON_TENSOR_UTILS_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/optimized_ops.h b/runtimes/nn/common/operations/internal/optimized/optimized_ops.h deleted file mode 100644 index 33862a0d7..000000000 --- a/runtimes/nn/common/operations/internal/optimized/optimized_ops.h +++ /dev/null @@ -1,2717 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (C) 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_RT_OPTIMIZED_OPS_H__ -#define __NNFW_RT_OPTIMIZED_OPS_H__ - -#include <assert.h> -#include <stdint.h> -#include <sys/types.h> -#include <algorithm> -#include <cmath> -#include <limits> -#include <memory> -#include <tuple> -#include <type_traits> - -#include "Eigen/Core" -#include "fixedpoint.h" -#include "gemmlowp.h" -#include "../common.h" -#include "../types.h" - -namespace nnfw { -namespace rt { -namespace optimized_ops { - -// Make a local VectorMap typedef allowing to map a float array -// as a Eigen vector expression. The std::conditional here is to -// construct the suitable Eigen type for the constness of the -// data. Indeed, for const data, we need to produce -// Eigen::Map<const Eigen::Matrix<float, ...>> -// and not the more straightforward -// Eigen::Map<Eigen::Matrix<const float, ...>> -template <typename Scalar> -using VectorMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, - Eigen::Dynamic, 1>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; - -template <typename Scalar, int N> -VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) { - const int size = RequiredBufferSizeForDims(dims); - return VectorMap<Scalar>(data, size, 1); -} - -// Make a local VectorMap typedef allowing to map a float array -// as a Eigen matrix expression. The same explanation as for VectorMap -// above also applies here. 
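-// For example, with Scalar = const float and a Dims<4> whose sizes are
-// {C, W, H, N}, MapAsMatrixWithFirstDimAsRows below yields an
-// Eigen::Map<const Eigen::MatrixXf> of shape C x (W*H*N) over the same
-// storage, without copying.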
-template <typename Scalar> -using MatrixMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, - Eigen::Dynamic, Eigen::Dynamic>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; - -template <typename Scalar, int N> -MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, - const Dims<N>& dims) { - const int rows = dims.sizes[0]; - int cols = 1; - for (int d = 1; d < N; d++) { - cols *= dims.sizes[d]; - } - return MatrixMap<Scalar>(data, rows, cols); -} - -template <typename Scalar, int N> -MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, - const Dims<N>& dims) { - const int cols = dims.sizes[N - 1]; - int rows = 1; - for (int d = 0; d < N - 1; d++) { - rows *= dims.sizes[d]; - } - return MatrixMap<Scalar>(data, rows, cols); -} - -template <typename Scalar> -using ArrayMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type, - Eigen::Dynamic, Eigen::Dynamic>>, - Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; - -template <typename Scalar, int N> -ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data, - const Dims<N>& dims) { - const int rows = dims.sizes[0]; - int cols = 1; - for (int d = 1; d < N; d++) { - cols *= dims.sizes[d]; - } - return ArrayMap<Scalar>(data, rows, cols); -} - -// TODO(b/62193649): this function is only needed as long -// as we have the --variable_batch hack. -template <typename Scalar, int N> -MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data, - const Dims<N>& dims, - int rows) { - int cols = 1; - bool matched_rows = false; - for (int d = 0; d < N; d++) { - cols *= dims.sizes[d]; - if (cols == rows) { - matched_rows = true; - cols = 1; - } - } - DCHECK(matched_rows); - return MatrixMap<Scalar>(data, rows, cols); -} - -// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE -// BROADCASTING. -// -// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional -// rectangular array of numbers. -// -// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. -// However, as Dims<N> is to be deprecated, this class exists as an adaptor -// to enable simple unoptimized implementations of element-wise broadcasting -// operations. -template<int N> -struct NdArrayDesc { - // The "extent" of each dimension. Indices along dimension d must be in the - // half-open interval [0, extents[d]). - int extents[N]; - - // The number of *elements* (not bytes) between consecutive indices of each - // dimension. - int strides[N]; -}; - -// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING -// ELEMENT-WISE BROADCASTING. -// -// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. -inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, - int i3) { - DCHECK(i0 >= 0 && i0 < desc.extents[0]); - DCHECK(i1 >= 0 && i1 < desc.extents[1]); - DCHECK(i2 >= 0 && i2 < desc.extents[2]); - DCHECK(i3 >= 0 && i3 < desc.extents[3]); - return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + - i3 * desc.strides[3]; -} - -// Given the dimensions of the operands for an element-wise binary broadcast, -// adjusts them so that they can be directly iterated over with simple loops. -// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and -// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. 
-// -// This function assumes that the two input shapes are compatible up to -// broadcasting and the shorter one has already been prepended with 1s to be the -// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), -// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that -// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be -// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). -// -// When two shapes are compatible up to broadcasting, for each dimension d, -// the input extents are either equal, or one of them is 1. -// -// This function performs the following for each dimension d: -// - If the extents are equal, then do nothing since the loop that walks over -// both of the input arrays is correct. -// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 -// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows -// array0 to be referenced *at any index* in dimension d and still access the -// same slice. -template <int N> -inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, - const Dims<N>& input1_dims, - NdArrayDesc<N>* desc0_out, - NdArrayDesc<N>* desc1_out) { - DCHECK(desc0_out != nullptr); - DCHECK(desc1_out != nullptr); - - // Copy dims to desc. - for (int i = 0; i < N; ++i) { - desc0_out->extents[i] = input0_dims.sizes[i]; - desc0_out->strides[i] = input0_dims.strides[i]; - desc1_out->extents[i] = input1_dims.sizes[i]; - desc1_out->strides[i] = input1_dims.strides[i]; - } - - // Walk over each dimension. If the extents are equal do nothing. - // Otherwise, set the desc with extent 1 to have extent equal to the other and - // stride 0. - for (int i = 0; i < N; ++i) { - const int extent0 = ArraySize(input0_dims, i); - const int extent1 = ArraySize(input1_dims, i); - if (extent0 != extent1) { - if (extent0 == 1) { - desc0_out->strides[i] = 0; - desc0_out->extents[i] = extent1; - } else { - DCHECK_EQ(extent1, 1); - desc1_out->strides[i] = 0; - desc1_out->extents[i] = extent0; - } - } - } -} - -#ifdef USE_NEON -template <FusedActivationFunctionType Ac> -void AddBiasAndEvalActivationFunction(const float* bias_data, - const Dims<4>& bias_dims, - float* array_data, - const Dims<4>& array_dims) { - gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); - const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3]; - const int array_size = array_dims.sizes[3] * array_dims.strides[3]; - DCHECK_EQ((array_size % bias_size), 0); - float* array_ptr = array_data; - float* array_end_ptr = array_ptr + array_size; - const auto zero = vdupq_n_f32(0); - const auto six = vdupq_n_f32(6); - const auto neg_one = vdupq_n_f32(-1); - const auto one = vdupq_n_f32(1); - for (; array_ptr != array_end_ptr; array_ptr += bias_size) { - int i = 0; - for (; i <= bias_size - 16; i += 16) { - auto b0 = vld1q_f32(bias_data + i); - auto b1 = vld1q_f32(bias_data + i + 4); - auto b2 = vld1q_f32(bias_data + i + 8); - auto b3 = vld1q_f32(bias_data + i + 12); - auto a0 = vld1q_f32(array_ptr + i); - auto a1 = vld1q_f32(array_ptr + i + 4); - auto a2 = vld1q_f32(array_ptr + i + 8); - auto a3 = vld1q_f32(array_ptr + i + 12); - auto x0 = vaddq_f32(a0, b0); - auto x1 = vaddq_f32(a1, b1); - auto x2 = vaddq_f32(a2, b2); - auto x3 = vaddq_f32(a3, b3); - if (Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6) { - x0 = vmaxq_f32(zero, x0); - x1 = vmaxq_f32(zero, x1); - x2 = vmaxq_f32(zero, x2); - x3 = vmaxq_f32(zero, x3); - if (Ac == 
FusedActivationFunctionType::kRelu6) { - x0 = vminq_f32(six, x0); - x1 = vminq_f32(six, x1); - x2 = vminq_f32(six, x2); - x3 = vminq_f32(six, x3); - } - } else if (Ac == FusedActivationFunctionType::kRelu1) { - x0 = vmaxq_f32(neg_one, x0); - x1 = vmaxq_f32(neg_one, x1); - x2 = vmaxq_f32(neg_one, x2); - x3 = vmaxq_f32(neg_one, x3); - x0 = vminq_f32(one, x0); - x1 = vminq_f32(one, x1); - x2 = vminq_f32(one, x2); - x3 = vminq_f32(one, x3); - } - vst1q_f32(array_ptr + i, x0); - vst1q_f32(array_ptr + i + 4, x1); - vst1q_f32(array_ptr + i + 8, x2); - vst1q_f32(array_ptr + i + 12, x3); - } - for (; i <= bias_size - 4; i += 4) { - auto b = vld1q_f32(bias_data + i); - auto a = vld1q_f32(array_ptr + i); - auto x = vaddq_f32(a, b); - if (Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6) { - x = vmaxq_f32(zero, x); - if (Ac == FusedActivationFunctionType::kRelu6) { - x = vminq_f32(six, x); - } - } else if (Ac == FusedActivationFunctionType::kRelu1) { - x = vmaxq_f32(neg_one, x); - x = vminq_f32(one, x); - } - vst1q_f32(array_ptr + i, x); - } - for (; i < bias_size; i++) { - array_ptr[i] = ActivationFunction<Ac>(array_ptr[i] + bias_data[i]); - } - } -} -#else // not NEON -template <FusedActivationFunctionType Ac> -void AddBiasAndEvalActivationFunction(const float* bias_data, - const Dims<4>& bias_dims, - float* array_data, - const Dims<4>& array_dims) { - gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); - const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3]; - const int array_size = array_dims.sizes[3] * array_dims.strides[3]; - DCHECK_EQ((array_size % bias_size), 0); - for (int array_offset = 0; array_offset < array_size; - array_offset += bias_size) { - for (int i = 0; i < bias_size; i++) { - array_data[array_offset + i] = - ActivationFunction<Ac>(array_data[array_offset + i] + bias_data[i]); - } - } -} -#endif - -template <typename Lhs, typename Rhs, typename Result> -void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs, - Eigen::MatrixBase<Result>* result) { - if (rhs.cols() == 1) { - gemmlowp::ScopedProfilingLabel label("GEMV"); - result->col(0).noalias() = lhs * rhs.col(0); - } else { - gemmlowp::ScopedProfilingLabel label("GEMM"); - result->noalias() = lhs * rhs; - } -} - -template <FusedActivationFunctionType Ac> -void FullyConnected(const float* input_data, const Dims<4>& input_dims, - const float* weights_data, const Dims<4>& weights_dims, - const float* bias_data, const Dims<4>& bias_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("FullyConnected"); - // TODO(b/62193649): this convoluted shape computation (determining - // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows) - // is because the current --variable_batch hack consists in overwriting the - // 3rd dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. 
- // When that is fixed, this should become: - // const auto input_matrix_map = - // MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - const int input_rows = ArraySize(weights_dims, 0); - const auto input_matrix_map = - MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows); - const auto filter_matrix_map = - MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims); - auto output_matrix_map = - MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - - Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); - AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, - output_dims); -} - -inline void preload_l1_stream(const uint8* ptr) { -#ifdef GEMMLOWP_ARM_64 - asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :); -#else - gemmlowp::Prefetch(ptr); -#endif -} - -#ifdef USE_NEON -template <FusedActivationFunctionType Ac> -void FullyConnectedAsGEMV(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - DCHECK(IsPackedWithoutStrides(input_dims)); - DCHECK(IsPackedWithoutStrides(filter_dims)); - DCHECK(IsPackedWithoutStrides(bias_dims)); - DCHECK(IsPackedWithoutStrides(output_dims)); - DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * - ArraySize(output_dims, 3), - 1); - const int input_size = input_dims.strides[3]; - const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); - static constexpr int kPeel = 4; - for (int k = 0; k < input_size; k += 64) { - preload_l1_stream(input_data + k); - } - for (int k = 0; k < kPeel * input_size; k += 64) { - preload_l1_stream(filter_data + k); - } - DCHECK(!(output_size % kPeel)); - const int32* bias_ptr = bias_data; - uint8* output_ptr = output_data; - for (int out = 0; out < output_size; out += kPeel) { - int32x4_t acc[kPeel]; - for (int k = 0; k < kPeel; k++) { - acc[k] = vdupq_n_s32(0); - } - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset); - int in = 0; - for (; in <= input_size - 16; in += 16) { - const uint8x16_t input_val_u8 = vld1q_u8(input_data + in); - uint8x16_t filter_val_u8[kPeel]; - for (int k = 0; k < kPeel; k++) { - const uint8* filter_ptr = filter_data + in + (out + k) * input_size; - filter_val_u8[k] = vld1q_u8(filter_ptr); - preload_l1_stream(filter_ptr + 64); - } - int16x8_t input_val[2]; - const uint8x8_t low = vget_low_u8(input_val_u8); - const uint8x8_t high = vget_high_u8(input_val_u8); - input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low)); - input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high)); - input_val[0] = vaddq_s16(input_val[0], input_offset_vec); - input_val[1] = vaddq_s16(input_val[1], input_offset_vec); - int16x8_t filter_val[kPeel][2]; - for (int k = 0; k < kPeel; k++) { - const uint8x8_t low = vget_low_u8(filter_val_u8[k]); - const uint8x8_t high = vget_high_u8(filter_val_u8[k]); - filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low)); - 
filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high)); - filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec); - filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec); - } - for (int p = 0; p < 2; p++) { - for (int k = 0; k < kPeel; k++) { - acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]), - vget_low_s16(input_val[p])); - } - for (int k = 0; k < kPeel; k++) { - acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]), - vget_high_s16(input_val[p])); - } - } - } - for (; in <= input_size - 8; in += 8) { - const uint8x8_t input_val_u8 = vld1_u8(input_data + in); - uint8x8_t filter_val_u8[kPeel]; - for (int k = 0; k < kPeel; k++) { - const uint8* filter_ptr = filter_data + in + (out + k) * input_size; - filter_val_u8[k] = vld1_u8(filter_ptr); - } - int16x8_t input_val; - input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8)); - input_val = vaddq_s16(input_val, input_offset_vec); - int16x8_t filter_val[kPeel]; - for (int k = 0; k < kPeel; k++) { - filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k])); - filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec); - } - for (int k = 0; k < kPeel; k++) { - acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]), - vget_low_s16(input_val)); - } - for (int k = 0; k < kPeel; k++) { - acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]), - vget_high_s16(input_val)); - } - } - if (in < input_size) { - int32 buf[4 * kPeel]; - for (int k = 0; k < 4; k++) { - vst1q_s32(buf + 4 * k, acc[k]); - } - for (; in < input_size; in++) { - int lane = (in + 8 - input_size) % 4; - const int32 input_val = input_data[in] + input_offset; - for (int k = 0; k < kPeel; k++) { - int32 filter_val = - filter_data[in + (out + k) * input_size] + filter_offset; - buf[lane + 4 * k] += filter_val * input_val; - } - } - for (int k = 0; k < 4; k++) { - acc[k] = vld1q_s32(buf + 4 * k); - } - } - - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc[kPeel]; - for (int k = 0; k < kPeel; k++) { - pairwise_reduced_acc[k] = - vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k])); - } - static_assert(kPeel == 4, "the code below currently assumes kPeel = 4"); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. - int32x4_t bias_vec = vld1q_s32(bias_ptr); - bias_ptr += 4; - reduced = vaddq_s32(reduced, bias_vec); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, output_shift); - // Add the output offset. - const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); - reduced = vaddq_s32(reduced, output_offset_vec); - // Narrow values down to 16 bit signed. - const int16x4_t res16 = vqmovn_s32(reduced); - // Narrow values down to 8 bit unsigned, saturating. - uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16)); - if (Ac != FusedActivationFunctionType::kNone) { - // Apply the clamping from the activation function - res8 = vmax_u8(res8, vdup_n_u8(output_activation_min)); - res8 = vmin_u8(res8, vdup_n_u8(output_activation_max)); - } - // Store results to destination. Assumes 32bit alignment. 
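// Editor's aside (illustrative sketch, not part of the original file): the
// NEON tail above -- vqrdmulhq_n_s32, RoundingDivideByPOT, the offset add,
// then a saturating narrow to uint8 -- is the vector form of the scalar
// helper MultiplyByQuantizedMultiplierSmallerThanOne declared in common.h.
// A simplified standalone version of that requantization step might look as
// follows; the name RequantizeScalar is hypothetical, and the rounding here
// ignores the negative-product nudge and INT32_MIN corner cases that the
// gemmlowp helpers handle.

#include <algorithm>
#include <cstdint>

inline uint8_t RequantizeScalar(int32_t acc, int32_t bias,
                                int32_t output_multiplier,  // Q31 in [0.5, 1)
                                int output_shift, int32_t output_offset) {
  acc += bias;
  // Saturating rounding doubling high multiply (simplified).
  const int64_t prod = static_cast<int64_t>(acc) * output_multiplier;
  const int32_t high = static_cast<int32_t>((prod + (1ll << 30)) >> 31);
  // Rounding right shift by output_shift.
  const int32_t shifted =
      output_shift > 0 ? ((high + (1 << (output_shift - 1))) >> output_shift)
                       : high;
  // Re-center on the output zero point and clamp to the uint8 range.
  return static_cast<uint8_t>(
      std::min(255, std::max(0, shifted + output_offset)));
}

// The vst1_lane_u32 below then writes all four saturated uint8 results in one
// 32-bit store, which is why the destination is assumed to be 4-byte aligned.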
- vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr), - vreinterpret_u32_u8(res8), 0); - output_ptr += kPeel; - } -} -#endif // USE_NEON - -template <FusedActivationFunctionType Ac> -struct GemmlowpOutputPipeline { - typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> - ColVectorMap; - typedef std::tuple< - gemmlowp::OutputStageBiasAddition<ColVectorMap>, - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, - gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; - static Pipeline Make(const int32* bias_data, int output_rows, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max) { - ColVectorMap bias_vector(bias_data, output_rows); - gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; - bias_addition_stage.bias_vector = bias_vector; - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint - quantize_down_stage; - quantize_down_stage.result_offset_after_shift = output_offset; - quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; - quantize_down_stage.result_shift = output_shift; - gemmlowp::OutputStageClamp clamp_stage; - clamp_stage.min = output_activation_min; - clamp_stage.max = output_activation_max; - gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; - return std::make_tuple(bias_addition_stage, quantize_down_stage, - clamp_stage, saturating_cast_stage); - } -}; - -template <> -struct GemmlowpOutputPipeline<FusedActivationFunctionType::kNone> { - typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> - ColVectorMap; - typedef std::tuple< - gemmlowp::OutputStageBiasAddition<ColVectorMap>, - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, - gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; - static Pipeline Make(const int32* bias_data, int output_rows, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max) { - DCHECK_EQ(output_activation_min, 0); - DCHECK_EQ(output_activation_max, 255); - ColVectorMap bias_vector(bias_data, output_rows); - gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; - bias_addition_stage.bias_vector = bias_vector; - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint - quantize_down_stage; - quantize_down_stage.result_offset_after_shift = output_offset; - quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; - quantize_down_stage.result_shift = output_shift; - gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; - return std::make_tuple(bias_addition_stage, quantize_down_stage, - saturating_cast_stage); - } -}; - -template <FusedActivationFunctionType Ac> -void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - // TODO: This really should be: - // const int batches = 
ArraySize(output_dims, 1); - // but the current --variable_batch hack consists in overwriting the 3rd - // dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. - const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * - ArraySize(output_dims, 3); -#ifdef USE_NEON - const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); - if (batches == 1 && !(output_size % 4)) { - return FullyConnectedAsGEMV<Ac>( - input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_data, - output_dims); - } -#endif // USE_NEON - const int filter_rows = filter_dims.sizes[1]; - const int filter_cols = filter_dims.sizes[0]; - DCHECK_EQ(filter_dims.sizes[2], 1); - DCHECK_EQ(filter_dims.sizes[3], 1); - const int output_rows = output_dims.sizes[0]; - DCHECK_EQ(output_rows, filter_rows); - DCHECK_EQ(bias_dims.sizes[0], output_rows); - DCHECK_EQ(bias_dims.sizes[1], 1); - DCHECK_EQ(bias_dims.sizes[2], 1); - DCHECK_EQ(bias_dims.sizes[3], 1); - - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, output_rows, filter_cols, filter_cols); - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( - input_data, filter_cols, batches, filter_cols); - gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( - output_data, output_rows, batches, output_rows); - const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( - bias_data, output_rows, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max); - gemmlowp::GemmWithOutputPipeline<uint8, uint8, - gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, - input_offset, output_pipeline); -} - -template <typename T> -inline void ExtractPatchIntoBufferColumn( - const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth, - int stride_width, int stride_height, int pad_width, int pad_height, - int in_width, int in_height, int in_depth, int single_buffer_length, - int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) { - gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn"); - // This chunk of code reshapes all the inputs corresponding to - // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). - const int kwidth_times_indepth = kwidth * in_depth; - const int inwidth_times_indepth = in_width * in_depth; - const int ih_ungated_start = h * stride_height - pad_height; - const int ih_ungated_end = (ih_ungated_start + kheight); - const int ih_end = std::min(ih_ungated_end, in_height); - const int iw_ungated_start = w * stride_width - pad_width; - const int iw_ungated_end = (iw_ungated_start + kwidth); - const int iw_end = std::min(iw_ungated_end, in_width); - // If the patch is off the edge of the input image, skip writing those rows - // and columns from the patch into the output array. 
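// Editor's aside (illustrative sketch, not part of the original file): the
// offsets computed just below clamp the kernel window against the image
// border. For example, with in_height = 8, kheight = 3, stride_height = 2 and
// pad_height = 1, output row h = 0 gives ih_ungated_start = -1, so one top
// row of the patch is zero padding (h_offset = 1) and copying starts at input
// row 0. The same arithmetic as a standalone helper (the names PatchWindow
// and ClampPatch are hypothetical):

#include <algorithm>

struct PatchWindow {
  int start;   // first valid input index
  int end;     // one past the last valid input index
  int pad_lo;  // zero-padded positions before 'start'
  int pad_hi;  // zero-padded positions after 'end'
};

inline PatchWindow ClampPatch(int out_idx, int stride, int pad, int ksize,
                              int in_size) {
  const int ungated_start = out_idx * stride - pad;
  const int ungated_end = ungated_start + ksize;
  PatchWindow w;
  w.start = std::max(0, ungated_start);
  w.end = std::min(ungated_end, in_size);
  w.pad_lo = w.start - ungated_start;  // matches h_offset / w_offset below
  w.pad_hi = ungated_end - w.end;      // matches bottom / right padding below
  return w;
}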
- const int h_offset = std::max(0, -ih_ungated_start); - const int w_offset = std::max(0, -iw_ungated_start); - const int ih_start = std::max(0, ih_ungated_start); - const int iw_start = std::max(0, iw_ungated_start); - const int single_row_num = - std::min(kwidth - w_offset, in_width - iw_start) * in_depth; - const int output_row_offset = (buffer_id * single_buffer_length); - int out_offset = - output_row_offset + (h_offset * kwidth + w_offset) * in_depth; - int in_offset = Offset(input_dims, 0, iw_start, ih_start, b); - - // Express all of the calculations as padding around the input patch. - const int top_padding = h_offset; - const int bottom_padding = (ih_ungated_end - ih_end); - const int left_padding = w_offset; - const int right_padding = (iw_ungated_end - iw_end); - assert(single_row_num == - ((kwidth - (left_padding + right_padding)) * in_depth)); - - // Write out zeroes to the elements representing the top rows of the input - // patch that are off the edge of the input image. - if (top_padding > 0) { - const int top_row_elements = (top_padding * kwidth * in_depth); - memset(conv_buffer_data + output_row_offset, byte_zero, - (top_row_elements * sizeof(T))); - } - - // If the patch is on the interior of the input image horizontally, just copy - // over the rows sequentially, otherwise add zero padding at the start or end. - if ((left_padding == 0) && (right_padding == 0)) { - for (int ih = ih_start; ih < ih_end; ++ih) { - memcpy(conv_buffer_data + out_offset, in_data + in_offset, - single_row_num * sizeof(T)); - out_offset += kwidth_times_indepth; - in_offset += inwidth_times_indepth; - } - } else { - for (int ih = ih_start; ih < ih_end; ++ih) { - if (left_padding > 0) { - const int left_start = (out_offset - (left_padding * in_depth)); - memset(conv_buffer_data + left_start, byte_zero, - (left_padding * in_depth * sizeof(T))); - } - memcpy(conv_buffer_data + out_offset, in_data + in_offset, - single_row_num * sizeof(T)); - if (right_padding > 0) { - const int right_start = (out_offset + single_row_num); - memset(conv_buffer_data + right_start, byte_zero, - (right_padding * in_depth * sizeof(T))); - } - out_offset += kwidth_times_indepth; - in_offset += inwidth_times_indepth; - } - } - - // If the bottom of the patch falls off the input image, pad the values - // representing those input rows with zeroes. - if (bottom_padding > 0) { - const int bottom_row_elements = (bottom_padding * kwidth * in_depth); - const int bottom_start = - output_row_offset + - ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); - memset(conv_buffer_data + bottom_start, byte_zero, - (bottom_row_elements * sizeof(T))); - } -} - -template <typename T> -void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, - int stride_height, int pad_width, int pad_height, int kheight, - int kwidth, uint8 byte_zero, T* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Im2col"); - DCHECK(IsPackedWithoutStrides(input_dims)); - DCHECK(IsPackedWithoutStrides(output_dims)); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = ArraySize(input_dims, 0); - const int input_width = ArraySize(input_dims, 1); - const int input_height = ArraySize(input_dims, 2); - const int output_depth = ArraySize(output_dims, 0); - const int output_width = ArraySize(output_dims, 1); - const int output_height = ArraySize(output_dims, 2); - - int buffer_id = 0; - // Loop over the output nodes. 
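// Editor's aside (illustrative, not part of the original file): each
// iteration of the loops below fills one im2col column whose length is, by
// construction of the im2col buffer, kheight * kwidth * input_depth (passed
// in as single_buffer_length), and buffer_id enumerates output positions in
// (b, h, w) order, i.e. (the helper name is hypothetical):

inline int Im2colBufferId(int b, int h, int w, int out_height, int out_width) {
  return (b * out_height + h) * out_width + w;  // order of the loops below
}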
- for (int b = 0; b < batches; ++b) { - for (int h = 0; h < output_height; ++h) { - for (int w = 0; w < output_width; ++w) { - ExtractPatchIntoBufferColumn( - input_dims, w, h, b, kheight, kwidth, stride_width, stride_height, - pad_width, pad_height, input_width, input_height, input_depth, - output_depth, buffer_id, input_data, output_data, byte_zero); - ++buffer_id; - } - } - } -} - -template <FusedActivationFunctionType Ac> -void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride_width, - int stride_height, int pad_width, int pad_height, float* output_data, - const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - (void)im2col_data; - (void)im2col_dims; - gemmlowp::ScopedProfilingLabel label("Conv"); - - const float* gemm_input_data = nullptr; - const Dims<4>* gemm_input_dims = nullptr; - const int filter_width = ArraySize(filter_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const bool need_im2col = stride_width != 1 || stride_height != 1 || - filter_width != 1 || filter_height != 1; - if (need_im2col) { - DCHECK(im2col_data); - Im2col(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_height, filter_width, 0, im2col_data, - im2col_dims); - gemm_input_data = im2col_data; - gemm_input_dims = &im2col_dims; - } else { -#if 0 // TODO-NNRT : Check whether this is needed; 'im2col_data' seems to never be null. - DCHECK(!im2col_data); -#endif - gemm_input_data = input_data; - gemm_input_dims = &input_dims; - } - - const auto im2col_matrix_map = - MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims); - const auto filter_matrix_map = - MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); - auto output_matrix_map = - MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - - Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); - - AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, - output_dims); -} - -template <FusedActivationFunctionType Ac> -void Conv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int stride_width, - int stride_height, int pad_width, int pad_height, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, - const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label("Conv/8bit"); - - DCHECK(IsPackedWithoutStrides(input_dims)); - DCHECK(IsPackedWithoutStrides(filter_dims)); - DCHECK(IsPackedWithoutStrides(output_dims)); - - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - - const uint8* gemm_input_data = nullptr; - const Dims<4>* gemm_input_dims = nullptr; - const int filter_width = ArraySize(filter_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const bool need_im2col = stride_width != 1 || stride_height != 1 || - filter_width != 1 || filter_height != 1; - if (need_im2col) { - DCHECK(im2col_data); - const int input_zero_point = -input_offset; - DCHECK_GE(input_zero_point, 0); - DCHECK_LE(input_zero_point, 255); - 
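// Editor's aside (illustrative, not part of the original file): the padding
// byte for the quantized path must be the zero point rather than byte 0,
// because a stored byte q represents the real value
// input_scale * (q + input_offset); only q == -input_offset maps back to
// real 0.0. E.g. with input_offset == -128, padded cells must hold 128. A
// hypothetical helper spelling this out:

#include <cassert>
#include <cstdint>

inline uint8_t PaddingByteForRealZero(int32_t input_offset) {
  const int32_t zero_point = -input_offset;  // as checked just above
  assert(zero_point >= 0 && zero_point <= 255);
  return static_cast<uint8_t>(zero_point);
}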
Im2col(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_height, filter_width, input_zero_point, - im2col_data, im2col_dims); - gemm_input_data = im2col_data; - gemm_input_dims = &im2col_dims; - } else { -#if 0 // TODO-NNRT : Check whether this is needed; 'im2col_data' seems to never be null. - DCHECK(!im2col_data); -#endif - gemm_input_data = input_data; - gemm_input_dims = &input_dims; - } - - const int gemm_input_rows = gemm_input_dims->sizes[0]; - const int gemm_input_cols = gemm_input_dims->sizes[1] * - gemm_input_dims->sizes[2] * - gemm_input_dims->sizes[3]; - const int filter_rows = filter_dims.sizes[3]; - const int filter_cols = - filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2]; - const int output_rows = output_dims.sizes[0]; - const int output_cols = - output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3]; - DCHECK_EQ(output_rows, filter_rows); - DCHECK_EQ(output_cols, gemm_input_cols); - DCHECK_EQ(filter_cols, gemm_input_rows); - DCHECK_EQ(bias_dims.sizes[0], output_rows); - DCHECK_EQ(bias_dims.sizes[1], 1); - DCHECK_EQ(bias_dims.sizes[2], 1); - DCHECK_EQ(bias_dims.sizes[3], 1); - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, filter_rows, filter_cols); - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( - gemm_input_data, gemm_input_rows, gemm_input_cols); - gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( - output_data, output_rows, output_cols); - const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( - bias_data, output_rows, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max); - gemmlowp::GemmWithOutputPipeline<uint8, uint8, - gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, - input_offset, output_pipeline); -} - -template <typename T> -inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, - int block_size, T* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("DepthToSpace"); - - const int input_depth = ArraySize(input_dims, 0); - const int input_width = ArraySize(input_dims, 1); - const int input_height = ArraySize(input_dims, 2); - - const int output_depth = ArraySize(output_dims, 0); - const int batch_size = ArraySize(output_dims, 3); - - // Number of contiguous values that we can copy in one iteration. 
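// Editor's aside (illustrative, not part of the original file): a worked
// example of the copy width computed below -- with block_size == 2 and
// input_depth == 16, output_depth is 16 / (2 * 2) == 4 and each memcpy moves
// block_size * output_depth == 8 values, i.e. one block row of one input
// pixel. As a hypothetical standalone helper:

inline int DepthToSpaceCopyStride(int block_size, int input_depth) {
  const int output_depth = input_depth / (block_size * block_size);
  return block_size * output_depth;
}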
- const int stride = block_size * output_depth; - - for (int batch = 0; batch < batch_size; ++batch) { - for (int in_h = 0; in_h < input_height; ++in_h) { - const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch); - for (int offset_h = 0; offset_h < block_size; ++offset_h) { - const T* src = input_ptr; - for (int in_w = 0; in_w < input_width; ++in_w) { - memcpy(output_data, src, stride * sizeof(T)); - output_data += stride; - src += input_depth; - } - input_ptr += stride; - } - } - } -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac, typename T> -void Im2col(const T* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int kheight, int kwidth, - uint8 byte_zero, T* output_data, const Dims<4>& output_dims) { - Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight, - kwidth, byte_zero, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void ConvAsGemm(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("ConvAsGemm"); - - const auto input_matrix_map = - MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - const auto filter_matrix_map = - MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); - auto output_matrix_map = - MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - - Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); - - AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, - output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - const int input_rows = input_dims.sizes[0]; - const int input_cols = - input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3]; - const int filter_rows = filter_dims.sizes[3]; - const int filter_cols = - filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2]; - const int output_rows = output_dims.sizes[0]; - const int output_cols = - output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3]; - DCHECK_EQ(output_rows, filter_rows); - DCHECK_EQ(output_cols, input_cols); - DCHECK_EQ(filter_cols, input_rows); - DCHECK_EQ(bias_dims.sizes[0], output_rows); - DCHECK_EQ(bias_dims.sizes[1], 1); - DCHECK_EQ(bias_dims.sizes[2], 1); - DCHECK_EQ(bias_dims.sizes[3], 1); - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, output_rows, filter_cols, filter_cols); - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( - input_data, filter_cols, output_cols, filter_cols); - 
gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( - output_data, output_rows, output_cols, output_rows); - const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( - bias_data, output_rows, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max); - gemmlowp::GemmWithOutputPipeline<uint8, uint8, - gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, - input_offset, output_pipeline); -} - -template <typename T> -inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, - int block_size, T* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("SpaceToDepth"); - - const int output_depth = ArraySize(output_dims, 0); - const int output_width = ArraySize(output_dims, 1); - const int output_height = ArraySize(output_dims, 2); - - const int input_depth = ArraySize(input_dims, 0); - const int batch_size = ArraySize(input_dims, 3); - - // Number of contiguous values that we can copy in one iteration. - const int stride = block_size * input_depth; - - for (int batch = 0; batch < batch_size; ++batch) { - for (int out_h = 0; out_h < output_height; ++out_h) { - T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch); - for (int offset_h = 0; offset_h < block_size; ++offset_h) { - T* dst = output_ptr; - for (int out_w = 0; out_w < output_width; ++out_w) { - memcpy(dst, input_data, stride * sizeof(T)); - input_data += stride; - dst += output_depth; - } - output_ptr += stride; - } - } - } -} - -template <FusedActivationFunctionType Ac> -void NonGlobalBatchNormalization( - const float* input_data, const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, const float* multiplier_data, - const Dims<4>& multiplier_dims, const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = - MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2, - offset_dims, 2, output_dims, 2); - const int width = - MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1, - offset_dims, 1, output_dims, 1); - const int depth = - MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, - offset_dims, 0, output_dims, 0); - - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( - (input_data[Offset(input_dims, c, x, y, b)] - - mean_data[Offset(mean_dims, c, x, y, 0)]) * - multiplier_data[Offset(multiplier_dims, c, x, y, 0)] + - offset_data[Offset(offset_dims, c, x, y, 0)]); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -void GlobalBatchNormalization(const float* input_data, - const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, - const float* multiplier_data, - const Dims<4>& multiplier_dims, - const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - 
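// Editor's aside (an assumption about how callers prepare the data, not
// stated in this file): both batch-normalization kernels consume a pre-folded
// multiplier, i.e. the textbook gamma * (x - mean) / sqrt(var + eps) + beta
// is expected to arrive here as (x - mean) * multiplier + offset. A
// hypothetical one-time folding step would be:

#include <cmath>

inline void FoldBatchNormParams(float gamma, float beta, float variance,
                                float epsilon, float* multiplier,
                                float* offset) {
  *multiplier = gamma / std::sqrt(variance + epsilon);
  *offset = beta;  // the mean is subtracted separately by the kernel
}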
const int depth = - MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, - offset_dims, 0, output_dims, 0); - - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( - (input_data[Offset(input_dims, c, x, y, b)] - - mean_data[Offset(mean_dims, c, 0, 0, 0)]) * - multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] + - offset_data[Offset(offset_dims, c, 0, 0, 0)]); - } - } - } - } -} - -inline void Relu(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Relu (not fused)"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - float val = input_data[Offset(input_dims, c, x, y, b)]; - const float lower = 0; - float clamped = val < lower ? lower : val; - output_data[Offset(output_dims, c, x, y, b)] = clamped; - } - } - } - } -} - -inline void Relu1(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - float val = input_data[Offset(input_dims, c, x, y, b)]; - const float upper = 1; - const float lower = -1; - float clamped = val > upper ? upper : val < lower ? lower : val; - output_data[Offset(output_dims, c, x, y, b)] = clamped; - } - } - } - } -} - -inline void Relu6(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - float val = input_data[Offset(input_dims, c, x, y, b)]; - const float upper = 6; - const float lower = 0; - float clamped = val > upper ? upper : val < lower ? 
lower : val; - output_data[Offset(output_dims, c, x, y, b)] = clamped; - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -void L2Normalization(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("L2Normalization"); - static_assert(Ac == FusedActivationFunctionType::kNone, ""); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - float squared_l2_norm = 0; - for (int c = 0; c < depth; ++c) { - float val = input_data[Offset(input_dims, c, x, y, b)]; - squared_l2_norm += val * val; - } - float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm); - for (int c = 0; c < depth; ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm; - } - } - } - } -} - -inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, - int* output_shift) { - *output_shift = 11; - while (input >= (1 << 29)) { - input /= 4; - ++*output_shift; - } - DCHECK_GT(input, 0); - const unsigned max_left_shift_bits = __builtin_clz(input) - 1; - const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; - const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; - *output_shift -= left_shift_bit_pairs; - input <<= 2 * left_shift_bit_pairs; - DCHECK_GE(input, (1 << 27)); - DCHECK_LT(input, (1 << 29)); - using gemmlowp::FixedPoint; - using gemmlowp::Rescale; - using gemmlowp::SaturatingRoundingMultiplyByPOT; - // Using 3 integer bits gives us enough room for the internal arithmetic in - // this Newton-Raphson iteration. - using F3 = FixedPoint<int32, 3>; - using F0 = FixedPoint<int32, 0>; - const F3 fixedpoint_input = F3::FromRaw(input >> 1); - const F3 fixedpoint_half_input = - SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); - const F3 fixedpoint_half_three = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); - // Newton-Raphson iteration - // Naive unoptimized starting guess: x = 1 - F3 x = F3::One(); - // Naive unoptimized number of iterations: 5 - for (int i = 0; i < 5; i++) { - const F3 x3 = Rescale<3>(x * x * x); - x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); - } - const F0 fixedpoint_half_sqrt_2 = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) 
/ 2.); - x = x * fixedpoint_half_sqrt_2; - *output_inv_sqrt = x.raw(); - if (*output_shift < 0) { - *output_inv_sqrt <<= -*output_shift; - *output_shift = 0; - } -} - -inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - DCHECK(IsPackedWithoutStrides(input_dims)); - DCHECK(IsPackedWithoutStrides(output_dims)); - DCHECK_EQ(batches, 1); - DCHECK_EQ(height, 1); - DCHECK_EQ(width, 1); - int32 square_l2_norm = 0; - for (int i = 0; i < depth; i++) { - int32 diff = input_data[i] - input_zero_point; - square_l2_norm += diff * diff; - } - int32 inv_l2norm_multiplier; - int inv_l2norm_shift; - GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier, - &inv_l2norm_shift); - - for (int i = 0; i < depth; i++) { - int32 diff = input_data[i] - input_zero_point; - int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); - int32 unclamped_output_val = 128 + rescaled_diff; - int32 output_val = std::min(255, std::max(0, unclamped_output_val)); - output_data[i] = static_cast<uint8>(output_val); - } -} - -template <FusedActivationFunctionType Ac> -void Add(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Add"); - /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, - output_dims, 3); - /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, - output_dims, 2); - /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, - output_dims, 1); - /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, - output_dims, 0); - DCHECK(IsPackedWithoutStrides(input1_dims)); - DCHECK(IsPackedWithoutStrides(input2_dims)); - DCHECK(IsPackedWithoutStrides(output_dims)); - - int i = 0; - const int size = input1_dims.sizes[3] * input1_dims.strides[3]; -#ifdef USE_NEON - const auto zero = vdupq_n_f32(0); - const auto six = vdupq_n_f32(6); - const auto neg_one = vdupq_n_f32(-1); - const auto one = vdupq_n_f32(1); - for (; i <= size - 16; i += 16) { - auto a10 = vld1q_f32(input1_data + i); - auto a11 = vld1q_f32(input1_data + i + 4); - auto a12 = vld1q_f32(input1_data + i + 8); - auto a13 = vld1q_f32(input1_data + i + 12); - auto a20 = vld1q_f32(input2_data + i); - auto a21 = vld1q_f32(input2_data + i + 4); - auto a22 = vld1q_f32(input2_data + i + 8); - auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vaddq_f32(a10, a20); - auto x1 = vaddq_f32(a11, a21); - auto x2 = vaddq_f32(a12, a22); - auto x3 = vaddq_f32(a13, a23); - if (Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6) { - x0 = vmaxq_f32(zero, x0); - x1 = vmaxq_f32(zero, x1); - x2 = vmaxq_f32(zero, x2); - x3 = vmaxq_f32(zero, x3); - if (Ac == FusedActivationFunctionType::kRelu6) { - x0 = vminq_f32(six, x0); - x1 = vminq_f32(six, x1); - x2 = vminq_f32(six, x2); - x3 = vminq_f32(six, x3); - } - } else if (Ac == FusedActivationFunctionType::kRelu1) { - x0 = vmaxq_f32(neg_one, x0); - x1 = 
vmaxq_f32(neg_one, x1); - x2 = vmaxq_f32(neg_one, x2); - x3 = vmaxq_f32(neg_one, x3); - x0 = vminq_f32(one, x0); - x1 = vminq_f32(one, x1); - x2 = vminq_f32(one, x2); - x3 = vminq_f32(one, x3); - } - vst1q_f32(output_data + i, x0); - vst1q_f32(output_data + i + 4, x1); - vst1q_f32(output_data + i + 8, x2); - vst1q_f32(output_data + i + 12, x3); - } - for (; i <= size - 4; i += 4) { - auto a1 = vld1q_f32(input1_data + i); - auto a2 = vld1q_f32(input2_data + i); - auto x = vaddq_f32(a1, a2); - if (Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6) { - x = vmaxq_f32(zero, x); - if (Ac == FusedActivationFunctionType::kRelu6) { - x = vminq_f32(six, x); - } - } else if (Ac == FusedActivationFunctionType::kRelu1) { - x = vmaxq_f32(neg_one, x); - x = vminq_f32(one, x); - } - vst1q_f32(output_data + i, x); - } -#endif // NEON - - for (; i < size; i++) { - auto x = input1_data[i] + input2_data[i]; - output_data[i] = ActivationFunction<Ac>(x); - } -} - -template <FusedActivationFunctionType Ac> -inline void Add(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, int input2_shift, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - DCHECK_EQ(output_activation_min, 0); - DCHECK_EQ(output_activation_max, 255); - } - gemmlowp::ScopedProfilingLabel label("Add/8bit"); - /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, - output_dims, 3); - /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, - output_dims, 2); - /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, - output_dims, 1); - /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, - output_dims, 0); - DCHECK(IsPackedWithoutStrides(input1_dims)); - DCHECK(IsPackedWithoutStrides(input2_dims)); - DCHECK(IsPackedWithoutStrides(output_dims)); - - int i = 0; - const int size = input1_dims.sizes[3] * input1_dims.strides[3]; - DCHECK_GT(input1_offset, -256); - DCHECK_GT(input2_offset, -256); - DCHECK_LT(input1_offset, 256); - DCHECK_LT(input2_offset, 256); -#ifdef USE_NEON - for (; i <= size - 8; i += 8) { - const auto input1_val_original = vld1_u8(input1_data + i); - const auto input2_val_original = vld1_u8(input2_data + i); - const auto input1_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input1_val_original)); - const auto input2_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); - const auto input1_val = - vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset)); - const auto input2_val = - vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset)); - const auto input1_val_high = vget_high_s16(input1_val); - const auto input1_val_low = vget_low_s16(input1_val); - const auto input2_val_high = vget_high_s16(input2_val); - const auto input2_val_low = vget_low_s16(input2_val); - auto x11 = vmovl_s16(input1_val_low); - auto x12 = vmovl_s16(input1_val_high); - auto x21 = vmovl_s16(input2_val_low); - auto x22 = 
vmovl_s16(input2_val_high); - const auto left_shift_dup = vdupq_n_s32(left_shift); - x11 = vshlq_s32(x11, left_shift_dup); - x12 = vshlq_s32(x12, left_shift_dup); - x21 = vshlq_s32(x21, left_shift_dup); - x22 = vshlq_s32(x22, left_shift_dup); - x11 = vqrdmulhq_n_s32(x11, input1_multiplier); - x12 = vqrdmulhq_n_s32(x12, input1_multiplier); - x21 = vqrdmulhq_n_s32(x21, input2_multiplier); - x22 = vqrdmulhq_n_s32(x22, input2_multiplier); - const auto input1_shift_dup = vdupq_n_s32(-input1_shift); - const auto input2_shift_dup = vdupq_n_s32(-input2_shift); - x11 = vshlq_s32(x11, input1_shift_dup); - x12 = vshlq_s32(x12, input1_shift_dup); - x21 = vshlq_s32(x21, input2_shift_dup); - x22 = vshlq_s32(x22, input2_shift_dup); - auto s1 = vaddq_s32(x11, x21); - auto s2 = vaddq_s32(x12, x22); - s1 = vqrdmulhq_n_s32(s1, output_multiplier); - s2 = vqrdmulhq_n_s32(s2, output_multiplier); - using gemmlowp::RoundingDivideByPOT; - s1 = RoundingDivideByPOT(s1, output_shift); - s2 = RoundingDivideByPOT(s2, output_shift); - const auto s1_narrowed = vmovn_s32(s1); - const auto s2_narrowed = vmovn_s32(s2); - const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), - vdupq_n_s16(output_offset)); - vst1_u8(output_data + i, vqmovun_s16(s)); - } -#endif // NEON - - for (; i < size; i++) { - const int32 input1_val = input1_offset + input1_data[i]; - const int32 input2_val = input2_offset + input2_data[i]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sum, output_multiplier, output_shift) + - output_offset; - const int32 clamped_output = std::min( - output_activation_max, std::max(output_activation_min, raw_output)); - output_data[i] = static_cast<uint8>(clamped_output); - } -} - - -// TODO: We can implement BroadcastAdd on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. -// TODO: BroadcastAdd is intentionally duplicated from -// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> -// is no longer referenced in this file, move NdArrayDesc<T> from types.h to -// reference_ops.h. -template <FusedActivationFunctionType Ac> -void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAdd"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. 
- // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] + - input2_data[SubscriptToIndex(desc2, c, x, y, b)]); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - DCHECK_EQ(output_activation_min, 0); - DCHECK_EQ(output_activation_max, 255); - } - gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
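// Editor's aside (illustrative, not part of the original file): a concrete
// instance of the stride-0 trick set up by NdArrayDescsForElementwiseBroadcast
// above. A bias-like operand of shape (1, 1, 1, 64) broadcast against
// (1, 16, 16, 64) keeps stride 1 along channels but gets extent 16 / stride 0
// along width and height, so SubscriptToIndex lands on the same 64-element
// slice for every (x, y):

#include <cassert>

inline void BroadcastDescExample() {
  NdArrayDesc<4> desc;  // Dims order is reversed: (c, w, h, b)
  desc.extents[0] = 64; desc.strides[0] = 1;
  desc.extents[1] = 16; desc.strides[1] = 0;   // broadcast along width
  desc.extents[2] = 16; desc.strides[2] = 0;   // broadcast along height
  desc.extents[3] = 1;  desc.strides[3] = 64;  // batch extent is 1, unused
  assert(SubscriptToIndex(desc, 5, 0, 0, 0) ==
         SubscriptToIndex(desc, 5, 15, 15, 0));  // same slice everywhere
}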
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sum, output_multiplier, output_shift) + - output_offset; - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast<uint8>(clamped_output); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -void Mul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Mul"); - /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, - output_dims, 3); - /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, - output_dims, 2); - /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, - output_dims, 1); - /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, - output_dims, 0); - DCHECK(IsPackedWithoutStrides(input1_dims)); - DCHECK(IsPackedWithoutStrides(input2_dims)); - DCHECK(IsPackedWithoutStrides(output_dims)); - - int i = 0; - const int size = input1_dims.sizes[3] * input1_dims.strides[3]; -#ifdef USE_NEON - const auto zero = vdupq_n_f32(0); - const auto six = vdupq_n_f32(6); - const auto neg_one = vdupq_n_f32(-1); - const auto one = vdupq_n_f32(1); - for (; i <= size - 16; i += 16) { - auto a10 = vld1q_f32(input1_data + i); - auto a11 = vld1q_f32(input1_data + i + 4); - auto a12 = vld1q_f32(input1_data + i + 8); - auto a13 = vld1q_f32(input1_data + i + 12); - auto a20 = vld1q_f32(input2_data + i); - auto a21 = vld1q_f32(input2_data + i + 4); - auto a22 = vld1q_f32(input2_data + i + 8); - auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vmulq_f32(a10, a20); - auto x1 = vmulq_f32(a11, a21); - auto x2 = vmulq_f32(a12, a22); - auto x3 = vmulq_f32(a13, a23); - if (Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6) { - x0 = vmaxq_f32(zero, x0); - x1 = vmaxq_f32(zero, x1); - x2 = vmaxq_f32(zero, x2); - x3 = vmaxq_f32(zero, x3); - if (Ac == FusedActivationFunctionType::kRelu6) { - x0 = vminq_f32(six, x0); - x1 = vminq_f32(six, x1); - x2 = vminq_f32(six, x2); - x3 = vminq_f32(six, x3); - } - } else if (Ac == FusedActivationFunctionType::kRelu1) { - x0 = vmaxq_f32(neg_one, x0); - x1 = vmaxq_f32(neg_one, x1); - x2 = vmaxq_f32(neg_one, x2); - x3 = vmaxq_f32(neg_one, x3); - x0 = vminq_f32(one, x0); - x1 = vminq_f32(one, x1); - x2 = vminq_f32(one, x2); - x3 = vminq_f32(one, x3); - } - vst1q_f32(output_data + i, x0); - vst1q_f32(output_data + i + 4, x1); - 
vst1q_f32(output_data + i + 8, x2); - vst1q_f32(output_data + i + 12, x3); - } - for (; i <= size - 4; i += 4) { - auto a1 = vld1q_f32(input1_data + i); - auto a2 = vld1q_f32(input2_data + i); - auto x = vmulq_f32(a1, a2); - if (Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6) { - x = vmaxq_f32(zero, x); - if (Ac == FusedActivationFunctionType::kRelu6) { - x = vminq_f32(six, x); - } - } else if (Ac == FusedActivationFunctionType::kRelu1) { - x = vmaxq_f32(neg_one, x); - x = vminq_f32(one, x); - } - vst1q_f32(output_data + i, x); - } -#endif // NEON - - for (; i < size; i++) { - auto x = input1_data[i] * input2_data[i]; - output_data[i] = ActivationFunction<Ac>(x); - } -} - -// TODO: We can implement BroadcastMul on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. -// TODO: BroadcastMul is intentionally duplicated from -// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> -// is no longer referenced in this file, move NdArrayDesc<T> from types.h to -// reference_ops.h. -template <FusedActivationFunctionType Ac> -void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] * - input2_data[SubscriptToIndex(desc2, c, x, y, b)]); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, - int32 input1_offset, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - DCHECK_EQ(output_activation_min, 0); - DCHECK_EQ(output_activation_max, 255); - } - gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
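In real-number terms, the quantized loop below computes real_out = real1 * real2, where real_i = scale_i * (q_i + input_i_offset) and output_multiplier/output_shift encode scale1 * scale2 / output_scale as a fixed-point multiplier. A worked numeric instance (quantization parameters hypothetical):

// Sketch only: scale1 = scale2 = 0.5, output_scale = 1.0, all zero points
// at 128, so input1_offset = input2_offset = -128 and output_offset = +128.
// For quantized inputs q1 = 130, q2 = 132:
//   input1_val = -128 + 130 = 2   ->  real1 = 0.5 * 2 = 1.0
//   input2_val = -128 + 132 = 4   ->  real2 = 0.5 * 4 = 2.0
//   product    = 2 * 4 = 8, rescaled by 0.5 * 0.5 / 1.0 = 0.25 to give 2,
//   output     = 128 + 2 = 130, which dequantizes to 2.0 = 1.0 * 2.0.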
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 unclamped_result = - output_offset + - MultiplyByQuantizedMultiplierSmallerThanOne( - input1_val * input2_val, output_multiplier, output_shift); - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, unclamped_result)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast<uint8>(clamped_output); - } - } - } - } -} - -template <FusedActivationFunctionType Ac, typename Scalar> -void Concatenation(int concat_dim, const Scalar* const* input_data, - const Dims<4>* const* input_dims, int inputs_count, - Scalar* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Concatenation"); - DCHECK_GT(inputs_count, 1); - int concat_size = 0; - for (int i = 0; i < inputs_count; i++) { - for (int j = 0; j < 4; j++) { - if (j != concat_dim) { - MatchingArraySize(*input_dims[i], j, output_dims, j); - } - } - concat_size += ArraySize(*input_dims[i], concat_dim); - } - DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); - DCHECK(IsPackedWithoutStrides(output_dims)); - // for now we don't have a model with a Concatenation - // with fused activation function. - DCHECK(Ac == FusedActivationFunctionType::kNone); - int outer_size = 1; - for (int i = concat_dim + 1; i < 4; i++) { - outer_size *= output_dims.sizes[i]; - } - Scalar* output_ptr = output_data; - for (int k = 0; k < outer_size; k++) { - for (int i = 0; i < inputs_count; ++i) { - const int copy_size = - input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; - memcpy(output_ptr, input_data[i] + k * copy_size, - copy_size * sizeof(Scalar)); - output_ptr += copy_size; - } - } -} - -template <FusedActivationFunctionType Ac, typename Scalar> -void DepthConcatenation(const Scalar* const* input_data, - const Dims<4>* const* input_dims, int inputs_count, - Scalar* output_data, const Dims<4>& output_dims) { - Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count, - output_data, output_dims); -} - -inline void LstmCell(const float* input_data, const Dims<4>& input_dims, - const float* prev_activ_data, - const Dims<4>& prev_activ_dims, const float* weights_data, - const Dims<4>& weights_dims, const float* bias_data, - const Dims<4>& bias_dims, const float* prev_state_data, - const Dims<4>& prev_state_dims, float* output_state_data, - const Dims<4>& output_state_dims, float* output_activ_data, - const Dims<4>& output_activ_dims, float* concat_temp_data, - const Dims<4>& concat_temp_dims, float* activ_temp_data, - const Dims<4>& activ_temp_dims) { - gemmlowp::ScopedProfilingLabel label("LstmCell"); - MatchingArraySize( // batches - input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims, - 3, output_activ_dims, 3); - MatchingArraySize( // height - input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims, - 2, output_activ_dims, 2); - MatchingArraySize( // width - input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims, - 1, output_activ_dims, 1); - CHECK_EQ(ArraySize(weights_dims, 2), 1); - CHECK_EQ(ArraySize(weights_dims, 3), 1); - const int input_depth = ArraySize(input_dims, 0); -
const int prev_activ_depth = ArraySize(prev_activ_dims, 0); - const int total_input_depth = prev_activ_depth + input_depth; - CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); - CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), 1); - const int intern_activ_depth = MatchingArraySize( - weights_dims, 1, - bias_dims, 0); - CHECK_EQ(intern_activ_depth % 4, 0); - const int output_depth = MatchingArraySize( - prev_state_dims, 0, - prev_activ_dims, 0, - output_state_dims, 0, - output_activ_dims, 0); - CHECK_EQ(output_depth, intern_activ_depth / 4); - - // Concatenate prev_activ and input data together - std::vector<float const*> concat_input_arrays_data; - std::vector<Dims<4> const*> concat_input_arrays_dims; - concat_input_arrays_data.push_back(input_data); - concat_input_arrays_data.push_back(prev_activ_data); - concat_input_arrays_dims.push_back(&input_dims); - concat_input_arrays_dims.push_back(&prev_activ_dims); - Concatenation<FusedActivationFunctionType::kNone, float>( - 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]), - concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims); - - // Fully connected - FullyConnected<FusedActivationFunctionType::kNone>( - concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data, - bias_dims, activ_temp_data, activ_temp_dims); - - // Map raw arrays to Eigen arrays so we can use Eigen's optimized array - // operations. - ArrayMap<float> activ_temp_map = - MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims); - auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth, - activ_temp_map.cols()); - auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth, - activ_temp_map.cols()); - auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth, - activ_temp_map.cols()); - auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth, - activ_temp_map.cols()); - ArrayMap<const float> prev_state_map = - MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims); - ArrayMap<float> output_state_map = - MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims); - ArrayMap<float> output_activ_map = - MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims); - - // Combined memory state and final output calculation - gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput"); - output_state_map = - input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * - new_input_sm.tanh() + - forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * - prev_state_map; - output_activ_map = - output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * - output_state_map.tanh(); -} - -template <FusedActivationFunctionType Ac, typename Scalar> -void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, - int outputs_count, Scalar* const* output_data, - const Dims<4>* const* output_dims) { - gemmlowp::ScopedProfilingLabel label("TensorFlowSplit"); - DCHECK_GE(outputs_count, 1); - for (int i = 0; i < outputs_count; i++) { - /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); - /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); - /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); - } - const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3); - const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2); - const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1); 
- DCHECK(IsPackedWithoutStrides(input_dims)); - // for now we don't have a model with a TensorFlowSplit - // with fused activation function. - DCHECK(Ac == FusedActivationFunctionType::kNone); - const int whb = width * height * batches; - const Scalar* input_ptr = input_data; - for (int k = 0; k < whb; k++) { - for (int i = 0; i < outputs_count; ++i) { - memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr, - output_dims[i]->sizes[0] * sizeof(Scalar)); - input_ptr += output_dims[i]->sizes[0]; - } - } -} - -inline int NodeOffset(int b, int h, int w, int height, int width) { - return (b * height + h) * width + w; -} - -template <FusedActivationFunctionType Ac> -void AveragePool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, int kwidth, int kheight, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("AveragePool"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - // TODO: get rid of the dynamic memory allocation here! - Eigen::VectorXf out_count(out_mat.cols()); - out_count.setZero(); - // Prefill the output to 0. - out_mat.setZero(); - for (int b = 0; b < batches; ++b) { - for (int h = 0; h < input_height; ++h) { - for (int w = 0; w < input_width; ++w) { - // (h_start, h_end) * (w_start, w_end) is the range that the input - // vector projects to. - int hpad = h + pad_height; - int wpad = w + pad_width; - int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1; - int h_end = std::min(hpad / stride_height + 1, output_height); - int w_start = (wpad < kwidth) ?
0 : (wpad - kwidth) / stride_width + 1; - int w_end = std::min(wpad / stride_width + 1, output_width); - // compute elementwise sum - for (int ph = h_start; ph < h_end; ++ph) { - for (int pw = w_start; pw < w_end; ++pw) { - int out_offset = NodeOffset(b, ph, pw, output_height, output_width); - out_mat.col(out_offset) += - in_mat.col(NodeOffset(b, h, w, input_height, input_width)); - out_count(out_offset)++; - } - } - } - } - } - // Divide the output by the actual number of elements being averaged over - DCHECK_GT(out_count.minCoeff(), 0); - out_mat.array().rowwise() /= out_count.transpose().array(); - - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < output_height; ++y) { - for (int x = 0; x < output_width; ++x) { - for (int c = 0; c < depth; ++c) { - output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( - output_data[Offset(output_dims, c, x, y, b)]); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -void AveragePool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, int filter_width, - int filter_height, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("AveragePool/8bit"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - DCHECK_EQ(output_activation_min, 0); - DCHECK_EQ(output_activation_max, 255); - } - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); - const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); - // TODO: Add a dynamic buffer allocation path instead of hardcoded size. 
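The 8-bit path that follows accumulates each pooling window into a uint16 buffer and then divides by the pool size with round-to-nearest: adding filter_count / 2 before the integer division rounds halfway cases up. A scalar sketch of that step (helper name hypothetical):

// Sketch only: the rounded integer average computed below.
// e.g. acc = 25, filter_count = 9 -> (25 + 4) / 9 = 3   (25 / 9 ~ 2.78)
//      acc = 22, filter_count = 9 -> (22 + 4) / 9 = 2   (22 / 9 ~ 2.44)
inline uint16 RoundedAverage(uint16 acc, int filter_count) {
  return static_cast<uint16>((acc + filter_count / 2) / filter_count);
}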
- static constexpr int kAccBufferMaxSize = 2048; - DCHECK_LE(depth, kAccBufferMaxSize); - uint16 acc[kAccBufferMaxSize]; - memset(acc, 0, depth * sizeof(acc[0])); - const uint8* input_ptr = - input_data + input_dims.strides[1] * in_x_origin + - input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; - for (int fy = filter_y_start; fy < filter_y_end; fy++) { - const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + - filter_x_start * input_dims.strides[1]; - for (int fx = filter_x_start; fx < filter_x_end; fx++) { - int channel = 0; -#ifdef USE_NEON - for (; channel <= depth - 16; channel += 16) { - uint16x8_t acc_reg[2]; - for (int i = 0; i < 2; i++) { - acc_reg[i] = vld1q_u16(acc + channel + 8 * i); - } - uint8x16_t input_reg = vld1q_u8(input_row_ptr); - input_row_ptr += 16; - acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg)); - acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg)); - for (int i = 0; i < 2; i++) { - vst1q_u16(acc + channel + 8 * i, acc_reg[i]); - } - } - for (; channel <= depth - 8; channel += 8) { - uint16x8_t acc_reg = vld1q_u16(acc + channel); - uint8x8_t input_reg = vld1_u8(input_row_ptr); - input_row_ptr += 8; - acc_reg = vaddw_u8(acc_reg, input_reg); - vst1q_u16(acc + channel, acc_reg); - } -#endif - for (; channel < depth; ++channel) { - acc[channel] += *input_row_ptr++; - } - } - } - uint8* output_ptr = - output_data + Offset(output_dims, 0, out_x, out_y, batch); - int channel = 0; -#ifdef USE_NEON -#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ - if (filter_count == FILTER_COUNT) { \ - for (; channel <= depth - 8; channel += 8) { \ - uint16 buf[8]; \ - for (int i = 0; i < 8; i++) { \ - buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ - } \ - uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ - buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); \ - buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); \ - vst1_u8(output_ptr + channel, buf8); \ - } \ - } - AVGPOOL_DIVIDING_BY(9) - AVGPOOL_DIVIDING_BY(15) -#undef AVGPOOL_DIVIDING_BY - for (; channel <= depth - 8; channel += 8) { - uint16 buf[8]; - for (int i = 0; i < 8; i++) { - buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; - } - uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); - buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); - buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); - vst1_u8(output_ptr + channel, buf8); - } -#endif - for (; channel < depth; ++channel) { - uint16 a = (acc[channel] + filter_count / 2) / filter_count; - a = std::max<uint16>(a, output_activation_min); - a = std::min<uint16>(a, output_activation_max); - output_ptr[channel] = static_cast<uint8>(a); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -void MaxPool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, int kwidth, int kheight, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("MaxPool"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - // Prefill the output to minimum 
representable float value - out_mat.setConstant(std::numeric_limits<float>::lowest()); - for (int b = 0; b < batches; ++b) { - for (int h = 0; h < input_height; ++h) { - for (int w = 0; w < input_width; ++w) { - // (h_start, h_end) * (w_start, w_end) is the range that the input - // vector projects to. - int hpad = h + pad_height; - int wpad = w + pad_width; - int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1; - int h_end = std::min(hpad / stride_height + 1, output_height); - int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1; - int w_end = std::min(wpad / stride_width + 1, output_width); - // compute elementwise max - for (int ph = h_start; ph < h_end; ++ph) { - for (int pw = w_start; pw < w_end; ++pw) { - int out_offset = NodeOffset(b, ph, pw, output_height, output_width); - out_mat.col(out_offset) = - out_mat.col(out_offset) - .cwiseMax(in_mat.col( - NodeOffset(b, h, w, input_height, input_width))); - } - } - } - } - } - - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < output_height; ++y) { - for (int x = 0; x < output_width; ++x) { - for (int c = 0; c < depth; ++c) { - output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( - output_data[Offset(output_dims, c, x, y, b)]); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -void MaxPool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("MaxPool/8bit"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - DCHECK_EQ(output_activation_min, 0); - DCHECK_EQ(output_activation_max, 255); - } - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); - // TODO: Add a dynamic buffer allocation path instead of hardcoded size.
- static constexpr int kAccBufferMaxSize = 2048; - DCHECK_LE(depth, kAccBufferMaxSize); - uint8 acc[kAccBufferMaxSize]; - memset(acc, 0, depth * sizeof(acc[0])); - const uint8* input_ptr = - input_data + input_dims.strides[1] * in_x_origin + - input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; - for (int fy = filter_y_start; fy < filter_y_end; fy++) { - const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + - filter_x_start * input_dims.strides[1]; - for (int fx = filter_x_start; fx < filter_x_end; fx++) { - int channel = 0; -#ifdef USE_NEON - for (; channel <= depth - 16; channel += 16) { - uint8x16_t acc_reg = vld1q_u8(acc + channel); - uint8x16_t input_reg = vld1q_u8(input_row_ptr); - input_row_ptr += 16; - acc_reg = vmaxq_u8(acc_reg, input_reg); - vst1q_u8(acc + channel, acc_reg); - } - - for (; channel <= depth - 8; channel += 8) { - uint8x8_t acc_reg = vld1_u8(acc + channel); - uint8x8_t input_reg = vld1_u8(input_row_ptr); - input_row_ptr += 8; - acc_reg = vmax_u8(acc_reg, input_reg); - vst1_u8(acc + channel, acc_reg); - } -#endif - for (; channel < depth; ++channel) { - acc[channel] = std::max(acc[channel], *input_row_ptr++); - } - } - } - uint8* output_ptr = - output_data + Offset(output_dims, 0, out_x, out_y, batch); - int channel = 0; -#ifdef USE_NEON - for (; channel <= depth - 16; channel += 16) { - uint8x16_t a = vld1q_u8(acc + channel); - a = vminq_u8(a, vdupq_n_u8(output_activation_max)); - a = vmaxq_u8(a, vdupq_n_u8(output_activation_min)); - vst1q_u8(output_ptr + channel, a); - } - for (; channel <= depth - 8; channel += 8) { - uint8x8_t a = vld1_u8(acc + channel); - a = vmin_u8(a, vdup_n_u8(output_activation_max)); - a = vmax_u8(a, vdup_n_u8(output_activation_min)); - vst1_u8(output_ptr + channel, a); - } -#endif - for (; channel < depth; ++channel) { - uint8 a = acc[channel]; - a = std::max<uint8>(a, output_activation_min); - a = std::min<uint8>(a, output_activation_max); - output_ptr[channel] = static_cast<uint8>(a); - } - } - } - } -} - -template <FusedActivationFunctionType Ac> -void L2Pool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, int filter_width, int filter_height, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("L2Pool"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - // Actually carry out L2 Pool. Code is written in forward mode: we go through - // the input values once, and write to all the pooled regions they map to. - const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - Eigen::VectorXf in_square(in_mat.rows()); - Eigen::VectorXf out_count(out_mat.cols()); - out_count.setZero(); - // Prefill the output to 0. - out_mat.setZero(); - for (int b = 0; b < batches; ++b) { - for (int h = 0; h < input_height; ++h) { - for (int w = 0; w < input_width; ++w) { - // (h_start, h_end) * (w_start, w_end) is the range that the input - // vector projects to. - const int hpad = h + pad_height; - const int wpad = w + pad_width; - const int h_start = - (hpad < filter_height) ?
0 : (hpad - filter_height) / stride_height + 1; - const int h_end = std::min(hpad / stride_height + 1, output_height); - const int w_start = - (wpad < filter_width) ? 0 : (wpad - filter_width) / stride_width + 1; - const int w_end = std::min(wpad / stride_width + 1, output_width); - // pre-compute square - const int in_offset = w + input_width * (h + input_height * b); - in_square = - in_mat.col(in_offset).array() * in_mat.col(in_offset).array(); - // compute elementwise sum of squares - for (int ph = h_start; ph < h_end; ++ph) { - for (int pw = w_start; pw < w_end; ++pw) { - const int out_offset = pw + output_width * (ph + output_height * b); - out_mat.col(out_offset) += in_square; - out_count(out_offset)++; - } - } - } - } - } - - out_count = out_count.array().inverse(); - out_mat = - (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt(); -} - -inline void LocalResponseNormalization(const float* input_data, - const Dims<4>& input_dims, int range, - float bias, float alpha, float beta, - float* output_data, - const Dims<4>& output_dims) { - /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3); - /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2); - /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1); - /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0); - - const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - - // Carry out local response normalization, vector by vector. - // Since the data are stored column major, a row-wise operation would - // probably not be memory efficient anyway, so we do an explicit for loop - // over the columns. - const int double_range = range * 2; - Eigen::VectorXf padded_square(data_in.rows() + double_range); - padded_square.setZero(); - for (int r = 0; r < data_in.cols(); ++r) { - // Do local response normalization for data_in(:, r) - // First, compute the squares and store them in a buffer for repeated use - padded_square.block(range, 0, data_in.rows(), 1) = - data_in.col(r).cwiseProduct(data_in.col(r)) * alpha; - // Then compute the scale and write it to data_out - float accumulated_scale = 0; - for (int i = 0; i < double_range; ++i) { - accumulated_scale += padded_square(i); - } - for (int i = 0; i < data_in.rows(); ++i) { - accumulated_scale += padded_square(i + double_range); - data_out(i, r) = bias + accumulated_scale; - accumulated_scale -= padded_square(i); - } - } - - // In a few cases, the pow computation could benefit from speedups.
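At this point data_out(i, r) holds the scale term, so the branch below finishes

  out(i, r) = in(i, r) * (bias + alpha * sum_j in(j, r)^2)^(-beta)

where the sum runs over the window of size 2*range around i; beta == 1 reduces the pow to a reciprocal and beta == 0.5 to a reciprocal square root, both of which Eigen evaluates much faster than a general pow.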
- if (beta == 1) { - data_out.array() = data_in.array() * data_out.array().inverse(); - } else if (beta == 0.5) { - data_out.array() = data_in.array() * data_out.array().sqrt().inverse(); - } else { - data_out.array() = data_in.array() * data_out.array().pow(-beta); - } -} - -inline void Softmax(const float* input_data, const Dims<4>& input_dims, - float beta, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Softmax"); - /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3); - /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2); - /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1); - /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0); - - const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - // Compute the exponential first, removing the max coefficient for numerical - // stability. - out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta; - // We are separating out the exp function so that exp can be vectorized. - out_mat = out_mat.array().exp(); - // Normalize to get the activations. - Eigen::Array<float, 1, Eigen::Dynamic> scale = - out_mat.array().colwise().sum().inverse(); - out_mat.array().rowwise() *= scale; -} - -inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, - int32 input_beta_multiplier, int32 input_beta_left_shift, - int diff_min, uint8* output_data, - const Dims<4>& output_dims) { - // The representation chosen for the input to the exp() function is Q5.26. - // We need to leave extra space since values that we skip might be as large as - // -32 before multiplying by input_beta_multiplier, and therefore as large as - // -16 afterwards. Note that exp(-8) is definitely not insignificant to - // accumulation, but exp(-16) definitely is. 
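For readers unfamiliar with the notation: Q5.26 means 5 integer bits and 26 fractional bits in an int32, i.e. raw = real * 2^26, giving a representable range of roughly [-32, 32); the accumulator below is likewise Q12.19. A minimal sketch of the encoding (value illustrative):

// Sketch only: encoding -16.0 in gemmlowp's Q5.26 fixed-point type.
using FixedPointScaledDiff = gemmlowp::FixedPoint<int32, 5>;
const int32 raw = static_cast<int32>(-16.0 * (1 << 26));  // -1073741824
const FixedPointScaledDiff x = FixedPointScaledDiff::FromRaw(raw);
// exp_on_negative_values(x) then yields exp(-16) as a Q0.31 value, small
// enough to vanish at the accumulation precision, as noted above.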
- static const int kScaledDiffIntegerBits = 5; - static const int kAccumulationIntegerBits = 12; - using FixedPointScaledDiff = - gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>; - using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; - using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; - - gemmlowp::ScopedProfilingLabel label("Softmax"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - for (int b = 0; b < batches; ++b) { - for (int x = 0; x < width; ++x) { - for (int y = 0; y < height; ++y) { - uint8 max_in_row = 0; - for (int c = 0; c < depth; ++c) { - max_in_row = - std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]); - } - - FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); - for (int c = 0; c < depth; ++c) { - int32 input_diff = - static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - - max_in_row; - if (input_diff >= diff_min) { - const int32 input_diff_rescaled = - MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - sum_of_exps = - sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); - } - } - - int32 fixed_sum_of_exps = sum_of_exps.raw(); - // TODO: Use a NEON intrinsic like vclzq_u32 instead. - int headroom_plus_one = - __builtin_clz(static_cast<uint32>(fixed_sum_of_exps)); - // This is the number of bits to the left of the binary point above 1.0. - // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and - // no later adjustment will be needed. 
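Working the 1.25 example through the arithmetic that follows: FixedPointAccum is Q12.19, so fixed_sum_of_exps = 1.25 * 2^19 = 655360, whose highest set bit is bit 19, and __builtin_clz returns 12; num_bits_over_unit = 12 - 12 = 0. Shifting left by 12 and subtracting 2^31 leaves 2^29, which read as a Q0.31 value is 0.25, and one_over_one_plus_x_for_x_in_0_1(0.25) = 1 / 1.25 = 0.8, exactly the shifted_scale promised above. The final RoundingDivideByPOT by num_bits_over_unit + 31 - 8 = 23 bits then maps the Q0.31 products onto the 8-bit output range.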
- int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; - int32 shifted_sum_minus_one = static_cast<int32>( - (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32>(1) << 31)); - - FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1( - FixedPoint0::FromRaw(shifted_sum_minus_one)); - - for (int c = 0; c < depth; ++c) { - int32 input_diff = - static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - - max_in_row; - if (input_diff >= diff_min) { - const int32 input_diff_rescaled = - MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - - FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32 unsat_output = gemmlowp::RoundingDivideByPOT( - (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); - - output_data[Offset(output_dims, c, x, y, b)] = - std::max(std::min(unsat_output, 255), 0); - - } else { - output_data[Offset(output_dims, c, x, y, b)] = 0; - } - } - } - } - } -} - -inline void Logistic(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Logistic"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); - output_map.array() = - input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()); -} - -inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Logistic"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)]; - const int32 input_val_centered = - static_cast<int32>(input_val_u8) - input_zero_point; - uint8 output_val; - if (input_val_centered < -input_range_radius) { - output_val = 0; - } else if (input_val_centered > input_range_radius) { - output_val = 255; - } else { - const int32 input_val_rescaled = - MultiplyByQuantizedMultiplierGreaterThanOne( - input_val_centered, input_multiplier, input_left_shift); - using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>; - using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; - const FixedPoint4 input_val_f4 = - FixedPoint4::FromRaw(input_val_rescaled); - const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4); - using gemmlowp::RoundingDivideByPOT; - int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23); - if (output_val_s32 == 256) { - output_val_s32 = 255; - } - DCHECK_GE(output_val_s32, 0); - DCHECK_LE(output_val_s32, 255); - output_val = static_cast<uint8>(output_val_s32); - } - output_data[Offset(output_dims, c, x, y, b)] = output_val; - } - } - } - } -} - -inline void Tanh(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Tanh"); - auto input_map = 
MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); - output_map.array() = input_map.array().tanh(); -} - -inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, - int32 zero_point, double scale, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Dequantize"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - int32 val = input_data[Offset(input_dims, c, x, y, b)]; - float result = static_cast<float>(scale * (val - zero_point)); - output_data[Offset(output_dims, c, x, y, b)] = result; - } - } - } - } -} - -inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, - float rmin, float rmax, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("FakeQuant"); - - // 0 should always be a representable value. Let's assume that the initial - // min,max range contains 0. - DCHECK_LE(rmin, 0.); - DCHECK_GE(rmax, 0.); - - // Determine quantization parameters: zero_point, scale. - using Integer = uint8; - const Integer qmin = std::numeric_limits<Integer>::min(); - const Integer qmax = std::numeric_limits<Integer>::max(); - const float qmin_float = qmin; - const float qmax_float = qmax; - int32 zero_point = 0; - float scale = 0.f; - // If rmin==rmax, both must be zero per the above assertion, - // so we are done. - if (rmin != rmax) { - // First determine the scale. - scale = (rmax - rmin) / (qmax_float - qmin_float); - - // Zero-point computation. - // First the initial floating-point computation. The zero-point can be - // determined from solving an affine equation for any known pair - // (real value, corresponding quantized value). - // We know two such pairs: (rmin, qmin) and (rmax, qmax). - // The arithmetic error on the zero point computed from either pair - // will be roughly machine_epsilon * (sum of absolute values of terms) - // so we want to use the variant that adds the smaller terms. - const float zero_point_from_min = qmin_float - rmin / scale; - const float zero_point_from_max = qmax_float - rmax / scale; - const float zero_point_from_min_error = - std::abs(qmin_float) + std::abs(rmin / scale); - const float zero_point_from_max_error = - std::abs(qmax_float) + std::abs(rmax / scale); - - const float zero_point_float = - zero_point_from_min_error < zero_point_from_max_error - ? zero_point_from_min - : zero_point_from_max; - - // Now we need to nudge the zero point to be an integer - // (our zero points are integer, and this is motivated by the requirement - // to be able to represent the real value "0" exactly as a quantized value, - // which is required in multiple places, for example in Im2col with SAME - // padding). - if (zero_point_float < qmin_float) { - zero_point = qmin; - } else if (zero_point_float > qmax_float) { - zero_point = qmax; - } else { - zero_point = static_cast<int32>(std::round(zero_point_float)); - } - // The zero point should always be in the range of quantized value, - // [qmin, qmax]. 
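As a worked instance of the computation above (range values hypothetical): for rmin = -1.0f and rmax = 3.0f on uint8 [0, 255], scale = 4.0 / 255 ~ 0.0157; zero_point_from_min = 0 - (-1.0 / scale) = 63.75 and zero_point_from_max = 255 - 3.0 / scale = 63.75 agree exactly, and rounding nudges the zero point to 64, so the real value 0 is represented exactly by the quantized value 64.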
- DCHECK_GE(zero_point, qmin); - DCHECK_LE(zero_point, qmax); - } - - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int height = MatchingArraySize(input_dims, 2, output_dims, 2); - const int width = MatchingArraySize(input_dims, 1, output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - const float src_val = input_data[Offset(input_dims, c, x, y, b)]; - const float unclamped_quantized_val = - std::round(zero_point + src_val / scale); - const float quantized_val = std::min( - qmax_float, std::max(qmin_float, unclamped_quantized_val)); - const float dst_val = scale * (quantized_val - zero_point); - output_data[Offset(output_dims, c, x, y, b)] = dst_val; - } - } - } - } -} - -template <typename SrcT, typename DstT> -inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, - DstT* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Cast"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); - output_map.array() = input_map.array().template cast<DstT>(); -} - -inline void Floor(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Floor"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); - output_map.array() = Eigen::floor(input_map.array()); -} - -template <typename T> -inline void Gather(const T* input_data, const Dims<4>& input_dims, - const int32* coords_data, const Dims<4>& coords_dims, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Gather"); - DCHECK_EQ(RequiredBufferSizeForDims(output_dims), - RequiredBufferSizeForDims(coords_dims)); - for (int i = 0; i < RequiredBufferSizeForDims(coords_dims); i++) { - DCHECK_GE(coords_data[i], 0); - DCHECK_LT(coords_data[i], RequiredBufferSizeForDims(input_dims)); - output_data[i] = input_data[coords_data[i]]; - } -} - -inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, - const int32* output_size_data, - const Dims<4>& output_size_dims, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("ResizeBilinear"); - int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); - int32 input_height = ArraySize(input_dims, 2); - int32 input_width = ArraySize(input_dims, 1); - int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - DCHECK_EQ(ArraySize(output_size_dims, 3), 1); - DCHECK_EQ(ArraySize(output_size_dims, 2), 1); - DCHECK_EQ(ArraySize(output_size_dims, 1), 1); - DCHECK_EQ(ArraySize(output_size_dims, 0), 2); - int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)]; - int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)]; - float height_scale = static_cast<float>(input_height) / output_height; - float width_scale = static_cast<float>(input_width) / output_width; - - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < output_height; ++y) { - float input_y = y * height_scale; - int32 y0 = static_cast<int32>(input_y); - int32 y1 = std::min(y0 + 1, input_height - 1); - for (int x = 0; x < output_width; ++x) { - float input_x = x * width_scale; - int32 x0 = static_cast<int32>(input_x); - int32 x1 = std::min(x0 + 1, 
input_width - 1); - for (int c = 0; c < depth; ++c) { - float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] * - (1 - (input_y - y0)) * - (1 - (input_x - x0)) + - input_data[Offset(input_dims, c, x0, y1, b)] * - (input_y - y0) * (1 - (input_x - x0)) + - input_data[Offset(input_dims, c, x1, y0, b)] * - (1 - (input_y - y0)) * (input_x - x0) + - input_data[Offset(input_dims, c, x1, y1, b)] * - (input_y - y0) * (input_x - x0); - output_data[Offset(output_dims, c, x, y, b)] = interpolation; - } - } - } - } -} - -} // namespace optimized_ops -} // namespace rt -} // namespace nnfw - -#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS -#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS -#pragma GCC diagnostic pop -#endif - -#endif // __NNFW_RT_OPTIMIZED_OPS_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h b/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h deleted file mode 100644 index bf659d0a3..000000000 --- a/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (C) 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_RT_TENSOR_UTILS_IMPL_H__ -#define __NNFW_RT_TENSOR_UTILS_IMPL_H__ - -#include "ActivationFunctor.h" - -#ifndef USE_NEON -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#define USE_NEON -#endif // defined(__ARM_NEON__) || defined(__ARM_NEON) -#endif // USE_NEON - -namespace nnfw { -namespace rt { -namespace tensor_utils { - -// Multiply a matrix by a batch vector, and store results in a batch-size -// vector. -void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix, - int m_rows, int m_cols, - const float* vector, - int n_batch, float* result, - int result_stride); -void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, - int m_cols, const float* vector, - int n_batch, float* result, - int result_stride); - -// Cwise product of two vectors. -void PortableVectorVectorCwiseProduct(const float* vector1, - const float* vector2, int v_size, - float* result); -void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2, - int v_size, float* result); - -// Cwise product and accumulate of two vectors. Since it's a MAC operation, the -// assumption here is that result array is initialized to valid values. -void PortableVectorVectorCwiseProductAccumulate(const float* vector1, - const float* vector2, - int v_size, float* result); -void NeonVectorVectorCwiseProductAccumulate(const float* vector1, - const float* vector2, int v_size, - float* result); - -// Dot product of two vectors. -float PortableVectorVectorDotProduct(const float* vector1, const float* vector2, - int v_size); - -// Dot product of two batch vectors. 
-void PortableBatchVectorBatchVectorDotProduct(const float* vector1, - const float* vector2, int v_size, - int n_batch, float* result, - int result_stride); - -// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC -// operation, the assumption here is that the result array is initialized to -// valid values. -void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, - int v_size, - const float* batch_vector, - int n_batch, - float* result); -void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, - int v_size, - const float* batch_vector, - int n_batch, float* result); - -// Compute "1.0f - elements of vector" (used in CIFG). -void PortableSub1Vector(const float* vector, int v_size, float* result); -void NeonSub1Vector(const float* vector, int v_size, float* result); - -// Clip elements of a vector using an abs_limit value. -void PortableClipVector(const float* vector, int v_size, float abs_limit, - float* result); -void NeonClipVector(const float* vector, int v_size, float abs_limit, - float* result); - -// Batch vector initialization with another vector. -void PortableVectorBatchVectorAssign(const float* vector, int v_size, - int n_batch, float* batch_vector); - -// Apply sigmoid to elements of a vector. -void PortableApplySigmoidToVector(const float* vector, int v_size, - float* result); - -// Apply activation function to elements of a vector. -void PortableApplyActivationToVector(const float* vector, int v_size, - ActivationFn activation, - float* result); - -// Copy vector to another vector. -void PortableCopyVector(const float* vector, int v_size, float* result); - -// Fill vector with 0.f. -void PortableZeroVector(float* vector, int v_size); - -// Limit a float input f between +abs_limit and -abs_limit. -float PortableClip(float f, float abs_limit); - -// Shift a vector of size v_size left in place. -void PortableVectorShiftLeft(float* vector, int v_size, float shift_value); - -// Reduce-sum on a float input vector: -// input_vector: float pointer to input vector. -// input_stride: input vector stride. -// output_vector: float pointer to the output vector. -// output_size: output vector size. -// reduction_size: number of consecutive elements from input vector which are -// added to get one element of output. -void PortableReductionSumVector(const float* input_vector, int input_stride, - float* output_vector, int output_size, - int reduction_size); -} // namespace tensor_utils -} // namespace rt -} // namespace nnfw - -#endif // __NNFW_RT_TENSOR_UTILS_IMPL_H__ diff --git a/runtimes/nn/common/operations/internal/tensor_utils.cc b/runtimes/nn/common/operations/internal/tensor_utils.cc deleted file mode 100644 index 78275bb29..000000000 --- a/runtimes/nn/common/operations/internal/tensor_utils.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "tensor_utils.h" - -#ifndef USE_NEON -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#define USE_NEON -#endif // defined(__ARM_NEON__) || defined(__ARM_NEON) -#endif // USE_NEON - -#ifdef USE_NEON -#include "optimized/neon_tensor_utils.h" -#else -#include "reference/portable_tensor_utils.h" -#endif // USE_NEON diff --git a/runtimes/nn/common/operations/internal/tensor_utils.h b/runtimes/nn/common/operations/internal/tensor_utils.h deleted file mode 100644 index df3d4e27b..000000000 --- a/runtimes/nn/common/operations/internal/tensor_utils.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_RT_TENSOR_UTILS_H__ -#define __NNFW_RT_TENSOR_UTILS_H__ - -#include "ActivationFunctor.h" - -namespace nnfw { -namespace rt { -namespace tensor_utils { - -// Limit a float input f between +abs_limit and -abs_limit. -float Clip(float f, float abs_limit); - -// Multiply a matrix by a batch vector, and store results in a batch-size -// vector using a stride value provided in result_stride. 'result_stride' gives -// the number of elements between consecutive result values. For example, -// result_stride = 1 will cause the output to look like this: -// [O_1, O_2, ... O_rows] in memory, but result_stride = 3 will cause it to be -// arranged like this in memory: [O_1, x, x, O_2, x, x, ..., O_rows] -void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, - int m_cols, const float* vector, - int n_batch, float* result, - int result_stride); - -// Cwise product of two vectors. -void VectorVectorCwiseProduct(const float* vector1, const float* vector2, - int v_size, float* result); - -// Cwise product and accumulate of two vectors. Since it's a MAC operation, the -// assumption here is that the result array is initialized to valid values. -void VectorVectorCwiseProductAccumulate(const float* vector1, - const float* vector2, int v_size, - float* result); - -// Dot product of two vectors. -float VectorVectorDotProduct(const float* vector1, const float* vector2, - int v_size); - -// Dot product of two batch vectors of size n_batch * v_size: -// vector1 = [x_1_1, x_1_2, ..., x_1_vsize, -// x_2_1, x_2_2, ..., x_2_vsize, -// ... -// x_nbatch_1,..., x_nbatch_vsize] -// vector2 = [y_1_1, y_1_2, ..., y_1_vsize, -// y_2_1, y_2_2, ..., y_2_vsize, -// ... -// y_nbatch_1,..., y_nbatch_vsize] -// Then result will be a vector of n_batch size which will be saved with a -// stride of result_stride in memory starting from 'result': -// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize, -// x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize, -// ... -// x_nbatch_1 * y_nbatch_1 + ...
+ x_nbatch_vsize * y_nbatch_vsize] -void BatchVectorBatchVectorDotProduct(const float* vector1, - const float* vector2, int v_size, - int n_batch, float* result, - int result_stride); - -// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC -// operation, the assumption here is that the result array is initialized to -// valid values. -void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, - const float* batch_vector, - int n_batch, float* result); - -// Batch vector initialization with another vector. -void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, - float* batch_vector); - -// Apply sigmoid to elements of a vector. -void ApplySigmoidToVector(const float* vector, int v_size, float* result); - -// Apply activation function to elements of a vector. -void ApplyActivationToVector(const float* vector, int v_size, - ActivationFn activation, float* result); - -// Copy vector to another vector. -void CopyVector(const float* vector, int v_size, float* result); - -// Compute "1.0f - elements of vector" (used in CIFG). -void Sub1Vector(const float* vector, int v_size, float* result); - -// Fill vector with 0.f. -void ZeroVector(float* vector, int v_size); - -// Clip elements of a vector using an abs_limit value. -void ClipVector(const float* vector, int v_size, float abs_limit, - float* result); - -// Shift a vector of size v_size left in place. -void VectorShiftLeft(float* vector, int v_size, float shift_value); - -// Reduce-sum on a float input vector: -// input_vector: float pointer to input vector. -// input_stride: input vector stride. -// output_vector: float pointer to the output vector. -// output_size: output vector size. -// reduction_size: number of consecutive elements from input vector which are -// added to get one element of output. -void ReductionSumVector(const float* input_vector, int input_stride, - float* output_vector, int output_size, - int reduction_size); -} // namespace tensor_utils -} // namespace rt -} // namespace nnfw - -#endif // __NNFW_RT_TENSOR_UTILS_H__ diff --git a/runtimes/nn/common/operations/internal/tensor_utils_test.cc b/runtimes/nn/common/operations/internal/tensor_utils_test.cc deleted file mode 100644 index b68982164..000000000 --- a/runtimes/nn/common/operations/internal/tensor_utils_test.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2017 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "gmock/gmock-matchers.h" -#include "gtest/gtest.h" -#include "tensor_utils.h" - -namespace nnfw { -namespace rt { -namespace tensor_utils { - -namespace { - -using ::testing::FloatNear; -using ::testing::Matcher; - -std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values, - float max_abs_error=1.e-6) { - std::vector<Matcher<float>> matchers; - matchers.reserve(values.size()); - for (const float& v : values) { - matchers.emplace_back(FloatNear(v, max_abs_error)); - } - return matchers; -} - -} // anonymous namespace - -TEST(uKernels, ClipTest) { - constexpr int kVectorSize = 10; - constexpr float kAbsLimit = 2.0; - static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, - -2.5, 3.0, -3.5, 4.0, -4.5}; - std::vector<float> output(kVectorSize); - ClipVector(input, kVectorSize, kAbsLimit, output.data()); - EXPECT_THAT(output, - ElementsAreArray(ArrayFloatNear( - {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0}))); -} - -TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) { - constexpr int kRow = 3; - constexpr int kCol = 4; - constexpr int kBatch = 2; - static float matrix[kRow * kCol] = {1.0, 2.0, 3.0, 4.0, // - -1.0, -2.0, -3.0, -4.0, // - 1.0, -2.0, 3.0, -4.0}; - static float vector[kCol * kBatch] = {1.0, -1.0, 1.0, -1.0, // - 2.0, -2.0, 2.0, -2.0}; - std::vector<float> output(kRow * kBatch); - std::fill(output.begin(), output.end(), 3.0); - MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch, - output.data(), /*result_stride=*/1); - EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({1., 5., 13., // - -1., 7., 23.}))); -} - -TEST(uKernels, VectorVectorCwiseProductTest) { - constexpr int kVectorSize = 10; - static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, - -2.5, 3.0, -3.5, 4.0, -4.5}; - static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1, - -0.1, 0.1, -0.1, 0.1, -0.1}; - std::vector<float> output(kVectorSize); - VectorVectorCwiseProduct(input1, input2, kVectorSize, output.data()); - EXPECT_THAT(output, - ElementsAreArray(ArrayFloatNear( - {0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45}))); -} - -TEST(uKernels, VectorVectorCwiseProductAccumulateTest) { - constexpr int kVectorSize = 10; - static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, - -2.5, 3.0, -3.5, 4.0, -4.5}; - static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1, - -0.1, 0.1, -0.1, 0.1, -0.1}; - std::vector<float> output(kVectorSize); - std::fill(output.begin(), output.end(), 1.0); - VectorVectorCwiseProductAccumulate(input1, input2, kVectorSize, - output.data()); - EXPECT_THAT(output, - ElementsAreArray(ArrayFloatNear( - {1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45}))); -} - -TEST(uKernels, VectorBatchVectorAssignTest) { - constexpr int kVectorSize = 5; - constexpr int kBatchSize = 3; - static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; - std::vector<float> output(kVectorSize * kBatchSize); - VectorBatchVectorAssign(input, kVectorSize, kBatchSize, output.data()); - EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( - {0.0, -0.5, 1.0, -1.5, 2.0, 0.0, -0.5, 1.0, -1.5, 2.0, - 0.0, -0.5, 1.0, -1.5, 2.0}))); -} - -TEST(uKernels, ApplySigmoidToVectorTest) { - constexpr int kVectorSize = 5; - static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; - std::vector<float> output(kVectorSize); - ApplySigmoidToVector(input, kVectorSize, output.data()); - EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( - {0.5, 0.377541, 0.731059, 0.182426, 0.880797}))); -} - -TEST(uKernels, 
-TEST(uKernels, ApplyActivationToVectorTest) {
-  constexpr int kVectorSize = 5;
-  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
-  std::vector<float> output(kVectorSize);
-  ApplyActivationToVector(input, kVectorSize, kActivationRelu, output.data());
-  EXPECT_THAT(output,
-              ElementsAreArray(ArrayFloatNear({0.0, 0.0, 1.0, 0.0, 2.0})));
-
-  ApplyActivationToVector(input, kVectorSize, kActivationTanh, output.data());
-  EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
                          {0.0, -0.462117, 0.761594, -0.905148, 0.964028})));
-}
-
-TEST(uKernels, CopyVectorTest) {
-  constexpr int kVectorSize = 5;
-  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
-  std::vector<float> output(kVectorSize);
-  CopyVector(input, kVectorSize, output.data());
-  EXPECT_THAT(output,
-              ElementsAreArray(ArrayFloatNear({0.0, -0.5, 1.0, -1.5, 2.0})));
-}
-
-TEST(uKernels, Sub1VectorTest) {
-  constexpr int kVectorSize = 5;
-  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
-  std::vector<float> output(kVectorSize);
-  Sub1Vector(input, kVectorSize, output.data());
-  EXPECT_THAT(output,
-              ElementsAreArray(ArrayFloatNear({1.0, 1.5, 0.0, 2.5, -1.0})));
-}
-
-TEST(uKernels, ZeroVectorTest) {
-  constexpr int kVectorSize = 5;
-  std::vector<float> output(kVectorSize);
-  ZeroVector(output.data(), kVectorSize);
-  EXPECT_THAT(output,
-              ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0})));
-}
-
-TEST(uKernels, BatchVectorBatchVectorDotProductTest) {
-  constexpr int kVectorSize = 5;
-  constexpr int kBatch = 2;
-  static float input1[kVectorSize * kBatch] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
-                                               -2.5, 3.0,  -3.5, 4.0,  -4.5};
-  static float input2[kVectorSize * kBatch] = {0.1,  -0.1, 0.1,  -0.1, 0.1,
-                                               -0.1, 0.1,  -0.1, 0.1,  -0.1};
-  std::vector<float> output(kBatch);
-  BatchVectorBatchVectorDotProduct(input1, input2, kVectorSize, kBatch,
-                                   output.data(), /*result_stride=*/1);
-  EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({0.5, 1.75})));
-}
-
-TEST(uKernels, VectorShiftLeftTest) {
-  constexpr int kVectorSize = 5;
-  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
-  std::vector<float> result(kVectorSize);
-  VectorShiftLeft(input, kVectorSize, 3.0);
-  result.assign(input, input + kVectorSize);
-  EXPECT_THAT(result,
-              ElementsAreArray(ArrayFloatNear({-0.5, 1.0, -1.5, 2.0, 3.0})));
-}
-
-TEST(uKernels, ReductionSumVectorTest) {
-  constexpr int kInputVectorSize = 10;
-  constexpr int kOutputVectorSize = 5;
-  constexpr int kReductionSize = 2;
-  static float input[kInputVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
-                                          0.0, -0.5, 1.0, 1.0,  2.0};
-  std::vector<float> result(kOutputVectorSize);
-  ReductionSumVector(input,
-                     /*input_stride=*/1, result.data(), kOutputVectorSize,
-                     kReductionSize);
-  EXPECT_THAT(result,
-              ElementsAreArray(ArrayFloatNear({-0.5, -0.5, 2.0, 0.5, 3.0})));
-}
-
-} // namespace tensor_utils
-} // namespace rt
-} // namespace nnfw
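The VectorShiftLeftTest above fixes the semantics of VectorShiftLeft precisely, so a scalar reference form can be sketched. VectorShiftLeftRef is a hypothetical name for illustration only; the deleted runtime may well have used an optimized path instead.

// Illustrative scalar version of the VectorShiftLeft contract: every
// element moves one slot toward index 0 in place, and the vacated last
// slot takes shift_value. Matches the expectations in VectorShiftLeftTest.
void VectorShiftLeftRef(float* vector, int v_size, float shift_value) {
  for (int i = 0; i < v_size - 1; ++i) {
    vector[i] = vector[i + 1];
  }
  vector[v_size - 1] = shift_value;
}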
diff --git a/runtimes/nn/common/operations/internal/types.h b/runtimes/nn/common/operations/internal/types.h
deleted file mode 100644
index bd5880edd..000000000
--- a/runtimes/nn/common/operations/internal/types.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_TYPES_H__
-#define __NNFW_RT_TYPES_H__
-
-#include "compatibility.h"
-
-namespace nnfw {
-namespace rt {
-
-enum class FusedActivationFunctionType { kNone, kRelu6, kRelu1, kRelu };
-
-template <int N>
-struct Dims {
-  int sizes[N];
-  int strides[N];
-};
-
-struct Shape;
-
-inline Dims<4> convertShapeToDims(const Shape& shape) {
-  Dims<4> dims;
-  for (int i = 0; i < 4; i++) {
-    dims.sizes[i] = 1;
-  }
-
-  if (shape.dimensions.size() == 1) {
-    dims.sizes[0] = (int)getSizeOfDimension(shape, 0);
-  } else {
-    for (int i = 0; i < 4; i++) {
-      int src = (int)shape.dimensions.size() - i - 1;
-      if (src >= 0) {
-        dims.sizes[i] = (int)getSizeOfDimension(shape, src);
-      }
-    }
-  }
-
-  dims.strides[0] = 1;
-  for (int i = 1; i < 4; i++) {
-    dims.strides[i] = dims.strides[i - 1] * dims.sizes[i - 1];
-  }
-  return dims;
-}
-
-inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
-  DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
-  DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
-  DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
-  DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
-  return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
-         i3 * dims.strides[3];
-}
-
-// Get array size, DCHECKing that the dim index is in range.
-template <int N>
-int ArraySize(const Dims<N>& array, int index) {
-  DCHECK(index >= 0 && index < N);
-  return array.sizes[index];
-}
-
-// Get common array size, DCHECKing that they all agree.
-template <typename ArrayType1, typename ArrayType2>
-int MatchingArraySize(const ArrayType1& array1, int index1,
-                      const ArrayType2& array2, int index2) {
-  DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
-  return ArraySize(array1, index1);
-}
-
-template <typename ArrayType1, typename ArrayType2, typename... Args>
-int MatchingArraySize(const ArrayType1& array1, int index1,
-                      const ArrayType2& array2, int index2, Args... args) {
-  DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
-  return MatchingArraySize(array1, index1, args...);
-}
-
-inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
-  int max_offset = 0;
-  for (int i = 0; i < 4; i++) {
-    max_offset += (dims.sizes[i] - 1) * dims.strides[i];
-  }
-  return max_offset + 1;
-}
-
-template <int N>
-bool IsPackedWithoutStrides(const Dims<N>& dims) {
-  int expected_stride = 1;
-  for (int d = 0; d < N; d++) {
-    if (dims.strides[d] != expected_stride) return false;
-    expected_stride *= dims.sizes[d];
-  }
-  return true;
-}
-
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_TYPES_H__
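As a hedged sketch of how the pieces of types.h compose: the stride construction in convertShapeToDims (innermost dimension first, each stride the product of all inner sizes) is what makes Offset, RequiredBufferSizeForDims, and IsPackedWithoutStrides agree. The standalone main() below is illustrative only and assumes the types.h definitions above (and the DCHECK macros from compatibility.h) are available on the include path.

// Illustration only: exercises Dims<4>, Offset, RequiredBufferSizeForDims
// and IsPackedWithoutStrides as defined above.
#include <cassert>
#include "types.h"

int main() {
  nnfw::rt::Dims<4> dims;
  const int sizes[4] = {2, 3, 4, 5};  // index 0 is the innermost dimension
  int stride = 1;
  for (int i = 0; i < 4; ++i) {
    dims.sizes[i] = sizes[i];
    dims.strides[i] = stride;  // packed layout: each stride is the product
    stride *= sizes[i];        // of all inner sizes, as in convertShapeToDims
  }
  assert(nnfw::rt::IsPackedWithoutStrides(dims));
  // For a packed layout the required buffer is exactly the element count.
  assert(nnfw::rt::RequiredBufferSizeForDims(dims) == 2 * 3 * 4 * 5);
  // Offset advances i0 fastest; the strides here are {1, 2, 6, 24}.
  assert(nnfw::rt::Offset(dims, 1, 2, 3, 4) == 1 * 1 + 2 * 2 + 3 * 6 + 4 * 24);
  return 0;
}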