path: root/runtimes/nn/common/operations/internal
Diffstat (limited to 'runtimes/nn/common/operations/internal')
-rw-r--r--   runtimes/nn/common/operations/internal/common.h                         |   80
-rw-r--r--   runtimes/nn/common/operations/internal/compatibility.h                  |   57
-rw-r--r--   runtimes/nn/common/operations/internal/optimized/cpu_check.h            |   28
-rw-r--r--   runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h  |  792
-rw-r--r--   runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h  | 1606
-rw-r--r--   runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc   |  217
-rw-r--r--   runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h    |  119
-rw-r--r--   runtimes/nn/common/operations/internal/optimized/optimized_ops.h        | 2717
-rw-r--r--   runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h    |  133
-rw-r--r--   runtimes/nn/common/operations/internal/tensor_utils.cc                  |   29
-rw-r--r--   runtimes/nn/common/operations/internal/tensor_utils.h                   |  123
-rw-r--r--   runtimes/nn/common/operations/internal/tensor_utils_test.cc             |  198
-rw-r--r--   runtimes/nn/common/operations/internal/types.h                          |  112
13 files changed, 0 insertions(+), 6211 deletions(-)
diff --git a/runtimes/nn/common/operations/internal/common.h b/runtimes/nn/common/operations/internal/common.h
deleted file mode 100644
index 1bf1050fd..000000000
--- a/runtimes/nn/common/operations/internal/common.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_COMMON_H__
-#define __NNFW_RT_COMMON_H__
-
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#include <arm_neon.h>
-#endif
-#endif
-
-#include "gemmlowp.h"
-#include "types.h"
-
-namespace nnfw {
-namespace rt {
-
-template <FusedActivationFunctionType Ac>
-struct ActivationFunctionImpl {};
-
-template <>
-struct ActivationFunctionImpl<FusedActivationFunctionType::kNone> {
- static float Eval(float x) { return x; }
-};
-
-template <>
-struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu> {
- static float Eval(float x) { return x < 0.f ? 0.f : x; }
-};
-
-template <>
-struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu1> {
- static float Eval(float x) { return x > 1.f ? 1.f : x < -1.f ? -1.f : x; }
-};
-
-template <>
-struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu6> {
- static float Eval(float x) { return x > 6.f ? 6.f : x < 0.f ? 0.f : x; }
-};
-
-template <FusedActivationFunctionType Ac>
-float ActivationFunction(float x) {
- return ActivationFunctionImpl<Ac>::Eval(x);
-}
-
-inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
- int32 x, int32 quantized_multiplier, int right_shift) {
- using gemmlowp::RoundingDivideByPOT;
- using gemmlowp::SaturatingRoundingDoublingHighMul;
- return RoundingDivideByPOT(
- SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
-}
-
-inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
- int32 x, int32 quantized_multiplier, int left_shift) {
- using gemmlowp::SaturatingRoundingDoublingHighMul;
- return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
- quantized_multiplier);
-}
-
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_COMMON_H__
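
The two quantized-multiplier helpers in the deleted common.h delegate to gemmlowp's fixed-point routines. As a rough guide to what they compute, here is a plain-C++ sketch of the same arithmetic, written against gemmlowp's documented reference behaviour rather than taken from the deleted header: SaturatingRoundingDoublingHighMul keeps the high 32 bits of a doubled 32x32-bit product with rounding, and RoundingDivideByPOT is a rounding arithmetic right shift.

    #include <cstdint>
    #include <limits>

    // Rounded, doubled high-half multiply; saturates only on the single
    // overflowing case INT32_MIN * INT32_MIN.
    inline std::int32_t SaturatingRoundingDoublingHighMul(std::int32_t a, std::int32_t b) {
      const bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
      const std::int64_t ab_64 = static_cast<std::int64_t>(a) * b;
      const std::int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
      const std::int32_t ab_x2_high32 =
          static_cast<std::int32_t>((ab_64 + nudge) / (1ll << 31));
      return overflow ? std::numeric_limits<std::int32_t>::max() : ab_x2_high32;
    }

    // Rounding arithmetic right shift (round half away from zero).
    inline std::int32_t RoundingDivideByPOT(std::int32_t x, int exponent) {
      const std::int32_t mask = (1ll << exponent) - 1;
      const std::int32_t remainder = x & mask;
      const std::int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
      return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    // Equivalent of the deleted helper: scale a 32-bit accumulator by a real
    // multiplier < 1, encoded as a Q0.31 value plus a right shift.
    inline std::int32_t MultiplyByQuantizedMultiplierSmallerThanOne(
        std::int32_t x, std::int32_t quantized_multiplier, int right_shift) {
      return RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
    }

For example, a real scale of 0.25 can be encoded as quantized_multiplier = 1 << 30 (0.5 in Q0.31) together with right_shift = 1.
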
diff --git a/runtimes/nn/common/operations/internal/compatibility.h b/runtimes/nn/common/operations/internal/compatibility.h
deleted file mode 100644
index fd33cbd97..000000000
--- a/runtimes/nn/common/operations/internal/compatibility.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_COMPATIBILITY_H__
-#define __NNFW_RT_COMPATIBILITY_H__
-
-#include <cassert>
-#include <cstdint>
-
-#ifndef DCHECK
-#define DCHECK(condition) (condition) ? (void)0 : assert(false)
-#endif
-
-#ifndef DCHECK_EQ
-#define DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
-#endif
-
-#ifndef DCHECK_GE
-#define DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
-#endif
-
-#ifndef DCHECK_GT
-#define DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false)
-#endif
-
-#ifndef DCHECK_LE
-#define DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false)
-#endif
-
-#ifndef DCHECK_LT
-#define DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false)
-#endif
-
-#ifndef CHECK_EQ
-#define CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
-#endif
-
-using uint8 = std::uint8_t;
-using int16 = std::int16_t;
-using uint16 = std::uint16_t;
-using int32 = std::int32_t;
-using uint32 = std::uint32_t;
-
-#endif // __NNFW_RT_COMPATIBILITY_H__
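
The DCHECK family in the deleted compatibility.h is a thin stand-in for the glog-style macros this code was originally written against: each one expands to a bare ternary ending in assert(false), so it only fires in builds where assert() is active (NDEBUG not defined), and unlike the glog versions it does not support streaming an extra message. A minimal usage sketch follows; SumOfDims is a hypothetical helper invented for the example.

    #include <cassert>
    #include <cstdint>

    #ifndef DCHECK_EQ
    #define DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
    #endif

    using int32 = std::int32_t;

    int32 SumOfDims(const int32* dims, int rank) {
      DCHECK_EQ(rank, 4);  // aborts in debug builds if rank != 4, no-op under NDEBUG
      int32 sum = 0;
      for (int i = 0; i < rank; ++i) sum += dims[i];
      return sum;
    }

Because the expansion is a bare ternary, these macros are meant to be used as standalone statements, as above.
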
diff --git a/runtimes/nn/common/operations/internal/optimized/cpu_check.h b/runtimes/nn/common/operations/internal/optimized/cpu_check.h
deleted file mode 100644
index 02f42fd42..000000000
--- a/runtimes/nn/common/operations/internal/optimized/cpu_check.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_
-#define FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_
-
-// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if NEON is
-// enabled at build time, or PortableSomeFunc(args) otherwise.
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__)
-#else
-#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
-#endif
-
-#endif // FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_
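
NEON_OR_PORTABLE resolves the dispatch purely at preprocessing time, so callers must provide both variants of a routine following the Neon/Portable naming convention. A small illustrative sketch (the AddVectors functions are made up for this example, not part of the runtime):

    #if defined(__ARM_NEON__) || defined(__ARM_NEON)
    #include <arm_neon.h>
    #define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__)
    #else
    #define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
    #endif

    // Scalar fallback, always available.
    void PortableAddVectors(const float* a, const float* b, float* out, int n) {
      for (int i = 0; i < n; ++i) out[i] = a[i] + b[i];
    }

    #if defined(__ARM_NEON__) || defined(__ARM_NEON)
    // NEON variant, compiled only when the intrinsics are available.
    void NeonAddVectors(const float* a, const float* b, float* out, int n) {
      int i = 0;
      for (; i <= n - 4; i += 4) {
        vst1q_f32(out + i, vaddq_f32(vld1q_f32(a + i), vld1q_f32(b + i)));
      }
      for (; i < n; ++i) out[i] = a[i] + b[i];
    }
    #endif

    void AddVectors(const float* a, const float* b, float* out, int n) {
      // Becomes NeonAddVectors(...) on NEON builds, PortableAddVectors(...) otherwise.
      NEON_OR_PORTABLE(AddVectors, a, b, out, n);
    }
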
diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h
deleted file mode 100644
index 5c05bf20f..000000000
--- a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__
-#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__
-
-#include "gemmlowp.h"
-#include "../common.h"
-#include "../types.h"
-
-namespace nnfw {
-namespace rt {
-namespace optimized_ops {
-
-// Implementation of float DepthwiseConv
-
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-struct FloatDepthwiseConvKernel {};
-
-#ifdef USE_NEON
-
-template <>
-struct FloatDepthwiseConvKernel<false, 8, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const float* input_ptr, int input_ptr_increment,
- const float* filter_ptr, float* acc_buffer_ptr) {
- // Load the filters
- float32x4_t filter[2];
- for (int i = 0; i < 2; i++) {
- filter[i] = vld1q_f32(filter_ptr + 4 * i);
- }
- int outp = 0;
- // Handle 2 output pixels at a time.
- for (; outp <= num_output_pixels - 2; outp += 2) {
- // Load the inputs
- float32x4_t input[4];
- for (int i = 0; i < 4; i++) {
- input[i] = vld1q_f32(input_ptr + 4 * i);
- }
- input_ptr += 16;
- // Load the accumulators from acc_buffer
- float32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
- acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
- acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
- acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the inputs
- float32x4_t input[2];
- for (int i = 0; i < 2; i++) {
- input[i] = vld1q_f32(input_ptr + 4 * i);
- }
- input_ptr += 8;
- // Load the accumulators from acc_buffer
- float32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<false, 2, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const float* input_ptr, int input_ptr_increment,
- const float* filter_ptr, float* acc_buffer_ptr) {
- const float32x2_t filters = vld1_f32(filter_ptr);
- const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
- int outp = 0;
- // Handle 8 output pixels at a time.
- for (; outp <= num_output_pixels - 8; outp += 8) {
- // Load the inputs
- float32x4_t input[4];
- for (int i = 0; i < 4; i++) {
- input[i] = vld1q_f32(input_ptr + 4 * i);
- }
- input_ptr += 16;
- // Load the accumulators from acc_buffer
- float32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 4; i++) {
- acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle 4 output pixels at a time.
- for (; outp <= num_output_pixels - 4; outp += 4) {
- // Load the inputs
- float32x4_t input[2];
- for (int i = 0; i < 2; i++) {
- input[i] = vld1q_f32(input_ptr + 4 * i);
- }
- input_ptr += 8;
- // Load the accumulators from acc_buffer
- float32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- // Handle 2 output pixels at a time.
- for (; outp <= num_output_pixels - 2; outp += 2) {
- // Load the inputs
- const float32x4_t input = vld1q_f32(input_ptr);
- input_ptr += 4;
- // Load the accumulators from acc_buffer
- float32x4_t acc = vld1q_f32(acc_buffer_ptr);
- // Multiply-accumulate
- acc = vmlaq_f32(acc, input, filters_dup2);
- // Store the accumulators back to acc_buffer
- vst1q_f32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 4;
- }
- // Handle 1 output pixel at a time
- for (; outp < num_output_pixels; outp++) {
- // Load the inputs
- const float32x2_t input = vld1_f32(input_ptr);
- input_ptr += 2;
- // Load the accumulators from acc_buffer
- float32x2_t acc = vld1_f32(acc_buffer_ptr);
- // Multiply-accumulate
- acc = vmla_f32(acc, input, filters);
- // Store the accumulators back to acc_buffer
- vst1_f32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 2;
- }
- }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const float* input_ptr, int input_ptr_increment,
- const float* filter_ptr, float* acc_buffer_ptr) {
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- const float* local_filter_ptr = filter_ptr;
- const float* local_input_ptr = input_ptr;
- int ic = 0;
- // Handle 16 input channels at a time.
- for (; ic <= input_depth - 16; ic += 16) {
- // Load the filters
- float32x4_t filter[4];
- for (int i = 0; i < 4; i++) {
- filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
- }
- local_filter_ptr += 16;
- // Load the inputs
- float32x4_t input[4];
- for (int i = 0; i < 4; i++) {
- input[i] = vld1q_f32(local_input_ptr + 4 * i);
- }
- local_input_ptr += 16;
- // Load the accumulators from acc_buffer
- float32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 4; i++) {
- acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle 4 input channels at a time.
- for (; ic <= input_depth - 4; ic += 4) {
- // Load the filters
- float32x4_t filter;
- filter = vld1q_f32(local_filter_ptr);
- local_filter_ptr += 4;
- // Load the inputs
- float32x4_t input;
- input = vld1q_f32(local_input_ptr);
- local_input_ptr += 4;
- // Load the accumulators from acc_buffer
- float32x4_t acc;
- acc = vld1q_f32(acc_buffer_ptr);
- // Multiply-accumulate
- acc = vmlaq_f32(acc, input, filter);
- // Store the accumulators back to acc_buffer
- vst1q_f32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 4;
- }
- // Handle one input channel at a time.
- for (; ic < input_depth; ic++) {
- const float input_val = *local_input_ptr++;
- const float filter_val = *local_filter_ptr++;
- *acc_buffer_ptr++ += filter_val * input_val;
- }
- input_ptr += input_ptr_increment;
- }
- }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 8> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const float* input_ptr, int input_ptr_increment,
- const float* filter_ptr, float* acc_buffer_ptr) {
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- const float* local_filter_ptr = filter_ptr;
- const float* local_input_ptr = input_ptr;
- int ic = 0;
- // Handle 2 input channels at a time.
- for (; ic <= input_depth - 2; ic += 2) {
- // Load the filters
- float32x4_t filter[4];
- for (int i = 0; i < 4; i++) {
- filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
- }
- local_filter_ptr += 16;
- // Load the inputs
- const float32x2_t input = vld1_f32(local_input_ptr);
- local_input_ptr += 2;
- // Load the accumulators from acc_buffer
- float32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
- acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
- acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
- acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one input channel at a time.
- for (; ic < input_depth; ic++) {
- // Load the filters
- float32x4_t filter[2];
- for (int i = 0; i < 2; i++) {
- filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
- }
- local_filter_ptr += 8;
- // Load the inputs
- const float input_val = *local_input_ptr++;
- // Load the accumulators from acc_buffer
- float32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- input_ptr += input_ptr_increment;
- }
- }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 2> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const float* input_ptr, int input_ptr_increment,
- const float* filter_ptr, float* acc_buffer_ptr) {
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- const float* local_filter_ptr = filter_ptr;
- const float* local_input_ptr = input_ptr;
- int ic = 0;
- // Handle 8 input channels at a time.
- for (; ic <= input_depth - 8; ic += 8) {
- // Load the filters
- float32x4_t filter[4];
- for (int i = 0; i < 4; i++) {
- filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
- }
- local_filter_ptr += 16;
- // Load the inputs
- float32x4x2_t input_dup2[2];
- for (int i = 0; i < 2; i++) {
- const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
- input_dup2[i] = vzipq_f32(input, input);
- }
- local_input_ptr += 8;
- // Load the accumulators from acc_buffer
- float32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
- acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
- acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
- acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle 4 input channels at a time.
- for (; ic <= input_depth - 4; ic += 4) {
- // Load the filters
- float32x2_t filter[4];
- for (int i = 0; i < 4; i++) {
- filter[i] = vld1_f32(local_filter_ptr + 2 * i);
- }
- local_filter_ptr += 8;
- // Load the inputs
- const float32x4_t input = vld1q_f32(local_input_ptr);
- local_input_ptr += 4;
- // Load the accumulators from acc_buffer
- float32x2_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
- }
- // Multiply-accumulate
- acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
- acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
- acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
- acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- // Handle 2 input channels at a time.
- for (; ic <= input_depth - 2; ic += 2) {
- // Load the filters
- const float32x4_t filter = vld1q_f32(local_filter_ptr);
- local_filter_ptr += 4;
- // Load the inputs
- const float32x2_t input = vld1_f32(local_input_ptr);
- local_input_ptr += 2;
- // Load the accumulators from acc_buffer
- float32x2_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
- }
- // Multiply-accumulate
- acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
- acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
- }
- acc_buffer_ptr += 4;
- }
- // Handle one input channel at a time.
- for (; ic < input_depth; ic++) {
- // Load the inputs
- const float input_val = *local_input_ptr++;
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
- }
- local_filter_ptr += 2;
- acc_buffer_ptr += 2;
- }
- input_ptr += input_ptr_increment;
- }
- }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 1, 8> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const float* input_ptr, int input_ptr_increment,
- const float* filter_ptr, float* acc_buffer_ptr) {
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- // Load the filters
- float32x4_t filter[2];
- for (int i = 0; i < 2; i++) {
- filter[i] = vld1q_f32(filter_ptr + 4 * i);
- }
- // Load the inputs
- const float input_val = *input_ptr;
- input_ptr += input_ptr_increment;
- // Load the accumulators from acc_buffer
- float32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 16> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const float* input_ptr, int input_ptr_increment,
- const float* filter_ptr, float* acc_buffer_ptr) {
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- const float* local_filter_ptr = filter_ptr;
- const float* local_input_ptr = input_ptr;
- for (int ic = 0; ic < input_depth; ic++) {
- // Load the filters
- float32x4_t filter[4];
- for (int i = 0; i < 4; i++) {
- filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
- }
- local_filter_ptr += 16;
- // Load the inputs
- const float input_val = *local_input_ptr++;
- // Load the accumulators from acc_buffer
- float32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 4; i++) {
- acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- input_ptr += input_ptr_increment;
- }
- }
-};
-#endif
-
-// Accumulates the effect of one row of the filter, on a segment of one row
-// of the output, accessing the corresponding one row of the input.
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
- const float* input_data, int pad_width,
- int depth_multiplier, int filter_width,
- const float* filter_data,
- int out_x_buffer_start, int out_x_buffer_end,
- int output_depth, float* acc_buffer) {
-#ifdef GEMMLOWP_PROFILING
- gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
-#endif
- // Sanity check parameters. This is important in particular to ensure
- // that we keep the number of template instantiations minimal, so we don't
- // increase binary size unnecessarily.
- static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
- static_assert(kFixedInputDepth || kAllowStrided, "");
- DCHECK(stride == 1 || kAllowStrided);
- if (kFixedInputDepth) {
- DCHECK_EQ(input_depth, kFixedInputDepth);
- }
- if (kFixedDepthMultiplier) {
- DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
- }
- DCHECK_EQ(output_depth, input_depth * depth_multiplier);
- const int input_ptr_increment = stride * input_depth;
- const float* filter_base_ptr = filter_data;
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- // For the current (filter_x, filter_y) point in the filter,
- // compute the boundaries of the corresponding output row segment.
- int out_x_loop_start_unclampled = 0;
- int out_x_loop_end_unclampled = 0;
- if (kAllowStrided) {
- if (stride == 2) {
- out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
- out_x_loop_end_unclampled =
- (pad_width + input_width - filter_x + 1) / 2;
- } else if (stride == 4) {
- out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
- out_x_loop_end_unclampled =
- (pad_width + input_width - filter_x + 3) / 4;
- } else {
- out_x_loop_start_unclampled =
- (pad_width - filter_x + stride - 1) / stride;
- out_x_loop_end_unclampled =
- (pad_width + input_width - filter_x + stride - 1) / stride;
- }
- } else {
- out_x_loop_start_unclampled = pad_width - filter_x;
- out_x_loop_end_unclampled = pad_width + input_width - filter_x;
- }
- // The kernel will have to iterate on the segment of the
- // output row that starts at out_x_loop_start and out_x_loop_end.
- const int out_x_loop_start =
- std::max(out_x_buffer_start, out_x_loop_start_unclampled);
- const int out_x_loop_end =
- std::min(out_x_buffer_end, out_x_loop_end_unclampled);
-
- float* acc_buffer_ptr =
- acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
- const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
- const float* input_ptr = input_data + in_x_origin * input_depth;
- const int num_output_pixels = out_x_loop_end - out_x_loop_start;
- FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
- kFixedDepthMultiplier>::Run(num_output_pixels,
- input_depth,
- depth_multiplier,
- input_ptr,
- input_ptr_increment,
- filter_base_ptr,
- acc_buffer_ptr);
- filter_base_ptr += output_depth;
- }
-}
-
-// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
-inline void FloatDepthwiseConvAccumRowGeneric(
- int stride, int input_depth, int input_width, const float* input_data,
- int pad_width, int depth_multiplier, int filter_width,
- const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
- int output_depth, float* acc_buffer) {
- gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
- const float* filter_base_ptr = filter_data;
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- const int out_x_loop_start = std::max(
- out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
- const int out_x_loop_end =
- std::min(out_x_buffer_end,
- (pad_width + input_width - filter_x + stride - 1) / stride);
-
- float* acc_buffer_ptr =
- acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
- const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
- const float* input_ptr = input_data + in_x_origin * input_depth;
- const int input_ptr_increment = (stride - 1) * input_depth;
- for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
- const float* filter_ptr = filter_base_ptr;
- for (int ic = 0; ic < input_depth; ++ic) {
- const float input_val = *input_ptr++;
- for (int m = 0; m < depth_multiplier; m++) {
- const float filter_val = *filter_ptr++;
- *acc_buffer_ptr++ += filter_val * input_val;
- }
- }
- input_ptr += input_ptr_increment;
- }
- filter_base_ptr += output_depth;
- }
-}
-
-// Initializes the accumulator buffer with bias values.
-inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
- const float* bias_data,
- float* acc_buffer) {
- for (int i = 0; i < num_output_pixels; i++) {
- memcpy(acc_buffer + i * output_depth, bias_data,
- sizeof(acc_buffer[0]) * output_depth);
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
- const float* filter_data, const Dims<4>& filter_dims,
- const float* bias_data, const Dims<4>& bias_dims,
- int stride_width, int stride_height,
- int pad_width, int pad_height, int depth_multiplier,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int input_depth = ArraySize(input_dims, 0);
- const int filter_height = ArraySize(filter_dims, 2);
- const int filter_width = ArraySize(filter_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
-#if 0 // TODO-NNRT : Check if assertion is needed; output depth is sometimes not equal to input_depth * depth_multiplier
- DCHECK(output_depth == input_depth * depth_multiplier);
-#endif
-
- static const int kAccBufferMaxSize = 1024;
- float acc_buffer[kAccBufferMaxSize];
- DCHECK_GE(kAccBufferMaxSize, output_depth);
- const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
- const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
- DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize);
- DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
- DCHECK_GE(kOutputPixelsInAccBuffer, 1);
-
- // row_accum_func will point to the core accumulation function to be used
- // for this DepthwiseConv op.
- auto* row_accum_func = FloatDepthwiseConvAccumRowGeneric;
-
- const int kMaxFixedDepthMultiplier = 16;
- int fixed_depth_multiplier = 0;
- if (depth_multiplier <= kMaxFixedDepthMultiplier) {
- fixed_depth_multiplier = depth_multiplier;
- }
- // kMaxUnrolling is the max number of output values that we aim to handle
- // in one unrolled iteration of the inner loop. For practical performance
- // reasons, it is limited by the number of available registers. We could
- // fine-tune it depending on the architecture, but that's not worth doing
- // since this whole code is not very optimized to begin with. The
- // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit
- // vector registers.
- const int kMaxUnrolling = 8;
- int fixed_input_depth = 0;
- if (fixed_depth_multiplier &&
- input_depth * fixed_depth_multiplier <= kMaxUnrolling) {
- fixed_input_depth = input_depth;
- }
-#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
- FIXED_DEPTH_MULTIPLIER) \
- if ((stride_width == 1 || ALLOW_STRIDED) && \
- fixed_input_depth == FIXED_INPUT_DEPTH && \
- fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \
- row_accum_func = \
- FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
- FIXED_DEPTH_MULTIPLIER>; \
- }
-
-#ifdef USE_NEON
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
-#endif // USE_NEON
-
-#undef TFMINI_USE_DEPTHWISECONV_KERNEL
-
- // Now that we have determined row_accum_func, we can start work.
- float* output_ptr = output_data;
- for (int b = 0; b < batches; ++b) {
- for (int out_y = 0; out_y < output_height; ++out_y) {
- const int in_y_origin = (out_y * stride_height) - pad_height;
- const int filter_y_start = std::max(0, -in_y_origin);
- const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
- for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
- out_x_buffer_start += kOutputPixelsInAccBuffer) {
- const int out_x_buffer_end = std::min(
- output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
- // We call a 'pixel' a group of activation that share all but the
- // 'depth'/'channel' coordinate. num_output_pixels is the number of
- // output pixels that we will accumulate in this loop iteration.
- const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
- // Initialize our local accumulator with the bias values, so we don't
- // have to add them later.
- DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
- acc_buffer);
- // Accumulation loop. Most of the time should be spent in here.
- for (int filter_y = filter_y_start; filter_y < filter_y_end;
- ++filter_y) {
- const int in_y = in_y_origin + filter_y;
- row_accum_func(stride_width, input_depth, input_width,
- input_data + in_y * input_dims.strides[2] +
- b * input_dims.strides[3],
- pad_width, depth_multiplier, filter_width,
- filter_data + filter_y * filter_dims.strides[2],
- out_x_buffer_start, out_x_buffer_end, output_depth,
- acc_buffer);
- }
- // Finished accumulating. Now store to destination.
- const int num_output_values = output_depth * num_output_pixels;
- int i = 0;
-#ifdef USE_NEON
- // Handle 16 values at a time
- for (; i <= num_output_values - 16; i += 16) {
- float32x4_t acc[4];
- for (int k = 0; k < 4; k++) {
- acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
- }
- if (Ac == FusedActivationFunctionType::kRelu) {
- for (int k = 0; k < 4; k++) {
- acc[k] = vmaxq_f32(vdupq_n_f32(0.f), acc[k]);
- }
- } else if (Ac == FusedActivationFunctionType::kRelu6) {
- for (int k = 0; k < 4; k++) {
- acc[k] = vmaxq_f32(vdupq_n_f32(0.f),
- vminq_f32(vdupq_n_f32(6.f), acc[k]));
- }
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- for (int k = 0; k < 4; k++) {
- acc[k] = vmaxq_f32(vdupq_n_f32(-1.f),
- vminq_f32(vdupq_n_f32(1.f), acc[k]));
- }
- }
- for (int k = 0; k < 4; k++) {
- vst1q_f32(output_ptr + 4 * k, acc[k]);
- }
- output_ptr += 16;
- }
- // Handle 4 values at a time
- for (; i <= num_output_values - 4; i += 4) {
- float32x4_t acc = vld1q_f32(acc_buffer + i);
- if (Ac == FusedActivationFunctionType::kRelu) {
- acc = vmaxq_f32(vdupq_n_f32(0.f), acc);
- } else if (Ac == FusedActivationFunctionType::kRelu6) {
- acc = vmaxq_f32(vdupq_n_f32(0.f), vminq_f32(vdupq_n_f32(6.f), acc));
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- acc =
- vmaxq_f32(vdupq_n_f32(-1.f), vminq_f32(vdupq_n_f32(1.f), acc));
- }
- vst1q_f32(output_ptr, acc);
- output_ptr += 4;
- }
-#endif
- // Handle leftover values, one by one. This is very slow.
- for (; i < num_output_values; i++) {
- float acc = acc_buffer[i];
- if (Ac == FusedActivationFunctionType::kRelu) {
- acc = std::max(0.f, acc);
- } else if (Ac == FusedActivationFunctionType::kRelu6) {
- acc = std::max(0.f, std::min(6.f, acc));
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- acc = std::max(-1.f, std::min(1.f, acc));
- }
- *output_ptr++ = acc;
- }
- }
- }
- }
-}
-
-} // namespace optimized_ops
-} // namespace rt
-} // namespace nnfw
-
-
-#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__
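
The trickiest step in the file above is the output-column range computed per filter column in FloatDepthwiseConvAccumRow: an output column out_x reads input column in_x = out_x * stride - pad_width + filter_x, which must stay inside [0, input_width). The sketch below restates that boundary computation in isolation; the function and names are mine, not from the deleted code, and they mirror the general-stride branch (the stride 2 and stride 4 special cases fold the same ceiling constant in).

    #include <algorithm>

    struct OutXRange { int start; int end; };  // half-open [start, end)

    OutXRange ValidOutXRange(int stride, int pad_width, int filter_x, int input_width,
                             int out_x_buffer_start, int out_x_buffer_end) {
      // Ceiling divisions, as in the kernel above; the clamps against the
      // current buffer window also absorb negative intermediate values.
      const int start_unclamped = (pad_width - filter_x + stride - 1) / stride;
      const int end_unclamped =
          (pad_width + input_width - filter_x + stride - 1) / stride;
      return {std::max(out_x_buffer_start, start_unclamped),
              std::min(out_x_buffer_end, end_unclamped)};
    }

    // Example: stride = 2, pad_width = 1, filter_x = 0, input_width = 5 gives
    // out_x in [1, 3): out_x = 1 reads in_x = 1 and out_x = 2 reads in_x = 3,
    // while out_x = 0 (in_x = -1) and out_x = 3 (in_x = 5) fall outside the input.
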
diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h
deleted file mode 100644
index 220f8793e..000000000
--- a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h
+++ /dev/null
@@ -1,1606 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__
-#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__
-
-#include "fixedpoint.h"
-#include "gemmlowp.h"
-#include "../common.h"
-#include "../types.h"
-
-namespace nnfw {
-namespace rt {
-namespace optimized_ops {
-
-// Implementation of quantized DepthwiseConv
-
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-struct QuantizedDepthwiseConvKernel {};
-
-#ifdef USE_NEON
-template <>
-struct QuantizedDepthwiseConvKernel<true, 8, 2> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8x2_t filter_u8;
- filter_u8.val[0] = vld1_u8(filter_ptr);
- filter_u8.val[1] = vld1_u8(filter_ptr + 8);
- int16x8_t filter[2];
- for (int i = 0; i < 2; i++) {
- filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
- vdupq_n_s16(filter_offset));
- }
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer
- int32x4x2_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
- acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
- }
- // Load the inputs, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += input_ptr_increment;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- // Duplicate the input values, 2-fold
- const int16x8x2_t input_dup2 = vzipq_s16(input, input);
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
- vget_low_s16(input_dup2.val[i]));
- acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
- vget_high_s16(input_dup2.val[i]));
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
- vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
- }
- acc_buffer_ptr += 16;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 8, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
- const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
- const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-
- int outp = 0;
- // Handle 2 output pixels at a time.
- for (; outp <= num_output_pixels - 2; outp += 2) {
- // Load the accumulators from acc_buffer.
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- uint8x8_t input_u8[2];
- for (int i = 0; i < 2; i++) {
- input_u8[i] = vld1_u8(input_ptr + 8 * i);
- }
- input_ptr += 16;
- int16x8_t input[2];
- for (int i = 0; i < 2; i++) {
- input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
- }
- for (int i = 0; i < 2; i++) {
- input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
- }
- // Multiply-accumulate.
- acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
- acc[1] =
- vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
- acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
- acc[3] =
- vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle 1 output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer.
- int32x4_t acc[2];
- acc[0] = vld1q_s32(acc_buffer_ptr);
- acc[1] = vld1q_s32(acc_buffer_ptr + 4);
-
- // Load the inputs, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- // Multiply-accumulate.
- acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
- acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
- // Store the accumulators back to acc_buffer
- vst1q_s32(acc_buffer_ptr, acc[0]);
- vst1q_s32(acc_buffer_ptr + 4, acc[1]);
- acc_buffer_ptr += 8;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 4, 2> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
- const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
- const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-
- int outp = 0;
- // Handle 2 output pixels at a time.
- for (; outp <= num_output_pixels - 2; outp += 2) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- // Duplicate the input values, 2-fold
- const int16x8x2_t input_dup2 = vzipq_s16(input, input);
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
- vget_low_s16(input_dup2.val[i]));
- acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
- vget_high_s16(input_dup2.val[i]));
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
- input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
- input_ptr += 4;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
- // Duplicate the input values, 2-fold
- const int16x4x2_t input_dup2 = vzip_s16(input, input);
- // Multiply-accumulate
- acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
- acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 2, 8> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- int16x8_t filter[2];
- for (int i = 0; i < 2; i++) {
- const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
- const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
- filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
- }
- int outp = 0;
- // Handle two output pixels at a time.
- for (; outp <= num_output_pixels - 2; outp += 2) {
- // Load the accumulators from acc_buffer.
- int32x4_t acc[8];
- for (int i = 0; i < 8; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
- input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
- input_ptr += 4;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
- // Multiply-accumulate.
- acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
- acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
- acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
- acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
- acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
- acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
- acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
- acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
- // Store the accumulators back to acc_buffer.
- for (int i = 0; i < 8; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 32;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer.
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_ptr += 2;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
- // Multiply-accumulate.
- acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
- acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
- acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
- acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
-
- // Store the accumulators back to acc_buffer.
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 2, 2> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8 = vdup_n_u8(0);
- filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
- filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
- filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
- filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
- const int16x4_t filter_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
- const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
- int outp = 0;
- // Handle 4 output pixels at a time.
- for (; outp <= num_output_pixels - 4; outp += 4) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
-
- // Load the inputs, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- // Duplicate the input values, 2-fold
- const int16x8x2_t input_dup2 = vzipq_s16(input, input);
- // Multiply-accumulate
- acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
- acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
- acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
- acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer
- int32x4_t acc = vld1q_s32(acc_buffer_ptr);
-
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_ptr += 2;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
- // Duplicate the input values, 2-fold
- const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
- // Multiply-accumulate
- acc = vmlal_s16(acc, filter, input_dup2);
- // Store the accumulators back to acc_buffer
- vst1q_s32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 4;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 2, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8 = vdup_n_u8(0);
- filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
- filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
- filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
- filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
- const int16x4_t filter_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
- const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
- int outp = 0;
- // Handle 8 output pixels at a time.
- for (; outp <= num_output_pixels - 8; outp += 8) {
- // Load the accumulators from acc_buffer.
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- uint8x8_t input_u8[2];
- for (int i = 0; i < 2; i++) {
- input_u8[i] = vld1_u8(input_ptr + 8 * i);
- }
- input_ptr += 16;
- int16x8_t input[2];
- for (int i = 0; i < 2; i++) {
- input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
- }
- for (int i = 0; i < 2; i++) {
- input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
- }
-
- // Multiply-accumulate.
- acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
- acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
- acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
- acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
- // Store the accumulators back to acc_buffer.
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle 4 output pixels at a time.
- for (; outp <= num_output_pixels - 4; outp += 4) {
- // Load the accumulators from acc_buffer.
- int32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-
- // Multiply-accumulate.
- acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
- acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
- // Store the accumulators back to acc_buffer.
- for (int i = 0; i < 2; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- // Handle 2 output pixels at a time.
- for (; outp <= num_output_pixels - 2; outp += 2) {
- // Load the accumulators from acc_buffer.
- int32x4_t acc = vld1q_s32(acc_buffer_ptr);
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
- input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
- input_ptr += 4;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
- // Multiply-accumulate.
- acc = vmlal_s16(acc, filter, input);
- // Store the accumulators back to acc_buffer.
- vst1q_s32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 4;
- }
- // Handle 1 output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer.
- int32x2_t acc = vld1_s32(acc_buffer_ptr);
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_ptr += 2;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
- // Multiply-accumulate.
- acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
- // Store the accumulators back to acc_buffer.
- vst1_s32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 2;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 1, 2> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8 = vdup_n_u8(0);
- filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
- filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
- filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
- filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
- const int16x4_t filter_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
- const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
- int outp = 0;
- // Handle 8 output pixels at a time.
- for (; outp <= num_output_pixels - 8; outp += 8) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
-
- // Load the inputs, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- // Duplicate the input values, 2-fold
- const int16x8x2_t input_dup2 = vzipq_s16(input, input);
- // Multiply-accumulate
- acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
- acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
- acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
- acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer
- int32x2_t acc = vld1_s32(acc_buffer_ptr);
-
- // Load the inputs, add input_offset.
- const uint32 input = *input_ptr++ + input_offset;
-
- // Multiply-accumulate
- acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
- // Store the accumulators back to acc_buffer
- vst1_s32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 2;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 1, 4> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8 = vdup_n_u8(0);
- filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
- filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
- filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
- filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
- const int16x4_t filter_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
- const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
- int outp = 0;
- // Handle 8 output pixels at a time.
- for (; outp <= num_output_pixels - 8; outp += 8) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[8];
- for (int i = 0; i < 8; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
-
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-
- // Multiply-accumulate
- acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
- acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
- acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
- acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
- acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
- acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
- acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
- acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
-
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 8; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 32;
- }
- // Handle 4 output pixels at a time.
- for (; outp <= num_output_pixels - 4; outp += 4) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
-
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
- input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
- input_ptr += 4;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
- // Multiply-accumulate
- acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
- acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
- acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
- acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
-
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer
- int32x4_t acc = vld1q_s32(acc_buffer_ptr);
-
- // Load the inputs, add input_offset.
- const uint32 input = *input_ptr++ + input_offset;
-
- // Multiply-accumulate
- acc = vmlal_n_s16(acc, filter, input);
- // Store the accumulators back to acc_buffer
- vst1q_s32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 4;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 4, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8 = vdup_n_u8(0);
- filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
- filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
- filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
- filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
- const int16x4_t filter_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
- const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
- int outp = 0;
- // Handle 4 output pixels at a time.
- for (; outp <= num_output_pixels - 4; outp += 4) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Load the inputs, add input_offset.
- int16x8_t input[2];
- for (int i = 0; i < 2; i++) {
- const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- }
- input_ptr += 16;
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[2 * i + 0] =
- vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
- acc[2 * i + 1] =
- vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer
- int32x4_t acc;
- acc = vld1q_s32(acc_buffer_ptr);
-
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
- input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
- input_ptr += 4;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
- // Multiply-accumulate
- acc = vmlal_s16(acc, filter, input);
- // Store the accumulators back to acc_buffer
- vst1q_s32(acc_buffer_ptr, acc);
- acc_buffer_ptr += 4;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 4, 4> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- int16x8_t filter[2];
- for (int i = 0; i < 2; i++) {
- const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
- const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
- filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
- }
-
- int outp = 0;
- // Handle 2 output pixels at a time.
- for (; outp <= num_output_pixels - 2; outp += 2) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[8];
- for (int i = 0; i < 8; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
-
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vld1_u8(input_ptr);
- input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-
- // Multiply-accumulate
- acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
- vget_low_s16(input), 0);
- acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
- vget_low_s16(input), 1);
- acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
- vget_low_s16(input), 2);
- acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
- vget_low_s16(input), 3);
- acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
- vget_high_s16(input), 0);
- acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
- vget_high_s16(input), 1);
- acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
- vget_high_s16(input), 2);
- acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
- vget_high_s16(input), 3);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 8; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 32;
- }
- // Handle one output pixel at a time.
- for (; outp < num_output_pixels; outp++) {
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
-
- // Load the inputs, add input_offset.
- uint8x8_t input_u8 = vdup_n_u8(0);
- input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
- input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
- input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
- input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
- input_ptr += 4;
- const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
- const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
- // Multiply-accumulate
- acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
- acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
- acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
- acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 0, 3> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // We will have to duplicate bytes in a NEON register, 3-fold.
- // We will do that by register-level table-look-up using VTBL instructions.
- // Here we prepare the registers containing the table-lookup indices.
- static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
- {2, 3, 3, 3, 4, 4, 4, 5},
- {5, 5, 6, 6, 6, 7, 7, 7}};
- uint8x8_t dup3_indices[3];
- for (int i = 0; i < 3; i++) {
- dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
- }
-
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- const uint8* local_filter_ptr = filter_ptr;
- const uint8* local_input_ptr = input_ptr;
- int ic = 0;
- // Handle 8 input channels at a time.
- for (; ic <= input_depth - 8; ic += 8) {
- // Load the filters, add filter_offset.
- int16x8_t filter[3];
- uint8x8x3_t filter_u8;
- filter_u8.val[0] = vld1_u8(local_filter_ptr);
- filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
- filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
- local_filter_ptr += 24;
- for (int i = 0; i < 3; i++) {
- const int16x8_t filter_s16 =
- vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
- filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
- }
- // Load the inputs, duplicate 3-fold, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
- local_input_ptr += 8;
-
- uint8x8_t input_u8_dup3[3];
- for (int i = 0; i < 3; i++) {
- input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
- }
- int16x8_t input_dup3[3];
- for (int i = 0; i < 3; i++) {
- const int16x8_t input_s16_dup3 =
- vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
- input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
- }
- // Load the accumulators from acc_buffer
- int32x4x3_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
- acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
- acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
- }
- // Multiply-accumulate
- for (int j = 0; j < 3; j++) {
- acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
- vget_low_s16(filter[j]));
- acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
- vget_high_s16(filter[j]));
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
- vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
- vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
- }
- acc_buffer_ptr += 24;
- }
- // Handle one input channel at a time.
- for (; ic < input_depth; ic++) {
- const int16 input_val = *local_input_ptr++ + input_offset;
- for (int i = 0; i < 3; i++) {
- const int16 filter_val = local_filter_ptr[i] + filter_offset;
- *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
- }
- local_filter_ptr += 3;
- }
- input_ptr += input_ptr_increment;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 0, 2> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- const uint8* local_filter_ptr = filter_ptr;
- const uint8* local_input_ptr = input_ptr;
- int ic = 0;
- // Handle 8 input channels at a time.
- for (; ic <= input_depth - 8; ic += 8) {
- // Load the filters, add filter_offset.
- int16x8_t filter[2];
- uint8x8x2_t filter_u8;
- filter_u8.val[0] = vld1_u8(local_filter_ptr);
- filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
- local_filter_ptr += 16;
- for (int i = 0; i < 2; i++) {
- const int16x8_t filter_s16 =
- vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
- filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
- }
- // Load the inputs, add input_offset, duplicate 2-fold.
- const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
- local_input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- const int16x8x2_t input_dup2 = vzipq_s16(input, input);
- // Load the accumulators from acc_buffer.
- int32x4x2_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
- acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
- }
- // Multiply-accumulate.
- for (int j = 0; j < 2; j++) {
- acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
- vget_low_s16(input_dup2.val[j]));
- acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
- vget_high_s16(input_dup2.val[j]));
- }
- // Store the accumulators back to acc_buffer.
- for (int i = 0; i < 2; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
- vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle one input channel at a time.
- for (; ic < input_depth; ic++) {
- // Load the inputs.
- const int16 input_val = *local_input_ptr++ + input_offset;
- for (int i = 0; i < 2; i++) {
- const int16 filter_val = local_filter_ptr[i] + filter_offset;
- *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
- }
- local_filter_ptr += 2;
- }
- input_ptr += input_ptr_increment;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 0, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- const uint8* local_filter_ptr = filter_ptr;
- const uint8* local_input_ptr = input_ptr;
- int ic = 0;
- // Handle 16 input channels at a time.
- for (; ic <= input_depth - 16; ic += 16) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8[2];
- for (int i = 0; i < 2; i++) {
- filter_u8[i] = vld1_u8(local_filter_ptr + 8 * i);
- }
- local_filter_ptr += 16;
- int16x8_t filter[2];
- for (int i = 0; i < 2; i++) {
- filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
- }
- for (int i = 0; i < 2; i++) {
- filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
- }
- // Load the inputs, add input_offset.
- uint8x8_t input_u8[2];
- for (int i = 0; i < 2; i++) {
- input_u8[i] = vld1_u8(local_input_ptr + 8 * i);
- }
- local_input_ptr += 16;
- int16x8_t input[2];
- for (int i = 0; i < 2; i++) {
- input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
- }
- for (int i = 0; i < 2; i++) {
- input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
- }
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
- vget_low_s16(filter[i]));
- acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
- vget_high_s16(filter[i]));
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- // Handle 8 input channels at a time.
- for (; ic <= input_depth - 8; ic += 8) {
- // Load the filters, add filter_offset.
- const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
- local_filter_ptr += 8;
- const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
- const int16x8_t filter =
- vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
- // Load the inputs, add input_offset.
- const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
- local_input_ptr += 8;
- const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
- const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
- // Load the accumulators from acc_buffer
- int32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
- acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- // Handle one input channel at a time.
- for (; ic < input_depth; ic++) {
- const int16 input_val = *local_input_ptr++ + input_offset;
- const int16 filter_val = *local_filter_ptr++ + filter_offset;
- *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
- }
- input_ptr += input_ptr_increment;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 16, 1> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8[2];
- for (int i = 0; i < 2; i++) {
- filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
- }
- int16x8_t filter[2];
- for (int i = 0; i < 2; i++) {
- filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
- }
- for (int i = 0; i < 2; i++) {
- filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
- }
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- // Load the inputs, add input_offset.
- uint8x8_t input_u8[2];
- for (int i = 0; i < 2; i++) {
- input_u8[i] = vld1_u8(input_ptr + 8 * i);
- }
- input_ptr += input_ptr_increment;
- int16x8_t input[2];
- for (int i = 0; i < 2; i++) {
- input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
- }
- for (int i = 0; i < 2; i++) {
- input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
- }
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
- vget_low_s16(filter[i]));
- acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
- vget_high_s16(filter[i]));
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 1, 16> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- uint8x8_t filter_u8[2];
- for (int i = 0; i < 2; i++) {
- filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
- }
- int16x8_t filter[2];
- for (int i = 0; i < 2; i++) {
- filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
- }
- for (int i = 0; i < 2; i++) {
- filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
- }
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- uint8 input_u8 = *input_ptr;
- input_ptr += input_ptr_increment;
- int16 input = static_cast<int16>(input_u8 + input_offset);
- // Load the accumulators from acc_buffer
- int32x4_t acc[4];
- for (int i = 0; i < 4; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- for (int i = 0; i < 2; i++) {
- acc[2 * i + 0] =
- vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
- acc[2 * i + 1] =
- vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
- }
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 4; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 16;
- }
- }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 1, 8> {
- static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8* input_ptr, int16 input_offset,
- int input_ptr_increment, const uint8* filter_ptr,
- int16 filter_offset, int32* acc_buffer_ptr) {
- // Load the filters, add filter_offset.
- const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
- const int16x8_t filter = vaddq_s16(
- vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
- // Handle one output pixel at a time.
- for (int outp = 0; outp < num_output_pixels; outp++) {
- uint8 input_u8 = *input_ptr;
- input_ptr += input_ptr_increment;
- int16 input = static_cast<int16>(input_u8 + input_offset);
- // Load the accumulators from acc_buffer
- int32x4_t acc[2];
- for (int i = 0; i < 2; i++) {
- acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
- }
- // Multiply-accumulate
- acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
- acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
- // Store the accumulators back to acc_buffer
- for (int i = 0; i < 2; i++) {
- vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
- }
- acc_buffer_ptr += 8;
- }
- }
-};
-#endif
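Every QuantizedDepthwiseConvKernel<...>::Run specialization above computes the same scalar triple loop; the specializations only unroll and vectorize it for particular fixed input depths and depth multipliers. The reference below is an illustrative sketch of that shared contract (the struct name and the use of <cstdint> types in place of the file's uint8/int16/int32 typedefs are assumptions made here for self-containment, not code from the original file):

#include <cstdint>

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct ReferenceQuantizedDepthwiseConvKernel {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const std::uint8_t* input_ptr, std::int16_t input_offset,
                  int input_ptr_increment, const std::uint8_t* filter_ptr,
                  std::int16_t filter_offset, std::int32_t* acc_buffer_ptr) {
    // The template parameters only select unrolled specializations; this
    // reference ignores them and loops over everything.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const std::uint8_t* local_input_ptr = input_ptr;
      const std::uint8_t* local_filter_ptr = filter_ptr;
      for (int ic = 0; ic < input_depth; ic++) {
        const std::int16_t input_val = *local_input_ptr++ + input_offset;
        for (int m = 0; m < depth_multiplier; m++) {
          const std::int16_t filter_val = *local_filter_ptr++ + filter_offset;
          // Accumulate (filter + filter_offset) * (input + input_offset).
          *acc_buffer_ptr++ +=
              static_cast<std::int32_t>(filter_val) * input_val;
        }
      }
      // The caller passes stride * input_depth as the per-pixel increment.
      input_ptr += input_ptr_increment;
    }
  }
};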
-
-// Accumulates the effect of one row of the filter on a segment of one row
-// of the output, accessing the corresponding row of the input.
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-void QuantizedDepthwiseConvAccumRow(
- int stride, int input_depth, int input_width, const uint8* input_data,
- int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
- const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
- int out_x_buffer_end, int output_depth, int32* acc_buffer) {
-#ifdef GEMMLOWP_PROFILING
- gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
-#endif
- // Sanity check parameters. This is important in particular to ensure
- // that we keep the number of template instantiations minimal, so we don't
- // increase binary size unnecessarily.
- static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
- static_assert(kFixedInputDepth || kAllowStrided, "");
- DCHECK(stride == 1 || kAllowStrided);
- if (kFixedInputDepth) {
- DCHECK_EQ(input_depth, kFixedInputDepth);
- }
- if (kFixedDepthMultiplier) {
- DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
- }
- DCHECK_EQ(output_depth, input_depth * depth_multiplier);
- const int input_ptr_increment = stride * input_depth;
- const uint8* filter_base_ptr = filter_data;
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- // For the current (filter_x, filter_y) point in the filter,
- // compute the boundaries of the corresponding output row segment.
-    int out_x_loop_start_unclamped = 0;
-    int out_x_loop_end_unclamped = 0;
-    if (kAllowStrided) {
-      if (stride == 2) {
-        out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
-        out_x_loop_end_unclamped =
-            (pad_width + input_width - filter_x + 1) / 2;
-      } else if (stride == 4) {
-        out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
-        out_x_loop_end_unclamped =
-            (pad_width + input_width - filter_x + 3) / 4;
-      } else {
-        out_x_loop_start_unclamped =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclamped =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
-      }
-    } else {
-      out_x_loop_start_unclamped = pad_width - filter_x;
-      out_x_loop_end_unclamped = pad_width + input_width - filter_x;
-    }
-    // The kernel will have to iterate on the segment of the
-    // output row that starts at out_x_loop_start and ends at out_x_loop_end.
-    const int out_x_loop_start =
-        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
-    const int out_x_loop_end =
-        std::min(out_x_buffer_end, out_x_loop_end_unclamped);
-
- int32* acc_buffer_ptr =
- acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
- const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
- const uint8* input_ptr = input_data + in_x_origin * input_depth;
- const int num_output_pixels = out_x_loop_end - out_x_loop_start;
- QuantizedDepthwiseConvKernel<
- kAllowStrided, kFixedInputDepth,
- kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
- depth_multiplier, input_ptr, input_offset,
- input_ptr_increment, filter_base_ptr,
- filter_offset, acc_buffer_ptr);
- filter_base_ptr += output_depth;
- }
-}
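The unclamped loop bounds computed above follow from requiring the sampled input column to stay inside the row. A minimal standalone sketch of that derivation (the helper name is made up here; the stride == 2 and stride == 4 branches above are this same formula with the division constant-folded):

// For a given filter_x, output column out_x reads input column
//   in_x = out_x * stride - pad_width + filter_x.
// Requiring 0 <= in_x < input_width yields the half-open out_x range below.
// Integer division truncates toward zero for negative numerators, but a
// negative start is clamped to out_x_buffer_start by the caller anyway.
inline void UnclampedOutXRange(int stride, int pad_width, int filter_x,
                               int input_width, int* out_x_start,
                               int* out_x_end) {
  *out_x_start = (pad_width - filter_x + stride - 1) / stride;
  *out_x_end = (pad_width + input_width - filter_x + stride - 1) / stride;
}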
-
-// Generic fallback of QuantizedDepthwiseConvAccumRow: portable, non-templatized.
-inline void QuantizedDepthwiseConvAccumRowGeneric(
- int stride, int input_depth, int input_width, const uint8* input_data,
- int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
- const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
- int out_x_buffer_end, int output_depth, int32* acc_buffer) {
- gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
- const uint8* filter_base_ptr = filter_data;
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- const int out_x_loop_start = std::max(
- out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
- const int out_x_loop_end =
- std::min(out_x_buffer_end,
- (pad_width + input_width - filter_x + stride - 1) / stride);
-
- int32* acc_buffer_ptr =
- acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
- const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
- const uint8* input_ptr = input_data + in_x_origin * input_depth;
- const int input_ptr_increment = (stride - 1) * input_depth;
- for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
- const uint8* filter_ptr = filter_base_ptr;
- for (int ic = 0; ic < input_depth; ++ic) {
- const int16 input_val = *input_ptr++ + input_offset;
- for (int m = 0; m < depth_multiplier; m++) {
- const int16 filter_val = *filter_ptr++ + filter_offset;
- *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
- }
- }
- input_ptr += input_ptr_increment;
- }
- filter_base_ptr += output_depth;
- }
-}
-
-// Initializes the accumulator buffer with bias values.
-inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
- const int32* bias_data,
- int32* acc_buffer) {
- int i = 0;
-#ifdef USE_NEON
- if (output_depth == 1) {
- const int32x4_t b = vdupq_n_s32(bias_data[0]);
- for (; i <= num_output_pixels - 16; i += 16) {
- vst1q_s32(acc_buffer + i + 0, b);
- vst1q_s32(acc_buffer + i + 4, b);
- vst1q_s32(acc_buffer + i + 8, b);
- vst1q_s32(acc_buffer + i + 12, b);
- }
- for (; i <= num_output_pixels - 4; i += 4) {
- vst1q_s32(acc_buffer + i, b);
- }
- } else if (output_depth == 2) {
- int32x4_t b = vdupq_n_s32(bias_data[0]);
- b = vsetq_lane_s32(bias_data[1], b, 1);
- b = vsetq_lane_s32(bias_data[1], b, 3);
- for (; i <= num_output_pixels - 8; i += 8) {
- vst1q_s32(acc_buffer + 2 * i + 0, b);
- vst1q_s32(acc_buffer + 2 * i + 4, b);
- vst1q_s32(acc_buffer + 2 * i + 8, b);
- vst1q_s32(acc_buffer + 2 * i + 12, b);
- }
- for (; i <= num_output_pixels - 2; i += 2) {
- vst1q_s32(acc_buffer + 2 * i, b);
- }
- } else if (output_depth == 4) {
- const int32x4_t b = vld1q_s32(bias_data);
- for (; i <= num_output_pixels - 4; i += 4) {
- vst1q_s32(acc_buffer + 4 * i + 0, b);
- vst1q_s32(acc_buffer + 4 * i + 4, b);
- vst1q_s32(acc_buffer + 4 * i + 8, b);
- vst1q_s32(acc_buffer + 4 * i + 12, b);
- }
- for (; i < num_output_pixels; i++) {
- vst1q_s32(acc_buffer + 4 * i, b);
- }
- } else if (output_depth == 8) {
- const int32x4_t b0 = vld1q_s32(bias_data);
- const int32x4_t b1 = vld1q_s32(bias_data + 4);
- for (; i <= num_output_pixels - 2; i += 2) {
- vst1q_s32(acc_buffer + 8 * i + 0, b0);
- vst1q_s32(acc_buffer + 8 * i + 4, b1);
- vst1q_s32(acc_buffer + 8 * i + 8, b0);
- vst1q_s32(acc_buffer + 8 * i + 12, b1);
- }
- for (; i < num_output_pixels; i++) {
- vst1q_s32(acc_buffer + 8 * i + 0, b0);
- vst1q_s32(acc_buffer + 8 * i + 4, b1);
- }
- } else if (output_depth == 16) {
- const int32x4_t b0 = vld1q_s32(bias_data);
- const int32x4_t b1 = vld1q_s32(bias_data + 4);
- const int32x4_t b2 = vld1q_s32(bias_data + 8);
- const int32x4_t b3 = vld1q_s32(bias_data + 12);
- for (; i < num_output_pixels; i++) {
- vst1q_s32(acc_buffer + 16 * i + 0, b0);
- vst1q_s32(acc_buffer + 16 * i + 4, b1);
- vst1q_s32(acc_buffer + 16 * i + 8, b2);
- vst1q_s32(acc_buffer + 16 * i + 12, b3);
- }
- }
-#endif
- for (; i < num_output_pixels; i++) {
- memcpy(acc_buffer + i * output_depth, bias_data,
- sizeof(acc_buffer[0]) * output_depth);
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims,
- int stride_width, int stride_height,
- int pad_width, int pad_height, int depth_multiplier,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- DCHECK_EQ(output_activation_min, 0);
- DCHECK_EQ(output_activation_max, 255);
- }
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int input_depth = ArraySize(input_dims, 0);
- const int filter_height = ArraySize(filter_dims, 2);
- const int filter_width = ArraySize(filter_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
- DCHECK(output_depth == input_depth * depth_multiplier);
-
- static const int kAccBufferMaxSize = 1024;
- int32 acc_buffer[kAccBufferMaxSize];
- DCHECK_GE(kAccBufferMaxSize, output_depth);
- const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
- const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
- DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize);
- DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
- DCHECK_GE(kOutputPixelsInAccBuffer, 1);
-
- // row_accum_func will point to the core accumulation function to be used
- // for this DepthwiseConv op.
- auto* row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
-
- const int kMaxFixedDepthMultiplier = 16;
- int fixed_depth_multiplier = 0;
- if (depth_multiplier <= kMaxFixedDepthMultiplier) {
- fixed_depth_multiplier = depth_multiplier;
- }
- // kMaxUnrolling is the max number of output values that we aim to handle
- // in one unrolled iteration of the inner loop. For practical performance
- // reasons, it is limited by the number of available registers. We could
- // fine-tune it depending on the architecture, but that's not worth doing
- // since this whole code is not very optimized to begin with. The
- // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit
- // vector registers.
- const int kMaxUnrolling = 16;
- int fixed_input_depth = 0;
- if (fixed_depth_multiplier &&
- input_depth * fixed_depth_multiplier <= kMaxUnrolling) {
- fixed_input_depth = input_depth;
- }
-#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
- FIXED_DEPTH_MULTIPLIER) \
- if ((stride_width == 1 || ALLOW_STRIDED) && \
- fixed_input_depth == FIXED_INPUT_DEPTH && \
- fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \
- row_accum_func = \
- QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
- FIXED_DEPTH_MULTIPLIER>; \
- }
-
-#ifdef USE_NEON
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
- TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
- TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
-#endif // USE_NEON
-
-#undef TFMINI_USE_DEPTHWISECONV_KERNEL
-
- // Now that we have determined row_accum_func, we can start work.
- uint8* output_ptr = output_data;
- for (int b = 0; b < batches; ++b) {
- for (int out_y = 0; out_y < output_height; ++out_y) {
- const int in_y_origin = (out_y * stride_height) - pad_height;
- const int filter_y_start = std::max(0, -in_y_origin);
- const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
- for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
- out_x_buffer_start += kOutputPixelsInAccBuffer) {
- const int out_x_buffer_end = std::min(
- output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
-        // We call a 'pixel' a group of activations that share all but the
-        // 'depth'/'channel' coordinate. num_output_pixels is the number of
-        // output pixels that we will accumulate in this loop iteration.
- const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
- // Initialize our local accumulator with the bias values, so we don't
- // have to add them later.
- DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
- acc_buffer);
- // Accumulation loop. Most of the time should be spent in here.
- for (int filter_y = filter_y_start; filter_y < filter_y_end;
- ++filter_y) {
- const int in_y = in_y_origin + filter_y;
- row_accum_func(
- stride_width, input_depth, input_width,
- input_data + in_y * input_dims.strides[2] +
- b * input_dims.strides[3],
- input_offset, pad_width, depth_multiplier, filter_width,
- filter_data + filter_y * filter_dims.strides[2], filter_offset,
- out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
- }
- // Finished accumulating int32 values. Now need to convert them to
- // the final 8bit form and store them.
- gemmlowp::ScopedProfilingLabel label("downquantize+store");
- const int num_output_values = output_depth * num_output_pixels;
- int i = 0;
-#ifdef USE_NEON
- using gemmlowp::RoundingDivideByPOT;
- const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
- const int32x4_t output_activation_min_vec =
- vdupq_n_s32(output_activation_min);
- const int32x4_t output_activation_max_vec =
- vdupq_n_s32(output_activation_max);
- // Handle 16 values at once.
- // This allows us to issue 4 mutually independent int32
- // multiplications (vqrdmulh), which should alleviate most of their
- // high latency.
- for (; i <= num_output_values - 16; i += 16) {
- int32x4_t acc[4];
- for (int j = 0; j < 4; j++) {
- acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
- }
-
- // Fixed-point multiplication.
- for (int j = 0; j < 4; j++) {
- acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
- }
- for (int j = 0; j < 4; j++) {
- acc[j] = RoundingDivideByPOT(acc[j], output_shift);
- }
- // Add the output offset.
- for (int j = 0; j < 4; j++) {
- acc[j] = vaddq_s32(acc[j], output_offset_vec);
- }
- // Apply the activation function.
- if (Ac != FusedActivationFunctionType::kNone) {
- for (int j = 0; j < 4; j++) {
- acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
- }
- for (int j = 0; j < 4; j++) {
- acc[j] = vminq_s32(acc[j], output_activation_max_vec);
- }
- }
- // Saturating cast to uint8 and store to destination.
- int16x4_t acc_s16[4];
- for (int j = 0; j < 4; j++) {
- acc_s16[j] = vqmovn_s32(acc[j]);
- }
- const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
- const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
- const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
- const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
- vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
- output_ptr += 16;
- }
- // Handle 8 values at once.
- // Not as good as 16 (now we're only issuing 2 mutually independent
- // vqrdmulh instructions, so we're probably paying for their high
- // latency).
- for (; i <= num_output_values - 8; i += 8) {
- int32x4_t acc0 = vld1q_s32(acc_buffer + i);
- int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
- // Fixed-point multiplication.
- acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
- acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
- // Rounding right shift.
- acc0 = RoundingDivideByPOT(acc0, output_shift);
- acc1 = RoundingDivideByPOT(acc1, output_shift);
- // Add the output offset.
- acc0 = vaddq_s32(acc0, output_offset_vec);
- acc1 = vaddq_s32(acc1, output_offset_vec);
- // Apply the activation function.
- if (Ac != FusedActivationFunctionType::kNone) {
- acc0 = vmaxq_s32(acc0, output_activation_min_vec);
- acc1 = vmaxq_s32(acc1, output_activation_min_vec);
- acc0 = vminq_s32(acc0, output_activation_max_vec);
- acc1 = vminq_s32(acc1, output_activation_max_vec);
- }
- // Saturating cast to uint8 and store to destination.
- const int16x4_t acc0_s16 = vqmovn_s32(acc0);
- const int16x4_t acc1_s16 = vqmovn_s32(acc1);
- const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
- const uint8x8_t res_u8 = vqmovun_s16(res_s16);
- vst1_u8(output_ptr, res_u8);
- output_ptr += 8;
- }
- // Handle 4 values at once. Now we're paying the full price of the
- // high latency of vqrdmulh. Also, storing only 4 bytes at the end
- // (without any alignment) can only be done 1 byte at a time.
- // Yet, that is still worth doing to minimize the amount of leftover
- // that will have to go through the very slow scalar code.
- for (; i <= num_output_values - 4; i += 4) {
- int32x4_t acc = vld1q_s32(acc_buffer + i);
- // Fixed-point multiplication.
- acc = vqrdmulhq_n_s32(acc, output_multiplier);
- // Rounding right shift.
- acc = RoundingDivideByPOT(acc, output_shift);
- // Add the output offset.
- acc = vaddq_s32(acc, output_offset_vec);
- // Apply the activation function.
- if (Ac != FusedActivationFunctionType::kNone) {
- acc = vmaxq_s32(acc, output_activation_min_vec);
- acc = vminq_s32(acc, output_activation_max_vec);
- }
- // Saturating cast to uint8 and store to destination.
- const int16x4_t acc_s16 = vqmovn_s32(acc);
- const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
- const uint8x8_t res_u8 = vqmovun_s16(res_s16);
- vst1_lane_u8(output_ptr + 0, res_u8, 0);
- vst1_lane_u8(output_ptr + 1, res_u8, 1);
- vst1_lane_u8(output_ptr + 2, res_u8, 2);
- vst1_lane_u8(output_ptr + 3, res_u8, 3);
- output_ptr += 4;
- }
-#endif // USE_NEON
-
- // Handle leftover values, one by one. This is very slow.
- for (; i < num_output_values; i++) {
- int32 acc = acc_buffer[i];
- acc = MultiplyByQuantizedMultiplierSmallerThanOne(
- acc, output_multiplier, output_shift);
- acc += output_offset;
- acc = std::max(acc, output_activation_min);
- acc = std::min(acc, output_activation_max);
- *output_ptr++ = static_cast<uint8>(acc);
- }
- }
- }
- }
-}
-
-} // namespace optimized_ops
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__
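Per accumulator value, the downquantize+store stage of DepthwiseConv above applies the arithmetic sketched below. The helper names here are illustrative, not the library's API; the first two mirror gemmlowp's SaturatingRoundingDoublingHighMul and RoundingDivideByPOT rather than calling the library:

#include <algorithm>
#include <cstdint>
#include <limits>

// Rounding-doubling high multiply of two Q31-style fixed-point values.
inline std::int32_t SatRoundingDoublingHighMul(std::int32_t a, std::int32_t b) {
  if (a == std::numeric_limits<std::int32_t>::min() &&
      b == std::numeric_limits<std::int32_t>::min()) {
    return std::numeric_limits<std::int32_t>::max();  // the one overflow case
  }
  const std::int64_t ab = static_cast<std::int64_t>(a) * b;
  const std::int64_t nudge =
      ab >= 0 ? (std::int64_t{1} << 30) : (1 - (std::int64_t{1} << 30));
  return static_cast<std::int32_t>((ab + nudge) / (std::int64_t{1} << 31));
}

// Rounding (round-half-away-from-zero) arithmetic right shift.
inline std::int32_t RoundingDivByPOT(std::int32_t x, int exponent) {
  const std::int32_t mask = (std::int32_t{1} << exponent) - 1;
  const std::int32_t remainder = x & mask;
  const std::int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One int32 accumulator value -> one uint8 output value, as in the
// scalar leftover loop.
inline std::uint8_t DownquantizeOne(std::int32_t acc,
                                    std::int32_t output_multiplier,
                                    int output_shift,
                                    std::int32_t output_offset,
                                    std::int32_t output_activation_min,
                                    std::int32_t output_activation_max) {
  std::int32_t v = RoundingDivByPOT(
      SatRoundingDoublingHighMul(acc, output_multiplier), output_shift);
  v += output_offset;
  v = std::min(output_activation_max, std::max(output_activation_min, v));
  return static_cast<std::uint8_t>(v);
}

The NEON path performs the same steps four values at a time: vqrdmulhq_n_s32 for the doubling high multiply, the vectorized RoundingDivideByPOT for the rounding shift, vaddq_s32 for the offset, vmaxq_s32/vminq_s32 for the clamp, and a saturating narrow to uint8.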
diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc
deleted file mode 100644
index 7af122517..000000000
--- a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <string.h>
-
-#include "ActivationFunctor.h"
-#include "tensor_utils_impl.h"
-
-#ifdef USE_NEON
-
-#include <arm_neon.h>
-#define kFloatWeightsPerNeonLane 4
-
-namespace nnfw {
-namespace rt {
-namespace tensor_utils {
-
-void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
- int m_cols, const float* vector,
- int n_batch, float* result,
- int result_stride) {
-  // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the tail columns; postamble_start is the first
-  // column handled sequentially (e.g. m_cols = 10 gives postamble_start = 8).
- const int postamble_start =
- m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
-
- // The arrays used to cache the vector.
- float32x4_t* vector_cache_float32x4 =
- new float32x4_t[(m_cols / kFloatWeightsPerNeonLane) *
- sizeof(float32x4_t)];
-
- for (int b = 0; b < n_batch; b++) {
- float* result_in_batch = result + b * m_rows;
- const float* vector_in_batch = vector + b * m_cols;
- const float* matrix_ptr = matrix;
- for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
- vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
- }
- for (int r = 0; r < m_rows; r++) {
- float32x4_t acc_32x4 = vmovq_n_f32(0.0);
- for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
- float32x4_t temp = vector_cache_float32x4[c >> 2];
-        // Load 4 float values from the current row of the matrix.
-        float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr + c);
-        // Multiply-accumulate 4 floats against the cached vector values.
- acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, temp);
- }
-      // Add the 4 intermediate sums to get the final dot-product value for
-      // this row.
- *result_in_batch +=
- (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
- vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
- for (int c = postamble_start; c < m_cols; c++) {
- *result_in_batch += matrix_ptr[c] * vector_in_batch[c];
- }
- matrix_ptr += m_cols;
- result_in_batch += result_stride;
- }
- }
- delete[] vector_cache_float32x4;
-}
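Semantically, the NEON routine above is equivalent to the portable sketch below (an illustrative reference, not the Portable* implementation declared in tensor_utils_impl.h): for each batch b and matrix row r, it accumulates the dot product of the row with the batch's vector into result[b * m_rows + r * result_stride].

void ReferenceMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                  int m_rows, int m_cols,
                                                  const float* vector,
                                                  int n_batch, float* result,
                                                  int result_stride) {
  for (int b = 0; b < n_batch; ++b) {
    const float* vector_in_batch = vector + b * m_cols;
    float* result_in_batch = result + b * m_rows;
    const float* matrix_ptr = matrix;
    for (int r = 0; r < m_rows; ++r) {
      float dot = 0.f;
      for (int c = 0; c < m_cols; ++c) {
        dot += matrix_ptr[c] * vector_in_batch[c];
      }
      *result_in_batch += dot;
      matrix_ptr += m_cols;
      result_in_batch += result_stride;
    }
  }
}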
-
-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
- int v_size, float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the tail; postamble_start is the first index
-  // handled by the sequential postamble loop.
- const int postamble_start =
- v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
- for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
- // Load 4 float values from vector1 and vector2.
- float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
- float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
- // Vector multiply 4 float
- float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
- // Save to result array.
- vst1q_f32(&result[v], mul_32x4);
- }
- for (int v = postamble_start; v < v_size; v++) {
- result[v] = vector1[v] * vector2[v];
- }
-}
-
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
- const float* vector2, int v_size,
- float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the tail; postamble_start is the first index
-  // handled by the sequential postamble loop.
- const int postamble_start =
- v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
- for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
- // Load 4 float values from vector1 and vector2 and accumulator.
- float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
- float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
- float32x4_t acc_32x4 = vld1q_f32(result + v);
- // Vector multiply-accumulate 4 float
- acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
- // Save to result array.
- vst1q_f32(&result[v], acc_32x4);
- }
- for (int v = postamble_start; v < v_size; v++) {
- result[v] += vector1[v] * vector2[v];
- }
-}
-
-void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
- int v_size,
- const float* batch_vector,
- int n_batch, float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the tail; postamble_start is the first index
-  // handled by the sequential postamble loop.
- const int postamble_start =
- v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-
- // The arrays used to cache the vector.
- float32x4_t* vector_cache_float32x4 =
- new float32x4_t[(v_size / kFloatWeightsPerNeonLane) *
- sizeof(float32x4_t)];
- for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
- vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
- }
-
- float* result_ptr = result;
- const float* batch_vector_ptr = batch_vector;
- for (int b = 0; b < n_batch; b++) {
- for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
- // Load from memory to vectors.
- float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
- float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
- // Multiply-accumulate.
- result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
- vector_cache_float32x4[v >> 2]);
- // Store.
- vst1q_f32(result_ptr + v, result_f32x4);
- }
- // Postamble loop
- for (int v = postamble_start; v < v_size; v++) {
- result_ptr[v] += vector[v] * batch_vector_ptr[v];
- }
- // Update the pointers.
- result_ptr += v_size;
- batch_vector_ptr += v_size;
- }
- delete[] vector_cache_float32x4;
-}
-
-void NeonSub1Vector(const float* vector, int v_size, float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the tail; postamble_start is the first index
-  // handled by the sequential postamble loop.
- const int postamble_start =
- v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-
- float32x4_t one_f32x4 = vmovq_n_f32(1.0);
- for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
-    // Load 4 float values from the current position of the input vector and
-    // subtract them from 1.
- float32x4_t v_f32x4 = vld1q_f32(vector + v);
- float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
- // Save to output.
- vst1q_f32(result + v, result_f32x4);
- }
- for (int v = postamble_start; v < v_size; v++) {
- result[v] = 1.0f - vector[v];
- }
-}
-
-void NeonClipVector(const float* vector, int v_size, float abs_limit,
- float* result) {
-  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
-  // main vectorized loop for the tail; postamble_start is the first index
-  // handled by the sequential postamble loop.
- const int postamble_start =
- v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
-
- // Replicate abs_limit and -abs_limit in two vectors.
- const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
- const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);
-
- for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
- // Load from memory to vector.
- float32x4_t v_f32x4 = vld1q_f32(vector + v);
- // Clip between abs_limit and -abs_limit.
- float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
- result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
- // Save to output.
- vst1q_f32(result + v, result_f32x4);
- }
- // Postamble loop.
- for (int v = postamble_start; v < v_size; v++) {
- result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
- result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
- }
-}
-
-} // namespace tensor_utils
-} // namespace rt
-} // namespace nnfw
-
-#endif // USE_NEON
diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h
deleted file mode 100644
index 2a6f31572..000000000
--- a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_NEON_TENSOR_UTILS_H__
-#define __NNFW_RT_NEON_TENSOR_UTILS_H__
-
-#include "ActivationFunctor.h"
-#include "cpu_check.h"
-#include "tensor_utils_impl.h"
-
-namespace nnfw {
-namespace rt {
-namespace tensor_utils {
-
-void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
- int m_cols, const float* vector,
- int n_batch, float* result,
- int result_stride) {
- NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
- vector, n_batch, result, result_stride);
-}
-
-void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
- int v_size, float* result) {
- NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
-}
-
-void VectorVectorCwiseProductAccumulate(const float* vector1,
- const float* vector2, int v_size,
- float* result) {
- NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
- result);
-}
-
-void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
- const float* batch_vector,
- int n_batch, float* result) {
- NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
- batch_vector, n_batch, result);
-}
-
-float VectorVectorDotProduct(const float* vector1, const float* vector2,
- int v_size) {
- return PortableVectorVectorDotProduct(vector1, vector2, v_size);
-}
-
-void BatchVectorBatchVectorDotProduct(const float* vector1,
- const float* vector2, int v_size,
- int n_batch, float* result,
- int result_stride) {
- PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
- result, result_stride);
-}
-
-void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
- float* batch_vector) {
- PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
-}
-
-void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
- PortableApplySigmoidToVector(vector, v_size, result);
-}
-
-void ApplyActivationToVector(const float* vector, int v_size,
- ActivationFn activation, float* result) {
- PortableApplyActivationToVector(vector, v_size, activation, result);
-}
-
-void CopyVector(const float* vector, int v_size, float* result) {
- PortableCopyVector(vector, v_size, result);
-}
-
-void Sub1Vector(const float* vector, int v_size, float* result) {
- NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
-}
-
-void ZeroVector(float* vector, int v_size) {
- PortableZeroVector(vector, v_size);
-}
-
-float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
-
-void ClipVector(const float* vector, int v_size, float abs_limit,
- float* result) {
- NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result);
-}
-
-// TODO(ghodrat): Implement Neon version.
-void VectorShiftLeft(float* vector, int v_size, float shift_value) {
- PortableVectorShiftLeft(vector, v_size, shift_value);
-}
-
-// TODO(ghodrat): Implement Neon version.
-void ReductionSumVector(const float* input_vector, int input_stride,
- float* output_vector, int output_size,
- int reduction_size) {
- PortableReductionSumVector(input_vector, input_stride, output_vector,
- output_size, reduction_size);
-}
-
-} // namespace tensor_utils
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_NEON_TENSOR_UTILS_H__
diff --git a/runtimes/nn/common/operations/internal/optimized/optimized_ops.h b/runtimes/nn/common/operations/internal/optimized/optimized_ops.h
deleted file mode 100644
index 33862a0d7..000000000
--- a/runtimes/nn/common/operations/internal/optimized/optimized_ops.h
+++ /dev/null
@@ -1,2717 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_OPTIMIZED_OPS_H__
-#define __NNFW_RT_OPTIMIZED_OPS_H__
-
-#include <assert.h>
-#include <stdint.h>
-#include <sys/types.h>
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <memory>
-#include <tuple>
-#include <type_traits>
-
-#include "Eigen/Core"
-#include "fixedpoint.h"
-#include "gemmlowp.h"
-#include "../common.h"
-#include "../types.h"
-
-namespace nnfw {
-namespace rt {
-namespace optimized_ops {
-
-// Make a local VectorMap typedef that allows mapping a float array
-// as an Eigen vector expression. The std::conditional here is used to
-// construct the suitable Eigen type for the constness of the
-// data. Indeed, for const data, we need to produce
-// Eigen::Map<const Eigen::Matrix<float, ...>>
-// and not the more straightforward
-// Eigen::Map<Eigen::Matrix<const float, ...>>
-template <typename Scalar>
-using VectorMap = typename std::conditional<
- std::is_const<Scalar>::value,
- Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
- Eigen::Dynamic, 1>>,
- Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;
-
-template <typename Scalar, int N>
-VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
- const int size = RequiredBufferSizeForDims(dims);
- return VectorMap<Scalar>(data, size, 1);
-}
-
-// Make a local MatrixMap typedef that allows mapping a float array
-// as an Eigen matrix expression. The same explanation as for VectorMap
-// above also applies here.
-template <typename Scalar>
-using MatrixMap = typename std::conditional<
- std::is_const<Scalar>::value,
- Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
- Eigen::Dynamic, Eigen::Dynamic>>,
- Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
-
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
- const Dims<N>& dims) {
- const int rows = dims.sizes[0];
- int cols = 1;
- for (int d = 1; d < N; d++) {
- cols *= dims.sizes[d];
- }
- return MatrixMap<Scalar>(data, rows, cols);
-}
-
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
- const Dims<N>& dims) {
- const int cols = dims.sizes[N - 1];
- int rows = 1;
- for (int d = 0; d < N - 1; d++) {
- rows *= dims.sizes[d];
- }
- return MatrixMap<Scalar>(data, rows, cols);
-}
-
-template <typename Scalar>
-using ArrayMap = typename std::conditional<
- std::is_const<Scalar>::value,
- Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type,
- Eigen::Dynamic, Eigen::Dynamic>>,
- Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
-
-template <typename Scalar, int N>
-ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data,
- const Dims<N>& dims) {
- const int rows = dims.sizes[0];
- int cols = 1;
- for (int d = 1; d < N; d++) {
- cols *= dims.sizes[d];
- }
- return ArrayMap<Scalar>(data, rows, cols);
-}
-
-// TODO(b/62193649): this function is only needed as long
-// as we have the --variable_batch hack.
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
- const Dims<N>& dims,
- int rows) {
- int cols = 1;
- bool matched_rows = false;
- for (int d = 0; d < N; d++) {
- cols *= dims.sizes[d];
- if (cols == rows) {
- matched_rows = true;
- cols = 1;
- }
- }
- DCHECK(matched_rows);
- return MatrixMap<Scalar>(data, rows, cols);
-}
-
-// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
-// BROADCASTING.
-//
-// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
-// rectangular array of numbers.
-//
-// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
-// However, as Dims<N> is to be deprecated, this class exists as an adaptor
-// to enable simple unoptimized implementations of element-wise broadcasting
-// operations.
-template<int N>
-struct NdArrayDesc {
- // The "extent" of each dimension. Indices along dimension d must be in the
- // half-open interval [0, extents[d]).
- int extents[N];
-
- // The number of *elements* (not bytes) between consecutive indices of each
- // dimension.
- int strides[N];
-};
-
-// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// ELEMENT-WISE BROADCASTING.
-//
-// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
- int i3) {
- DCHECK(i0 >= 0 && i0 < desc.extents[0]);
- DCHECK(i1 >= 0 && i1 < desc.extents[1]);
- DCHECK(i2 >= 0 && i2 < desc.extents[2]);
- DCHECK(i3 >= 0 && i3 < desc.extents[3]);
- return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
- i3 * desc.strides[3];
-}
-
-// Given the dimensions of the operands for an element-wise binary broadcast,
-// adjusts them so that they can be directly iterated over with simple loops.
-// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
-// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
-//
-// This function assumes that the two input shapes are compatible up to
-// broadcasting and the shorter one has already been prepended with 1s to be the
-// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
-// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
-// Dims<N> refers to shapes in reverse order. In this case, input0_dims will be
-// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
-//
-// When two shapes are compatible up to broadcasting, for each dimension d,
-// the input extents are either equal, or one of them is 1.
-//
-// This function performs the following for each dimension d:
-// - If the extents are equal, then do nothing since the loop that walks over
-// both of the input arrays is correct.
-// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
-// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
-// array0 to be referenced *at any index* in dimension d and still access the
-// same slice.
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
- const Dims<N>& input1_dims,
- NdArrayDesc<N>* desc0_out,
- NdArrayDesc<N>* desc1_out) {
- DCHECK(desc0_out != nullptr);
- DCHECK(desc1_out != nullptr);
-
- // Copy dims to desc.
- for (int i = 0; i < N; ++i) {
- desc0_out->extents[i] = input0_dims.sizes[i];
- desc0_out->strides[i] = input0_dims.strides[i];
- desc1_out->extents[i] = input1_dims.sizes[i];
- desc1_out->strides[i] = input1_dims.strides[i];
- }
-
- // Walk over each dimension. If the extents are equal do nothing.
- // Otherwise, set the desc with extent 1 to have extent equal to the other and
- // stride 0.
- for (int i = 0; i < N; ++i) {
- const int extent0 = ArraySize(input0_dims, i);
- const int extent1 = ArraySize(input1_dims, i);
- if (extent0 != extent1) {
- if (extent0 == 1) {
- desc0_out->strides[i] = 0;
- desc0_out->extents[i] = extent1;
- } else {
- DCHECK_EQ(extent1, 1);
- desc1_out->strides[i] = 0;
- desc1_out->extents[i] = extent0;
- }
- }
- }
-}
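A worked example of the stride-zeroing trick described above, with hypothetical shapes: a per-channel bias of logical shape (1, 1, 1, 64) broadcast against a (1, 16, 16, 64) activation. In Dims<4> order the bias extents are widened to (64, 16, 16, 1) while the strides of the broadcast dimensions become 0, so every spatial subscript re-reads the same 64-element slice. The Desc4/FlatIndex names below are stand-ins, not from this header.

#include <cstdio>

struct Desc4 { int extents[4]; int strides[4]; };

// Flattened index for a broadcast operand: a zero stride makes the
// corresponding subscript irrelevant, so the same slice is re-read.
int FlatIndex(const Desc4& d, int i0, int i1, int i2, int i3) {
  return i0 * d.strides[0] + i1 * d.strides[1] +
         i2 * d.strides[2] + i3 * d.strides[3];
}

int main() {
  // Bias of logical shape (1, 1, 1, 64) broadcast over (1, 16, 16, 64):
  // extents widened to match, strides of the broadcast dimensions set to 0.
  Desc4 bias_desc = {{64, 16, 16, 1}, {1, 0, 0, 0}};
  // Every spatial position maps channel 5 to the same bias element 5.
  std::printf("%d %d\n", FlatIndex(bias_desc, 5, 0, 0, 0),
              FlatIndex(bias_desc, 5, 7, 3, 0));  // prints "5 5"
  return 0;
}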
-
-#ifdef USE_NEON
-template <FusedActivationFunctionType Ac>
-void AddBiasAndEvalActivationFunction(const float* bias_data,
- const Dims<4>& bias_dims,
- float* array_data,
- const Dims<4>& array_dims) {
- gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
- const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
- const int array_size = array_dims.sizes[3] * array_dims.strides[3];
- DCHECK_EQ((array_size % bias_size), 0);
- float* array_ptr = array_data;
- float* array_end_ptr = array_ptr + array_size;
- const auto zero = vdupq_n_f32(0);
- const auto six = vdupq_n_f32(6);
- const auto neg_one = vdupq_n_f32(-1);
- const auto one = vdupq_n_f32(1);
- for (; array_ptr != array_end_ptr; array_ptr += bias_size) {
- int i = 0;
- for (; i <= bias_size - 16; i += 16) {
- auto b0 = vld1q_f32(bias_data + i);
- auto b1 = vld1q_f32(bias_data + i + 4);
- auto b2 = vld1q_f32(bias_data + i + 8);
- auto b3 = vld1q_f32(bias_data + i + 12);
- auto a0 = vld1q_f32(array_ptr + i);
- auto a1 = vld1q_f32(array_ptr + i + 4);
- auto a2 = vld1q_f32(array_ptr + i + 8);
- auto a3 = vld1q_f32(array_ptr + i + 12);
- auto x0 = vaddq_f32(a0, b0);
- auto x1 = vaddq_f32(a1, b1);
- auto x2 = vaddq_f32(a2, b2);
- auto x3 = vaddq_f32(a3, b3);
- if (Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6) {
- x0 = vmaxq_f32(zero, x0);
- x1 = vmaxq_f32(zero, x1);
- x2 = vmaxq_f32(zero, x2);
- x3 = vmaxq_f32(zero, x3);
- if (Ac == FusedActivationFunctionType::kRelu6) {
- x0 = vminq_f32(six, x0);
- x1 = vminq_f32(six, x1);
- x2 = vminq_f32(six, x2);
- x3 = vminq_f32(six, x3);
- }
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- x0 = vmaxq_f32(neg_one, x0);
- x1 = vmaxq_f32(neg_one, x1);
- x2 = vmaxq_f32(neg_one, x2);
- x3 = vmaxq_f32(neg_one, x3);
- x0 = vminq_f32(one, x0);
- x1 = vminq_f32(one, x1);
- x2 = vminq_f32(one, x2);
- x3 = vminq_f32(one, x3);
- }
- vst1q_f32(array_ptr + i, x0);
- vst1q_f32(array_ptr + i + 4, x1);
- vst1q_f32(array_ptr + i + 8, x2);
- vst1q_f32(array_ptr + i + 12, x3);
- }
- for (; i <= bias_size - 4; i += 4) {
- auto b = vld1q_f32(bias_data + i);
- auto a = vld1q_f32(array_ptr + i);
- auto x = vaddq_f32(a, b);
- if (Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6) {
- x = vmaxq_f32(zero, x);
- if (Ac == FusedActivationFunctionType::kRelu6) {
- x = vminq_f32(six, x);
- }
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- x = vmaxq_f32(neg_one, x);
- x = vminq_f32(one, x);
- }
- vst1q_f32(array_ptr + i, x);
- }
- for (; i < bias_size; i++) {
- array_ptr[i] = ActivationFunction<Ac>(array_ptr[i] + bias_data[i]);
- }
- }
-}
-#else // not NEON
-template <FusedActivationFunctionType Ac>
-void AddBiasAndEvalActivationFunction(const float* bias_data,
- const Dims<4>& bias_dims,
- float* array_data,
- const Dims<4>& array_dims) {
- gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
- const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
- const int array_size = array_dims.sizes[3] * array_dims.strides[3];
- DCHECK_EQ((array_size % bias_size), 0);
- for (int array_offset = 0; array_offset < array_size;
- array_offset += bias_size) {
- for (int i = 0; i < bias_size; i++) {
- array_data[array_offset + i] =
- ActivationFunction<Ac>(array_data[array_offset + i] + bias_data[i]);
- }
- }
-}
-#endif
-
-template <typename Lhs, typename Rhs, typename Result>
-void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
- Eigen::MatrixBase<Result>* result) {
- if (rhs.cols() == 1) {
- gemmlowp::ScopedProfilingLabel label("GEMV");
- result->col(0).noalias() = lhs * rhs.col(0);
- } else {
- gemmlowp::ScopedProfilingLabel label("GEMM");
- result->noalias() = lhs * rhs;
- }
-}
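The rhs.cols() == 1 special case matters because a fully-connected layer at batch size 1 degenerates to a matrix-vector product, for which Eigen's GEMV path is cheaper than a general GEMM. A self-contained sketch of the same dispatch, with made-up sizes:

#include <cassert>
#include <Eigen/Core>

int main() {
  Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(8, 16);
  Eigen::MatrixXf rhs = Eigen::MatrixXf::Random(16, 1);  // one column: GEMV case
  Eigen::MatrixXf result(8, 1);
  if (rhs.cols() == 1) {
    result.col(0).noalias() = lhs * rhs.col(0);  // matrix-vector product
  } else {
    result.noalias() = lhs * rhs;                // general matrix-matrix product
  }
  assert(result.rows() == 8 && result.cols() == 1);
  return 0;
}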
-
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const float* input_data, const Dims<4>& input_dims,
- const float* weights_data, const Dims<4>& weights_dims,
- const float* bias_data, const Dims<4>& bias_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("FullyConnected");
- // TODO(b/62193649): this convoluted shape computation (determining
- // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
- // is because the current --variable_batch hack consists in overwriting the
- // 3rd dimension with the runtime batch size, as we don't keep track for each
- // array of which dimension is the batch dimension in it.
- // When that is fixed, this should become:
- // const auto input_matrix_map =
- // MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
- const int input_rows = ArraySize(weights_dims, 0);
- const auto input_matrix_map =
- MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows);
- const auto filter_matrix_map =
- MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims);
- auto output_matrix_map =
- MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
- Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
- AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
- output_dims);
-}
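As a cross-check on the mapping above, here is a naive loop reference of what that GEMM computes for a single batch element with a fused ReLU: output[o] = max(0, bias[o] + sum_i weights[o * input_size + i] * input[i]). The function name and the weight layout (input dimension innermost) are illustrative assumptions.

#include <algorithm>
#include <cstddef>

// Naive float fully-connected for one batch element with a fused ReLU.
// Weights are assumed stored with the input dimension innermost, matching
// the row-major (output_size x input_size) view used by the GEMM above.
void FullyConnectedRef(const float* input, const float* weights,
                       const float* bias, float* output,
                       std::size_t input_size, std::size_t output_size) {
  for (std::size_t o = 0; o < output_size; ++o) {
    float acc = bias[o];
    for (std::size_t i = 0; i < input_size; ++i) {
      acc += weights[o * input_size + i] * input[i];
    }
    output[o] = std::max(0.f, acc);  // FusedActivationFunctionType::kRelu
  }
}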
-
-inline void preload_l1_stream(const uint8* ptr) {
-#ifdef GEMMLOWP_ARM_64
- asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-#else
- gemmlowp::Prefetch(ptr);
-#endif
-}
-
-#ifdef USE_NEON
-template <FusedActivationFunctionType Ac>
-void FullyConnectedAsGEMV(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- DCHECK(IsPackedWithoutStrides(input_dims));
- DCHECK(IsPackedWithoutStrides(filter_dims));
- DCHECK(IsPackedWithoutStrides(bias_dims));
- DCHECK(IsPackedWithoutStrides(output_dims));
- DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
- ArraySize(output_dims, 3),
- 1);
- const int input_size = input_dims.strides[3];
- const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
- static constexpr int kPeel = 4;
- for (int k = 0; k < input_size; k += 64) {
- preload_l1_stream(input_data + k);
- }
- for (int k = 0; k < kPeel * input_size; k += 64) {
- preload_l1_stream(filter_data + k);
- }
- DCHECK(!(output_size % kPeel));
- const int32* bias_ptr = bias_data;
- uint8* output_ptr = output_data;
- for (int out = 0; out < output_size; out += kPeel) {
- int32x4_t acc[kPeel];
- for (int k = 0; k < kPeel; k++) {
- acc[k] = vdupq_n_s32(0);
- }
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
- int in = 0;
- for (; in <= input_size - 16; in += 16) {
- const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
- uint8x16_t filter_val_u8[kPeel];
- for (int k = 0; k < kPeel; k++) {
- const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
- filter_val_u8[k] = vld1q_u8(filter_ptr);
- preload_l1_stream(filter_ptr + 64);
- }
- int16x8_t input_val[2];
- const uint8x8_t low = vget_low_u8(input_val_u8);
- const uint8x8_t high = vget_high_u8(input_val_u8);
- input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
- input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
- input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
- input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
- int16x8_t filter_val[kPeel][2];
- for (int k = 0; k < kPeel; k++) {
- const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
- const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
- filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
- filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
- filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
- filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
- }
- for (int p = 0; p < 2; p++) {
- for (int k = 0; k < kPeel; k++) {
- acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
- vget_low_s16(input_val[p]));
- }
- for (int k = 0; k < kPeel; k++) {
- acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
- vget_high_s16(input_val[p]));
- }
- }
- }
- for (; in <= input_size - 8; in += 8) {
- const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
- uint8x8_t filter_val_u8[kPeel];
- for (int k = 0; k < kPeel; k++) {
- const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
- filter_val_u8[k] = vld1_u8(filter_ptr);
- }
- int16x8_t input_val;
- input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
- input_val = vaddq_s16(input_val, input_offset_vec);
- int16x8_t filter_val[kPeel];
- for (int k = 0; k < kPeel; k++) {
- filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
- filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
- }
- for (int k = 0; k < kPeel; k++) {
- acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
- vget_low_s16(input_val));
- }
- for (int k = 0; k < kPeel; k++) {
- acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
- vget_high_s16(input_val));
- }
- }
- if (in < input_size) {
- int32 buf[4 * kPeel];
- for (int k = 0; k < 4; k++) {
- vst1q_s32(buf + 4 * k, acc[k]);
- }
- for (; in < input_size; in++) {
- int lane = (in + 8 - input_size) % 4;
- const int32 input_val = input_data[in] + input_offset;
- for (int k = 0; k < kPeel; k++) {
- int32 filter_val =
- filter_data[in + (out + k) * input_size] + filter_offset;
- buf[lane + 4 * k] += filter_val * input_val;
- }
- }
- for (int k = 0; k < 4; k++) {
- acc[k] = vld1q_s32(buf + 4 * k);
- }
- }
-
- // Horizontally reduce accumulators
- int32x2_t pairwise_reduced_acc[kPeel];
- for (int k = 0; k < kPeel; k++) {
- pairwise_reduced_acc[k] =
- vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
- }
- static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
- const int32x2_t reduced_lo =
- vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
- const int32x2_t reduced_hi =
- vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
- int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
- // Add bias values.
- int32x4_t bias_vec = vld1q_s32(bias_ptr);
- bias_ptr += 4;
- reduced = vaddq_s32(reduced, bias_vec);
- // Multiply by the fixed-point multiplier.
- reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
- // Rounding-shift-right.
- using gemmlowp::RoundingDivideByPOT;
- reduced = RoundingDivideByPOT(reduced, output_shift);
- // Add the output offset.
- const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
- reduced = vaddq_s32(reduced, output_offset_vec);
- // Narrow values down to 16 bit signed.
- const int16x4_t res16 = vqmovn_s32(reduced);
- // Narrow values down to 8 bit unsigned, saturating.
- uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16));
- if (Ac != FusedActivationFunctionType::kNone) {
- // Apply the clamping from the activation function
- res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
- res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
- }
-    // Store results to destination. Assumes 32-bit alignment.
- vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
- vreinterpret_u32_u8(res8), 0);
- output_ptr += kPeel;
- }
-}
-#endif // USE_NEON
-
-template <FusedActivationFunctionType Ac>
-struct GemmlowpOutputPipeline {
- typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
- ColVectorMap;
- typedef std::tuple<
- gemmlowp::OutputStageBiasAddition<ColVectorMap>,
- gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
- gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
- Pipeline;
- static Pipeline Make(const int32* bias_data, int output_rows,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max) {
- ColVectorMap bias_vector(bias_data, output_rows);
- gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
- bias_addition_stage.bias_vector = bias_vector;
- gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
- quantize_down_stage;
- quantize_down_stage.result_offset_after_shift = output_offset;
- quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
- quantize_down_stage.result_shift = output_shift;
- gemmlowp::OutputStageClamp clamp_stage;
- clamp_stage.min = output_activation_min;
- clamp_stage.max = output_activation_max;
- gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
- return std::make_tuple(bias_addition_stage, quantize_down_stage,
- clamp_stage, saturating_cast_stage);
- }
-};
-
-template <>
-struct GemmlowpOutputPipeline<FusedActivationFunctionType::kNone> {
- typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
- ColVectorMap;
- typedef std::tuple<
- gemmlowp::OutputStageBiasAddition<ColVectorMap>,
- gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
- gemmlowp::OutputStageSaturatingCastToUint8>
- Pipeline;
- static Pipeline Make(const int32* bias_data, int output_rows,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max) {
- DCHECK_EQ(output_activation_min, 0);
- DCHECK_EQ(output_activation_max, 255);
- ColVectorMap bias_vector(bias_data, output_rows);
- gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
- bias_addition_stage.bias_vector = bias_vector;
- gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
- quantize_down_stage;
- quantize_down_stage.result_offset_after_shift = output_offset;
- quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
- quantize_down_stage.result_shift = output_shift;
- gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
- return std::make_tuple(bias_addition_stage, quantize_down_stage,
- saturating_cast_stage);
- }
-};
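A scalar sketch (not gemmlowp itself, and ignoring its saturation corner case) of the quantize-down sequence these pipelines configure: bias addition, multiplication by a Q31 fixed-point multiplier, a rounding right shift, the output offset, clamping, and the final cast to uint8.

#include <algorithm>
#include <cstdint>

// Scalar sketch of the output pipeline configured above.  'multiplier' is a
// Q31 fixed-point value in [2^30, 2^31) and 'shift' is result_shift >= 0.
std::uint8_t QuantizeDown(std::int32_t acc, std::int32_t bias,
                          std::int32_t multiplier, int shift,
                          std::int32_t offset, std::int32_t act_min,
                          std::int32_t act_max) {
  const std::int64_t ab = static_cast<std::int64_t>(acc + bias) * multiplier;
  // Rounding "doubling high mul": keep the top 32 bits of 2 * ab.
  const std::int64_t nudge =
      ab >= 0 ? (std::int64_t{1} << 30) : 1 - (std::int64_t{1} << 30);
  std::int32_t x = static_cast<std::int32_t>((2 * ab + nudge) >> 31);
  // Rounding divide by 2^shift (ties rounded up; gemmlowp rounds ties away
  // from zero, which only differs on exact midpoints).
  if (shift > 0) x = (x + (1 << (shift - 1))) >> shift;
  const std::int32_t out = x + offset;
  return static_cast<std::uint8_t>(std::min(act_max, std::max(act_min, out)));
}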
-
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims,
- gemmlowp::GemmContext* gemm_context) {
- gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- // TODO: This really should be:
- // const int batches = ArraySize(output_dims, 1);
- // but the current --variable_batch hack consists in overwriting the 3rd
- // dimension with the runtime batch size, as we don't keep track for each
- // array of which dimension is the batch dimension in it.
- const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
- ArraySize(output_dims, 3);
-#ifdef USE_NEON
- const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
- if (batches == 1 && !(output_size % 4)) {
- return FullyConnectedAsGEMV<Ac>(
- input_data, input_dims, input_offset, filter_data, filter_dims,
- filter_offset, bias_data, bias_dims, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_data,
- output_dims);
- }
-#endif // USE_NEON
- const int filter_rows = filter_dims.sizes[1];
- const int filter_cols = filter_dims.sizes[0];
- DCHECK_EQ(filter_dims.sizes[2], 1);
- DCHECK_EQ(filter_dims.sizes[3], 1);
- const int output_rows = output_dims.sizes[0];
- DCHECK_EQ(output_rows, filter_rows);
- DCHECK_EQ(bias_dims.sizes[0], output_rows);
- DCHECK_EQ(bias_dims.sizes[1], 1);
- DCHECK_EQ(bias_dims.sizes[2], 1);
- DCHECK_EQ(bias_dims.sizes[3], 1);
-
- gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
- filter_data, output_rows, filter_cols, filter_cols);
- gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
- input_data, filter_cols, batches, filter_cols);
- gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
- output_data, output_rows, batches, output_rows);
- const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make(
- bias_data, output_rows, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max);
- gemmlowp::GemmWithOutputPipeline<uint8, uint8,
- gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
- gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
- input_offset, output_pipeline);
-}
-
-template <typename T>
-inline void ExtractPatchIntoBufferColumn(
- const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
- int stride_width, int stride_height, int pad_width, int pad_height,
- int in_width, int in_height, int in_depth, int single_buffer_length,
- int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) {
- gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
- // This chunk of code reshapes all the inputs corresponding to
- // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
- const int kwidth_times_indepth = kwidth * in_depth;
- const int inwidth_times_indepth = in_width * in_depth;
- const int ih_ungated_start = h * stride_height - pad_height;
- const int ih_ungated_end = (ih_ungated_start + kheight);
- const int ih_end = std::min(ih_ungated_end, in_height);
- const int iw_ungated_start = w * stride_width - pad_width;
- const int iw_ungated_end = (iw_ungated_start + kwidth);
- const int iw_end = std::min(iw_ungated_end, in_width);
- // If the patch is off the edge of the input image, skip writing those rows
- // and columns from the patch into the output array.
- const int h_offset = std::max(0, -ih_ungated_start);
- const int w_offset = std::max(0, -iw_ungated_start);
- const int ih_start = std::max(0, ih_ungated_start);
- const int iw_start = std::max(0, iw_ungated_start);
- const int single_row_num =
- std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
- const int output_row_offset = (buffer_id * single_buffer_length);
- int out_offset =
- output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
- int in_offset = Offset(input_dims, 0, iw_start, ih_start, b);
-
- // Express all of the calculations as padding around the input patch.
- const int top_padding = h_offset;
- const int bottom_padding = (ih_ungated_end - ih_end);
- const int left_padding = w_offset;
- const int right_padding = (iw_ungated_end - iw_end);
- assert(single_row_num ==
- ((kwidth - (left_padding + right_padding)) * in_depth));
-
- // Write out zeroes to the elements representing the top rows of the input
- // patch that are off the edge of the input image.
- if (top_padding > 0) {
- const int top_row_elements = (top_padding * kwidth * in_depth);
- memset(conv_buffer_data + output_row_offset, byte_zero,
- (top_row_elements * sizeof(T)));
- }
-
- // If the patch is on the interior of the input image horizontally, just copy
- // over the rows sequentially, otherwise add zero padding at the start or end.
- if ((left_padding == 0) && (right_padding == 0)) {
- for (int ih = ih_start; ih < ih_end; ++ih) {
- memcpy(conv_buffer_data + out_offset, in_data + in_offset,
- single_row_num * sizeof(T));
- out_offset += kwidth_times_indepth;
- in_offset += inwidth_times_indepth;
- }
- } else {
- for (int ih = ih_start; ih < ih_end; ++ih) {
- if (left_padding > 0) {
- const int left_start = (out_offset - (left_padding * in_depth));
- memset(conv_buffer_data + left_start, byte_zero,
- (left_padding * in_depth * sizeof(T)));
- }
- memcpy(conv_buffer_data + out_offset, in_data + in_offset,
- single_row_num * sizeof(T));
- if (right_padding > 0) {
- const int right_start = (out_offset + single_row_num);
- memset(conv_buffer_data + right_start, byte_zero,
- (right_padding * in_depth * sizeof(T)));
- }
- out_offset += kwidth_times_indepth;
- in_offset += inwidth_times_indepth;
- }
- }
-
- // If the bottom of the patch falls off the input image, pad the values
- // representing those input rows with zeroes.
- if (bottom_padding > 0) {
- const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
- const int bottom_start =
- output_row_offset +
- ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
- memset(conv_buffer_data + bottom_start, byte_zero,
- (bottom_row_elements * sizeof(T)));
- }
-}
-
-template <typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
- int stride_height, int pad_width, int pad_height, int kheight,
- int kwidth, uint8 byte_zero, T* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Im2col");
- DCHECK(IsPackedWithoutStrides(input_dims));
- DCHECK(IsPackedWithoutStrides(output_dims));
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int input_depth = ArraySize(input_dims, 0);
- const int input_width = ArraySize(input_dims, 1);
- const int input_height = ArraySize(input_dims, 2);
- const int output_depth = ArraySize(output_dims, 0);
- const int output_width = ArraySize(output_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
-
- int buffer_id = 0;
- // Loop over the output nodes.
- for (int b = 0; b < batches; ++b) {
- for (int h = 0; h < output_height; ++h) {
- for (int w = 0; w < output_width; ++w) {
- ExtractPatchIntoBufferColumn(
- input_dims, w, h, b, kheight, kwidth, stride_width, stride_height,
- pad_width, pad_height, input_width, input_height, input_depth,
- output_depth, buffer_id, input_data, output_data, byte_zero);
- ++buffer_id;
- }
- }
- }
-}
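A tiny standalone illustration of the im2col idea above, with made-up numbers: each output position of a 2x2, stride-1 convolution over a single-channel 3x3 input contributes one 4-element column, so the convolution itself becomes a GEMM of the flattened kernel against this patch matrix.

#include <cstdio>

int main() {
  const int in[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  int patches[4][4];  // 4 kernel taps x 4 output positions
  int col = 0;
  for (int oh = 0; oh < 2; ++oh) {
    for (int ow = 0; ow < 2; ++ow, ++col) {
      int row = 0;
      for (int kh = 0; kh < 2; ++kh)
        for (int kw = 0; kw < 2; ++kw)
          patches[row++][col] = in[oh + kh][ow + kw];
    }
  }
  // First column is the top-left patch: 1 2 4 5.
  std::printf("%d %d %d %d\n", patches[0][0], patches[1][0], patches[2][0],
              patches[3][0]);
  return 0;
}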
-
-template <FusedActivationFunctionType Ac>
-void Conv(const float* input_data, const Dims<4>& input_dims,
- const float* filter_data, const Dims<4>& filter_dims,
- const float* bias_data, const Dims<4>& bias_dims, int stride_width,
- int stride_height, int pad_width, int pad_height, float* output_data,
- const Dims<4>& output_dims, float* im2col_data,
- const Dims<4>& im2col_dims) {
- (void)im2col_data;
- (void)im2col_dims;
- gemmlowp::ScopedProfilingLabel label("Conv");
-
- const float* gemm_input_data = nullptr;
- const Dims<4>* gemm_input_dims = nullptr;
- const int filter_width = ArraySize(filter_dims, 1);
- const int filter_height = ArraySize(filter_dims, 2);
- const bool need_im2col = stride_width != 1 || stride_height != 1 ||
- filter_width != 1 || filter_height != 1;
- if (need_im2col) {
- DCHECK(im2col_data);
- Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
- pad_height, filter_height, filter_width, 0, im2col_data,
- im2col_dims);
- gemm_input_data = im2col_data;
- gemm_input_dims = &im2col_dims;
- } else {
-#if 0 // TODO-NNRT: Check whether this is needed; 'im2col_data' seems to always be non-null.
- DCHECK(!im2col_data);
-#endif
- gemm_input_data = input_data;
- gemm_input_dims = &input_dims;
- }
-
- const auto im2col_matrix_map =
- MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims);
- const auto filter_matrix_map =
- MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
- auto output_matrix_map =
- MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
- Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
-
- AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
- output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-void Conv(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims, int stride_width,
- int stride_height, int pad_width, int pad_height, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
- const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
- gemmlowp::ScopedProfilingLabel label("Conv/8bit");
-
- DCHECK(IsPackedWithoutStrides(input_dims));
- DCHECK(IsPackedWithoutStrides(filter_dims));
- DCHECK(IsPackedWithoutStrides(output_dims));
-
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
-
- const uint8* gemm_input_data = nullptr;
- const Dims<4>* gemm_input_dims = nullptr;
- const int filter_width = ArraySize(filter_dims, 1);
- const int filter_height = ArraySize(filter_dims, 2);
- const bool need_im2col = stride_width != 1 || stride_height != 1 ||
- filter_width != 1 || filter_height != 1;
- if (need_im2col) {
- DCHECK(im2col_data);
- const int input_zero_point = -input_offset;
- DCHECK_GE(input_zero_point, 0);
- DCHECK_LE(input_zero_point, 255);
- Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
- pad_height, filter_height, filter_width, input_zero_point,
- im2col_data, im2col_dims);
- gemm_input_data = im2col_data;
- gemm_input_dims = &im2col_dims;
- } else {
-#if 0 // TODO-NNRT: Check whether this is needed; 'im2col_data' seems to always be non-null.
- DCHECK(!im2col_data);
-#endif
- gemm_input_data = input_data;
- gemm_input_dims = &input_dims;
- }
-
- const int gemm_input_rows = gemm_input_dims->sizes[0];
- const int gemm_input_cols = gemm_input_dims->sizes[1] *
- gemm_input_dims->sizes[2] *
- gemm_input_dims->sizes[3];
- const int filter_rows = filter_dims.sizes[3];
- const int filter_cols =
- filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
- const int output_rows = output_dims.sizes[0];
- const int output_cols =
- output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
- DCHECK_EQ(output_rows, filter_rows);
- DCHECK_EQ(output_cols, gemm_input_cols);
- DCHECK_EQ(filter_cols, gemm_input_rows);
- DCHECK_EQ(bias_dims.sizes[0], output_rows);
- DCHECK_EQ(bias_dims.sizes[1], 1);
- DCHECK_EQ(bias_dims.sizes[2], 1);
- DCHECK_EQ(bias_dims.sizes[3], 1);
- gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
- filter_data, filter_rows, filter_cols);
- gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
- gemm_input_data, gemm_input_rows, gemm_input_cols);
- gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
- output_data, output_rows, output_cols);
- const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make(
- bias_data, output_rows, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max);
- gemmlowp::GemmWithOutputPipeline<uint8, uint8,
- gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
- gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
- input_offset, output_pipeline);
-}
-
-template <typename T>
-inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
- int block_size, T* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("DepthToSpace");
-
- const int input_depth = ArraySize(input_dims, 0);
- const int input_width = ArraySize(input_dims, 1);
- const int input_height = ArraySize(input_dims, 2);
-
- const int output_depth = ArraySize(output_dims, 0);
- const int batch_size = ArraySize(output_dims, 3);
-
-  // Number of contiguous values that we can copy in one iteration.
- const int stride = block_size * output_depth;
-
- for (int batch = 0; batch < batch_size; ++batch) {
- for (int in_h = 0; in_h < input_height; ++in_h) {
- const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch);
- for (int offset_h = 0; offset_h < block_size; ++offset_h) {
- const T* src = input_ptr;
- for (int in_w = 0; in_w < input_width; ++in_w) {
- memcpy(output_data, src, stride * sizeof(T));
- output_data += stride;
- src += input_depth;
- }
- input_ptr += stride;
- }
- }
- }
-}
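Worked shape example for DepthToSpace (values are illustrative): with block_size 2, an NHWC tensor of shape (1, 1, 1, 4) becomes (1, 2, 2, 1); in general (N, H, W, B*B*C) becomes (N, H*B, W*B, C).

#include <cstdio>

int main() {
  const int input[4] = {1, 2, 3, 4};  // N=1, H=1, W=1, C=4
  int output[2][2];                   // N=1, H=2, W=2, C=1
  const int block = 2;
  for (int h = 0; h < block; ++h)
    for (int w = 0; w < block; ++w)
      output[h][w] = input[h * block + w];
  std::printf("%d %d / %d %d\n", output[0][0], output[0][1],
              output[1][0], output[1][1]);  // prints "1 2 / 3 4"
  return 0;
}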
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
- int pad_width, int pad_height, int kheight, int kwidth,
- uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
- Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
- kwidth, byte_zero, output_data, output_dims);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
- const float* filter_data, const Dims<4>& filter_dims,
- const float* bias_data, const Dims<4>& bias_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("ConvAsGemm");
-
- const auto input_matrix_map =
- MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
- const auto filter_matrix_map =
- MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
- auto output_matrix_map =
- MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
- Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
-
- AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
- output_dims);
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims,
- int32 output_offset, int32 output_multiplier, int output_shift,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims,
- gemmlowp::GemmContext* gemm_context) {
- gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- const int input_rows = input_dims.sizes[0];
- const int input_cols =
- input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3];
- const int filter_rows = filter_dims.sizes[3];
- const int filter_cols =
- filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
- const int output_rows = output_dims.sizes[0];
- const int output_cols =
- output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
- DCHECK_EQ(output_rows, filter_rows);
- DCHECK_EQ(output_cols, input_cols);
- DCHECK_EQ(filter_cols, input_rows);
- DCHECK_EQ(bias_dims.sizes[0], output_rows);
- DCHECK_EQ(bias_dims.sizes[1], 1);
- DCHECK_EQ(bias_dims.sizes[2], 1);
- DCHECK_EQ(bias_dims.sizes[3], 1);
- gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
- filter_data, output_rows, filter_cols, filter_cols);
- gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
- input_data, filter_cols, output_cols, filter_cols);
- gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
- output_data, output_rows, output_cols, output_rows);
- const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make(
- bias_data, output_rows, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max);
- gemmlowp::GemmWithOutputPipeline<uint8, uint8,
- gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
- gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
- input_offset, output_pipeline);
-}
-
-template <typename T>
-inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
- int block_size, T* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("SpaceToDepth");
-
- const int output_depth = ArraySize(output_dims, 0);
- const int output_width = ArraySize(output_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
-
- const int input_depth = ArraySize(input_dims, 0);
- const int batch_size = ArraySize(input_dims, 3);
-
-  // Number of contiguous values that we can copy in one iteration.
- const int stride = block_size * input_depth;
-
- for (int batch = 0; batch < batch_size; ++batch) {
- for (int out_h = 0; out_h < output_height; ++out_h) {
- T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch);
- for (int offset_h = 0; offset_h < block_size; ++offset_h) {
- T* dst = output_ptr;
- for (int out_w = 0; out_w < output_width; ++out_w) {
- memcpy(dst, input_data, stride * sizeof(T));
- input_data += stride;
- dst += output_depth;
- }
- output_ptr += stride;
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void NonGlobalBatchNormalization(
- const float* input_data, const Dims<4>& input_dims, const float* mean_data,
- const Dims<4>& mean_dims, const float* multiplier_data,
- const Dims<4>& multiplier_dims, const float* offset_data,
- const Dims<4>& offset_dims, float* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height =
- MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
- offset_dims, 2, output_dims, 2);
- const int width =
- MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
- offset_dims, 1, output_dims, 1);
- const int depth =
- MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
- offset_dims, 0, output_dims, 0);
-
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
- (input_data[Offset(input_dims, c, x, y, b)] -
- mean_data[Offset(mean_dims, c, x, y, 0)]) *
- multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
- offset_data[Offset(offset_dims, c, x, y, 0)]);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void GlobalBatchNormalization(const float* input_data,
- const Dims<4>& input_dims, const float* mean_data,
- const Dims<4>& mean_dims,
- const float* multiplier_data,
- const Dims<4>& multiplier_dims,
- const float* offset_data,
- const Dims<4>& offset_dims, float* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth =
- MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
- offset_dims, 0, output_dims, 0);
-
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
- (input_data[Offset(input_dims, c, x, y, b)] -
- mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
- multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
- offset_data[Offset(offset_dims, c, 0, 0, 0)]);
- }
- }
- }
- }
-}
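Both batch-normalization variants apply the same per-element affine transform, out = Ac((x - mean) * multiplier + offset). How 'multiplier' is produced is up to the caller; folding the learned scale and 1/sqrt(variance + epsilon) into it, as sketched below, is an assumption about typical usage rather than something this header enforces.

#include <cmath>

// Per-element batch-norm transform, as applied in the loops above.
inline float BatchNorm(float x, float mean, float multiplier, float offset) {
  return (x - mean) * multiplier + offset;
}

// One way a caller might fold variance and a learned scale into 'multiplier'
// (an assumption about usage, not something this header requires).
inline float FoldMultiplier(float scale, float variance, float epsilon) {
  return scale / std::sqrt(variance + epsilon);
}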
-
-inline void Relu(const float* input_data, const Dims<4>& input_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Relu (not fused)");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- float val = input_data[Offset(input_dims, c, x, y, b)];
- const float lower = 0;
- float clamped = val < lower ? lower : val;
- output_data[Offset(output_dims, c, x, y, b)] = clamped;
- }
- }
- }
- }
-}
-
-inline void Relu1(const float* input_data, const Dims<4>& input_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- float val = input_data[Offset(input_dims, c, x, y, b)];
- const float upper = 1;
- const float lower = -1;
- float clamped = val > upper ? upper : val < lower ? lower : val;
- output_data[Offset(output_dims, c, x, y, b)] = clamped;
- }
- }
- }
- }
-}
-
-inline void Relu6(const float* input_data, const Dims<4>& input_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- float val = input_data[Offset(input_dims, c, x, y, b)];
- const float upper = 6;
- const float lower = 0;
- float clamped = val > upper ? upper : val < lower ? lower : val;
- output_data[Offset(output_dims, c, x, y, b)] = clamped;
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void L2Normalization(const float* input_data, const Dims<4>& input_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("L2Normalization");
- static_assert(Ac == FusedActivationFunctionType::kNone, "");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- float squared_l2_norm = 0;
- for (int c = 0; c < depth; ++c) {
- float val = input_data[Offset(input_dims, c, x, y, b)];
- squared_l2_norm += val * val;
- }
- float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm);
- for (int c = 0; c < depth; ++c) {
- output_data[Offset(output_dims, c, x, y, b)] =
- input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm;
- }
- }
- }
- }
-}
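A scalar reference of the per-pixel normalization above: each depth vector is divided by its Euclidean norm. Matching the loop above, no epsilon is added, so an all-zero vector would divide by zero.

#include <cmath>
#include <cstddef>

void L2NormalizeVector(const float* in, float* out, std::size_t depth) {
  float sum_sq = 0.f;
  for (std::size_t c = 0; c < depth; ++c) sum_sq += in[c] * in[c];
  const float inv_norm = 1.f / std::sqrt(sum_sq);
  for (std::size_t c = 0; c < depth; ++c) out[c] = in[c] * inv_norm;
}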
-
-inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
- int* output_shift) {
- *output_shift = 11;
- while (input >= (1 << 29)) {
- input /= 4;
- ++*output_shift;
- }
- DCHECK_GT(input, 0);
- const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
- const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
- const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
- *output_shift -= left_shift_bit_pairs;
- input <<= 2 * left_shift_bit_pairs;
- DCHECK_GE(input, (1 << 27));
- DCHECK_LT(input, (1 << 29));
- using gemmlowp::FixedPoint;
- using gemmlowp::Rescale;
- using gemmlowp::SaturatingRoundingMultiplyByPOT;
- // Using 3 integer bits gives us enough room for the internal arithmetic in
- // this Newton-Raphson iteration.
- using F3 = FixedPoint<int32, 3>;
- using F0 = FixedPoint<int32, 0>;
- const F3 fixedpoint_input = F3::FromRaw(input >> 1);
- const F3 fixedpoint_half_input =
- SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
- const F3 fixedpoint_half_three =
- GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
- // Newton-Raphson iteration
- // Naive unoptimized starting guess: x = 1
- F3 x = F3::One();
- // Naive unoptimized number of iterations: 5
- for (int i = 0; i < 5; i++) {
- const F3 x3 = Rescale<3>(x * x * x);
- x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
- }
- const F0 fixedpoint_half_sqrt_2 =
- GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
- x = x * fixedpoint_half_sqrt_2;
- *output_inv_sqrt = x.raw();
- if (*output_shift < 0) {
- *output_inv_sqrt <<= -*output_shift;
- *output_shift = 0;
- }
-}
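The fixed-point routine above runs the classic Newton-Raphson recurrence for the inverse square root, x_{n+1} = x_n * (1.5 - 0.5 * a * x_n^2), which converges to 1/sqrt(a). A floating-point version of the same five iterations, with an illustrative input value:

#include <cmath>
#include <cstdio>

int main() {
  const double a = 0.37;  // illustrative input
  double x = 1.0;         // same naive starting guess as above
  for (int i = 0; i < 5; ++i) x = x * (1.5 - 0.5 * a * x * x);
  std::printf("%f vs %f\n", x, 1.0 / std::sqrt(a));  // both ~1.644
  return 0;
}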
-
-inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_zero_point, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- DCHECK(IsPackedWithoutStrides(input_dims));
- DCHECK(IsPackedWithoutStrides(output_dims));
- DCHECK_EQ(batches, 1);
- DCHECK_EQ(height, 1);
- DCHECK_EQ(width, 1);
- int32 square_l2_norm = 0;
- for (int i = 0; i < depth; i++) {
- int32 diff = input_data[i] - input_zero_point;
- square_l2_norm += diff * diff;
- }
- int32 inv_l2norm_multiplier;
- int inv_l2norm_shift;
- GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
- &inv_l2norm_shift);
-
- for (int i = 0; i < depth; i++) {
- int32 diff = input_data[i] - input_zero_point;
- int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
- 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
- int32 unclamped_output_val = 128 + rescaled_diff;
- int32 output_val = std::min(255, std::max(0, unclamped_output_val));
- output_data[i] = static_cast<uint8>(output_val);
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Add");
- /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
- output_dims, 3);
- /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
- output_dims, 2);
- /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
- output_dims, 1);
- /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
- output_dims, 0);
- DCHECK(IsPackedWithoutStrides(input1_dims));
- DCHECK(IsPackedWithoutStrides(input2_dims));
- DCHECK(IsPackedWithoutStrides(output_dims));
-
- int i = 0;
- const int size = input1_dims.sizes[3] * input1_dims.strides[3];
-#ifdef USE_NEON
- const auto zero = vdupq_n_f32(0);
- const auto six = vdupq_n_f32(6);
- const auto neg_one = vdupq_n_f32(-1);
- const auto one = vdupq_n_f32(1);
- for (; i <= size - 16; i += 16) {
- auto a10 = vld1q_f32(input1_data + i);
- auto a11 = vld1q_f32(input1_data + i + 4);
- auto a12 = vld1q_f32(input1_data + i + 8);
- auto a13 = vld1q_f32(input1_data + i + 12);
- auto a20 = vld1q_f32(input2_data + i);
- auto a21 = vld1q_f32(input2_data + i + 4);
- auto a22 = vld1q_f32(input2_data + i + 8);
- auto a23 = vld1q_f32(input2_data + i + 12);
- auto x0 = vaddq_f32(a10, a20);
- auto x1 = vaddq_f32(a11, a21);
- auto x2 = vaddq_f32(a12, a22);
- auto x3 = vaddq_f32(a13, a23);
- if (Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6) {
- x0 = vmaxq_f32(zero, x0);
- x1 = vmaxq_f32(zero, x1);
- x2 = vmaxq_f32(zero, x2);
- x3 = vmaxq_f32(zero, x3);
- if (Ac == FusedActivationFunctionType::kRelu6) {
- x0 = vminq_f32(six, x0);
- x1 = vminq_f32(six, x1);
- x2 = vminq_f32(six, x2);
- x3 = vminq_f32(six, x3);
- }
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- x0 = vmaxq_f32(neg_one, x0);
- x1 = vmaxq_f32(neg_one, x1);
- x2 = vmaxq_f32(neg_one, x2);
- x3 = vmaxq_f32(neg_one, x3);
- x0 = vminq_f32(one, x0);
- x1 = vminq_f32(one, x1);
- x2 = vminq_f32(one, x2);
- x3 = vminq_f32(one, x3);
- }
- vst1q_f32(output_data + i, x0);
- vst1q_f32(output_data + i + 4, x1);
- vst1q_f32(output_data + i + 8, x2);
- vst1q_f32(output_data + i + 12, x3);
- }
- for (; i <= size - 4; i += 4) {
- auto a1 = vld1q_f32(input1_data + i);
- auto a2 = vld1q_f32(input2_data + i);
- auto x = vaddq_f32(a1, a2);
- if (Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6) {
- x = vmaxq_f32(zero, x);
- if (Ac == FusedActivationFunctionType::kRelu6) {
- x = vminq_f32(six, x);
- }
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- x = vmaxq_f32(neg_one, x);
- x = vminq_f32(one, x);
- }
- vst1q_f32(output_data + i, x);
- }
-#endif // NEON
-
- for (; i < size; i++) {
- auto x = input1_data[i] + input2_data[i];
- output_data[i] = ActivationFunction<Ac>(x);
- }
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier, int input2_shift,
- int32 output_offset, int32 output_multiplier, int output_shift,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- DCHECK_EQ(output_activation_min, 0);
- DCHECK_EQ(output_activation_max, 255);
- }
- gemmlowp::ScopedProfilingLabel label("Add/8bit");
- /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
- output_dims, 3);
- /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
- output_dims, 2);
- /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
- output_dims, 1);
- /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
- output_dims, 0);
- DCHECK(IsPackedWithoutStrides(input1_dims));
- DCHECK(IsPackedWithoutStrides(input2_dims));
- DCHECK(IsPackedWithoutStrides(output_dims));
-
- int i = 0;
- const int size = input1_dims.sizes[3] * input1_dims.strides[3];
- DCHECK_GT(input1_offset, -256);
- DCHECK_GT(input2_offset, -256);
- DCHECK_LT(input1_offset, 256);
- DCHECK_LT(input2_offset, 256);
-#ifdef USE_NEON
- for (; i <= size - 8; i += 8) {
- const auto input1_val_original = vld1_u8(input1_data + i);
- const auto input2_val_original = vld1_u8(input2_data + i);
- const auto input1_val_s16 =
- vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
- const auto input2_val_s16 =
- vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
- const auto input1_val =
- vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
- const auto input2_val =
- vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
- const auto input1_val_high = vget_high_s16(input1_val);
- const auto input1_val_low = vget_low_s16(input1_val);
- const auto input2_val_high = vget_high_s16(input2_val);
- const auto input2_val_low = vget_low_s16(input2_val);
- auto x11 = vmovl_s16(input1_val_low);
- auto x12 = vmovl_s16(input1_val_high);
- auto x21 = vmovl_s16(input2_val_low);
- auto x22 = vmovl_s16(input2_val_high);
- const auto left_shift_dup = vdupq_n_s32(left_shift);
- x11 = vshlq_s32(x11, left_shift_dup);
- x12 = vshlq_s32(x12, left_shift_dup);
- x21 = vshlq_s32(x21, left_shift_dup);
- x22 = vshlq_s32(x22, left_shift_dup);
- x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
- x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
- x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
- x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
- const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
- const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
- x11 = vshlq_s32(x11, input1_shift_dup);
- x12 = vshlq_s32(x12, input1_shift_dup);
- x21 = vshlq_s32(x21, input2_shift_dup);
- x22 = vshlq_s32(x22, input2_shift_dup);
- auto s1 = vaddq_s32(x11, x21);
- auto s2 = vaddq_s32(x12, x22);
- s1 = vqrdmulhq_n_s32(s1, output_multiplier);
- s2 = vqrdmulhq_n_s32(s2, output_multiplier);
- using gemmlowp::RoundingDivideByPOT;
- s1 = RoundingDivideByPOT(s1, output_shift);
- s2 = RoundingDivideByPOT(s2, output_shift);
- const auto s1_narrowed = vmovn_s32(s1);
- const auto s2_narrowed = vmovn_s32(s2);
- const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
- vdupq_n_s16(output_offset));
- vst1_u8(output_data + i, vqmovun_s16(s));
- }
-#endif // NEON
-
- for (; i < size; i++) {
- const int32 input1_val = input1_offset + input1_data[i];
- const int32 input2_val = input2_offset + input2_data[i];
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
- const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
- shifted_input1_val, input1_multiplier, input1_shift);
- const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
- shifted_input2_val, input2_multiplier, input2_shift);
- const int32 raw_sum = scaled_input1_val + scaled_input2_val;
- const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne(
- raw_sum, output_multiplier, output_shift) +
- output_offset;
- const int32 clamped_output = std::min(
- output_activation_max, std::max(output_activation_min, raw_output));
- output_data[i] = static_cast<uint8>(clamped_output);
- }
-}
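The scalar tail above is the whole quantized-add recipe in miniature: offset both inputs, shift them onto a common finer grid, rescale each with its own multiplier, add, then requantize with the output multiplier, shift and offset. In real-number terms it approximates the float reference below; the scale and zero-point values in this sketch are illustrative assumptions, not taken from this diff.

#include <algorithm>
#include <cmath>
#include <cstdint>

std::uint8_t AddQuantizedRef(std::uint8_t q1, std::uint8_t q2) {
  const float s1 = 0.05f, s2 = 0.05f, s_out = 0.1f;  // quantization scales
  const int z1 = 128, z2 = 128, z_out = 128;         // zero points
  // Dequantize, add in real numbers, requantize into the output's grid.
  const float real = s1 * (q1 - z1) + s2 * (q2 - z2);
  const int q = static_cast<int>(std::round(real / s_out)) + z_out;
  return static_cast<std::uint8_t>(std::min(255, std::max(0, q)));
}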
-
-// TODO: We can implement BroadcastAdd on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO: BroadcastAdd is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <FusedActivationFunctionType Ac>
-void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In TensorFlow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), and with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has the smallest stride.
-  //
-  // We name our variables by the TensorFlow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
- input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
- const Dims<4>& input1_dims, int32 input1_offset,
- int32 input1_multiplier, int input1_shift,
- const uint8* input2_data, const Dims<4>& input2_dims,
- int32 input2_offset, int32 input2_multiplier,
- int input2_shift, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- DCHECK_EQ(output_activation_min, 0);
- DCHECK_EQ(output_activation_max, 255);
- }
- gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In TensorFlow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), and with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has the smallest stride.
-  //
-  // We name our variables by the TensorFlow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- const int32 input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
- const int32 input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
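-          // Both inputs are left-shifted first to gain precision, then rescaled
-          // to a common scale by their per-input multipliers so that values
-          // quantized with different scales can be added directly.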
- const int32 shifted_input1_val = input1_val * (1 << left_shift);
- const int32 shifted_input2_val = input2_val * (1 << left_shift);
- const int32 scaled_input1_val =
- MultiplyByQuantizedMultiplierSmallerThanOne(
- shifted_input1_val, input1_multiplier, input1_shift);
- const int32 scaled_input2_val =
- MultiplyByQuantizedMultiplierSmallerThanOne(
- shifted_input2_val, input2_multiplier, input2_shift);
- const int32 raw_sum = scaled_input1_val + scaled_input2_val;
- const int32 raw_output =
- MultiplyByQuantizedMultiplierSmallerThanOne(
- raw_sum, output_multiplier, output_shift) +
- output_offset;
- const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, raw_output));
- output_data[Offset(output_dims, c, x, y, b)] =
- static_cast<uint8>(clamped_output);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void Mul(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Mul");
- /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
- output_dims, 3);
- /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
- output_dims, 2);
- /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
- output_dims, 1);
- /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
- output_dims, 0);
- DCHECK(IsPackedWithoutStrides(input1_dims));
- DCHECK(IsPackedWithoutStrides(input2_dims));
- DCHECK(IsPackedWithoutStrides(output_dims));
-
- int i = 0;
- const int size = input1_dims.sizes[3] * input1_dims.strides[3];
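-  // The arrays are packed, so sizes[3] * strides[3] is the total element count.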
-#ifdef USE_NEON
- const auto zero = vdupq_n_f32(0);
- const auto six = vdupq_n_f32(6);
- const auto neg_one = vdupq_n_f32(-1);
- const auto one = vdupq_n_f32(1);
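-  // Main NEON loop: 16 floats per iteration (four 128-bit registers), with the
-  // fused activation applied in-register before the stores.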
- for (; i <= size - 16; i += 16) {
- auto a10 = vld1q_f32(input1_data + i);
- auto a11 = vld1q_f32(input1_data + i + 4);
- auto a12 = vld1q_f32(input1_data + i + 8);
- auto a13 = vld1q_f32(input1_data + i + 12);
- auto a20 = vld1q_f32(input2_data + i);
- auto a21 = vld1q_f32(input2_data + i + 4);
- auto a22 = vld1q_f32(input2_data + i + 8);
- auto a23 = vld1q_f32(input2_data + i + 12);
- auto x0 = vmulq_f32(a10, a20);
- auto x1 = vmulq_f32(a11, a21);
- auto x2 = vmulq_f32(a12, a22);
- auto x3 = vmulq_f32(a13, a23);
- if (Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6) {
- x0 = vmaxq_f32(zero, x0);
- x1 = vmaxq_f32(zero, x1);
- x2 = vmaxq_f32(zero, x2);
- x3 = vmaxq_f32(zero, x3);
- if (Ac == FusedActivationFunctionType::kRelu6) {
- x0 = vminq_f32(six, x0);
- x1 = vminq_f32(six, x1);
- x2 = vminq_f32(six, x2);
- x3 = vminq_f32(six, x3);
- }
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- x0 = vmaxq_f32(neg_one, x0);
- x1 = vmaxq_f32(neg_one, x1);
- x2 = vmaxq_f32(neg_one, x2);
- x3 = vmaxq_f32(neg_one, x3);
- x0 = vminq_f32(one, x0);
- x1 = vminq_f32(one, x1);
- x2 = vminq_f32(one, x2);
- x3 = vminq_f32(one, x3);
- }
- vst1q_f32(output_data + i, x0);
- vst1q_f32(output_data + i + 4, x1);
- vst1q_f32(output_data + i + 8, x2);
- vst1q_f32(output_data + i + 12, x3);
- }
- for (; i <= size - 4; i += 4) {
- auto a1 = vld1q_f32(input1_data + i);
- auto a2 = vld1q_f32(input2_data + i);
- auto x = vmulq_f32(a1, a2);
- if (Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6) {
- x = vmaxq_f32(zero, x);
- if (Ac == FusedActivationFunctionType::kRelu6) {
- x = vminq_f32(six, x);
- }
- } else if (Ac == FusedActivationFunctionType::kRelu1) {
- x = vmaxq_f32(neg_one, x);
- x = vminq_f32(one, x);
- }
- vst1q_f32(output_data + i, x);
- }
-#endif  // USE_NEON
-
- for (; i < size; i++) {
- auto x = input1_data[i] * input2_data[i];
- output_data[i] = ActivationFunction<Ac>(x);
- }
-}
-
-// TODO: We can implement BroadcastMul on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO: BroadcastMul is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <FusedActivationFunctionType Ac>
-void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
- const float* input2_data, const Dims<4>& input2_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("BroadcastMul");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In TensorFlow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has the smallest stride.
-  //
-  // We name our variables by their TensorFlow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
- input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
- input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
- int32 input1_offset, const uint8* input2_data,
- const Dims<4>& input2_dims, int32 input2_offset,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- DCHECK_EQ(output_activation_min, 0);
- DCHECK_EQ(output_activation_max, 255);
- }
- gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
-
- NdArrayDesc<4> desc1;
- NdArrayDesc<4> desc2;
- NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In TensorFlow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has the smallest stride.
-  //
-  // We name our variables by their TensorFlow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
- for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
- for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
- for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
- const int32 input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
- const int32 input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
- const int32 unclamped_result =
- output_offset +
- MultiplyByQuantizedMultiplierSmallerThanOne(
- input1_val * input2_val, output_multiplier, output_shift);
- const int32 clamped_output =
- std::min(output_activation_max,
- std::max(output_activation_min, unclamped_result));
- output_data[Offset(output_dims, c, x, y, b)] =
- static_cast<uint8>(clamped_output);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac, typename Scalar>
-void Concatenation(int concat_dim, const Scalar* const* input_data,
- const Dims<4>* const* input_dims, int inputs_count,
- Scalar* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Concatenation");
- DCHECK_GT(inputs_count, 1);
- int concat_size = 0;
- for (int i = 0; i < inputs_count; i++) {
- for (int j = 0; j < 4; j++) {
- if (j != concat_dim) {
- MatchingArraySize(*input_dims[i], j, output_dims, j);
- }
- }
- concat_size += ArraySize(*input_dims[i], concat_dim);
- }
- DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
- DCHECK(IsPackedWithoutStrides(output_dims));
-  // For now we don't have a model with a Concatenation
-  // with a fused activation function.
- DCHECK(Ac == FusedActivationFunctionType::kNone);
- int outer_size = 1;
- for (int i = concat_dim + 1; i < 4; i++) {
- outer_size *= output_dims.sizes[i];
- }
- Scalar* output_ptr = output_data;
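-  // For each outer index, copy one contiguous chunk from every input in turn;
-  // each chunk spans the concat dimension and everything below it, i.e.
-  // sizes[concat_dim] * strides[concat_dim] elements.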
- for (int k = 0; k < outer_size; k++) {
- for (int i = 0; i < inputs_count; ++i) {
- const int copy_size =
- input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
- memcpy(output_ptr, input_data[i] + k * copy_size,
- copy_size * sizeof(Scalar));
- output_ptr += copy_size;
- }
- }
-}
-
-template <FusedActivationFunctionType Ac, typename Scalar>
-void DepthConcatenation(const Scalar* const* input_data,
- const Dims<4>* const* input_dims, int inputs_count,
- Scalar* output_data, const Dims<4>& output_dims) {
- Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
- output_data, output_dims);
-}
-
-inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
- const float* prev_activ_data,
- const Dims<4>& prev_activ_dims, const float* weights_data,
- const Dims<4>& weights_dims, const float* bias_data,
- const Dims<4>& bias_dims, const float* prev_state_data,
- const Dims<4>& prev_state_dims, float* output_state_data,
- const Dims<4>& output_state_dims, float* output_activ_data,
- const Dims<4>& output_activ_dims, float* concat_temp_data,
- const Dims<4>& concat_temp_dims, float* activ_temp_data,
- const Dims<4>& activ_temp_dims) {
- gemmlowp::ScopedProfilingLabel label("LstmCell");
- MatchingArraySize( // batches
- input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims,
- 3, output_activ_dims, 3);
- MatchingArraySize( // height
- input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims,
- 2, output_activ_dims, 2);
- MatchingArraySize( // width
- input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims,
- 1, output_activ_dims, 1);
- CHECK_EQ(ArraySize(weights_dims, 2), 1);
- CHECK_EQ(ArraySize(weights_dims, 3), 1);
- const int input_depth = ArraySize(input_dims, 0);
- const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
- const int total_input_depth = prev_activ_depth + input_depth;
- CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
- CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), 1);
- const int intern_activ_depth = MatchingArraySize(
- weights_dims, 1,
- bias_dims, 0);
- CHECK_EQ(intern_activ_depth % 4, 0);
- const int output_depth = MatchingArraySize(
- prev_state_dims, 0,
- prev_activ_dims, 0,
- output_state_dims, 0,
- output_activ_dims, 0);
- CHECK_EQ(output_depth, intern_activ_depth / 4);
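-  // The fully-connected output below packs the four gate pre-activations
-  // (input gate, new input, forget gate, output gate) along the depth axis,
-  // hence intern_activ_depth == 4 * output_depth.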
-
- // Concatenate prev_activ and input data together
- std::vector<float const*> concat_input_arrays_data;
- std::vector<Dims<4> const*> concat_input_arrays_dims;
- concat_input_arrays_data.push_back(input_data);
- concat_input_arrays_data.push_back(prev_activ_data);
- concat_input_arrays_dims.push_back(&input_dims);
- concat_input_arrays_dims.push_back(&prev_activ_dims);
- Concatenation<FusedActivationFunctionType::kNone, float>(
- 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
- concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
-
- // Fully connected
- FullyConnected<FusedActivationFunctionType::kNone>(
- concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
- bias_dims, activ_temp_data, activ_temp_dims);
-
- // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
- // operations.
- ArrayMap<float> activ_temp_map =
- MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims);
- auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
- activ_temp_map.cols());
- auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
- activ_temp_map.cols());
- auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
- activ_temp_map.cols());
- auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
- activ_temp_map.cols());
- ArrayMap<const float> prev_state_map =
- MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims);
- ArrayMap<float> output_state_map =
- MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims);
- ArrayMap<float> output_activ_map =
- MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims);
-
- // Combined memory state and final output calculation
- gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
- output_state_map =
- input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
- new_input_sm.tanh() +
- forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
- prev_state_map;
- output_activ_map =
- output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
- output_state_map.tanh();
-}
-
-template <FusedActivationFunctionType Ac, typename Scalar>
-void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
- int outputs_count, Scalar* const* output_data,
- const Dims<4>* const* output_dims) {
- gemmlowp::ScopedProfilingLabel label("TensorFlowSplit");
- DCHECK_GE(outputs_count, 1);
- for (int i = 0; i < outputs_count; i++) {
- /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
- /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
- /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
- }
- const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
- const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
- const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
- DCHECK(IsPackedWithoutStrides(input_dims));
-  // For now we don't have a model with a TensorFlowSplit
-  // with a fused activation function.
- DCHECK(Ac == FusedActivationFunctionType::kNone);
- const int whb = width * height * batches;
- const Scalar* input_ptr = input_data;
- for (int k = 0; k < whb; k++) {
- for (int i = 0; i < outputs_count; ++i) {
- memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr,
- output_dims[i]->sizes[0] * sizeof(Scalar));
- input_ptr += output_dims[i]->sizes[0];
- }
- }
-}
-
-inline int NodeOffset(int b, int h, int w, int height, int width) {
- return (b * height + h) * width + w;
-}
-
-template <FusedActivationFunctionType Ac>
-void AveragePool(const float* input_data, const Dims<4>& input_dims,
- int stride_width, int stride_height,
- int pad_width, int pad_height, int kwidth, int kheight,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("AveragePool");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-
- const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
- auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
- // TODO: get rid of the dynamic memory allocation here!
- Eigen::VectorXf out_count(out_mat.cols());
- out_count.setZero();
- // Prefill the output to 0.
- out_mat.setZero();
- for (int b = 0; b < batches; ++b) {
- for (int h = 0; h < input_height; ++h) {
- for (int w = 0; w < input_width; ++w) {
- // (h_start, h_end) * (w_start, w_end) is the range that the input
- // vector projects to.
- int hpad = h + pad_height;
- int wpad = w + pad_width;
- int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
- int h_end = std::min(hpad / stride_height + 1, output_height);
- int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
- int w_end = std::min(wpad / stride_width + 1, output_width);
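-        // An input at (h, w) contributes to every output (ph, pw) whose window
-        // covers it, i.e. ph * stride <= h + pad < ph * stride + kernel, which
-        // yields the [h_start, h_end) and [w_start, w_end) ranges above.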
- // compute elementwise sum
- for (int ph = h_start; ph < h_end; ++ph) {
- for (int pw = w_start; pw < w_end; ++pw) {
- int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
- out_mat.col(out_offset) +=
- in_mat.col(NodeOffset(b, h, w, input_height, input_width));
- out_count(out_offset)++;
- }
- }
- }
- }
- }
- // Divide the output by the actual number of elements being averaged over
- DCHECK_GT(out_count.minCoeff(), 0);
- out_mat.array().rowwise() /= out_count.transpose().array();
-
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < output_height; ++y) {
- for (int x = 0; x < output_width; ++x) {
- for (int c = 0; c < depth; ++c) {
- output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
- output_data[Offset(output_dims, c, x, y, b)]);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
- int stride_width, int stride_height,
- int pad_width, int pad_height, int filter_width,
- int filter_height, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- DCHECK_EQ(output_activation_min, 0);
- DCHECK_EQ(output_activation_max, 255);
- }
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
- for (int batch = 0; batch < batches; ++batch) {
- for (int out_y = 0; out_y < output_height; ++out_y) {
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
- const int filter_x_start = std::max(0, -in_x_origin);
- const int filter_x_end =
- std::min(filter_width, input_width - in_x_origin);
- const int filter_y_start = std::max(0, -in_y_origin);
- const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
- const int filter_count =
- (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
- // TODO: Add a dynamic buffer allocation path instead of hardcoded size.
- static constexpr int kAccBufferMaxSize = 2048;
- DCHECK_LE(depth, kAccBufferMaxSize);
- uint16 acc[kAccBufferMaxSize];
- memset(acc, 0, depth * sizeof(acc[0]));
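-        // uint16 accumulators are used here; this assumes the filter area is
-        // small enough that 255 * filter_count still fits in 16 bits.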
- const uint8* input_ptr =
- input_data + input_dims.strides[1] * in_x_origin +
- input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch;
- for (int fy = filter_y_start; fy < filter_y_end; fy++) {
- const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
- filter_x_start * input_dims.strides[1];
- for (int fx = filter_x_start; fx < filter_x_end; fx++) {
- int channel = 0;
-#ifdef USE_NEON
- for (; channel <= depth - 16; channel += 16) {
- uint16x8_t acc_reg[2];
- for (int i = 0; i < 2; i++) {
- acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
- }
- uint8x16_t input_reg = vld1q_u8(input_row_ptr);
- input_row_ptr += 16;
- acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
- acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
- for (int i = 0; i < 2; i++) {
- vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
- }
- }
- for (; channel <= depth - 8; channel += 8) {
- uint16x8_t acc_reg = vld1q_u16(acc + channel);
- uint8x8_t input_reg = vld1_u8(input_row_ptr);
- input_row_ptr += 8;
- acc_reg = vaddw_u8(acc_reg, input_reg);
- vst1q_u16(acc + channel, acc_reg);
- }
-#endif
- for (; channel < depth; ++channel) {
- acc[channel] += *input_row_ptr++;
- }
- }
- }
- uint8* output_ptr =
- output_data + Offset(output_dims, 0, out_x, out_y, batch);
- int channel = 0;
-#ifdef USE_NEON
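-// Specialized paths for common filter areas (e.g. 3x3 -> 9, 3x5 -> 15),
-// presumably so the division uses a compile-time constant divisor.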
-#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
- if (filter_count == FILTER_COUNT) { \
- for (; channel <= depth - 8; channel += 8) { \
- uint16 buf[8]; \
- for (int i = 0; i < 8; i++) { \
- buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
- } \
- uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
- buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); \
- buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); \
- vst1_u8(output_ptr + channel, buf8); \
- } \
- }
- AVGPOOL_DIVIDING_BY(9)
- AVGPOOL_DIVIDING_BY(15)
-#undef AVGPOOL_DIVIDING_BY
- for (; channel <= depth - 8; channel += 8) {
- uint16 buf[8];
- for (int i = 0; i < 8; i++) {
- buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
- }
- uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
- buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));
- buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));
- vst1_u8(output_ptr + channel, buf8);
- }
-#endif
- for (; channel < depth; ++channel) {
- uint16 a = (acc[channel] + filter_count / 2) / filter_count;
- a = std::max<uint16>(a, output_activation_min);
- a = std::min<uint16>(a, output_activation_max);
- output_ptr[channel] = static_cast<uint8>(a);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void MaxPool(const float* input_data, const Dims<4>& input_dims,
- int stride_width, int stride_height,
- int pad_width, int pad_height, int kwidth, int kheight,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("MaxPool");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-
- const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
- auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
- // Prefill the output to minimum representable float value
- out_mat.setConstant(std::numeric_limits<float>::lowest());
- for (int b = 0; b < batches; ++b) {
- for (int h = 0; h < input_height; ++h) {
- for (int w = 0; w < input_width; ++w) {
- // (h_start, h_end) * (w_start, w_end) is the range that the input
- // vector projects to.
- int hpad = h + pad_height;
- int wpad = w + pad_width;
- int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
- int h_end = std::min(hpad / stride_height + 1, output_height);
- int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
- int w_end = std::min(wpad / stride_width + 1, output_width);
-        // compute elementwise max
- for (int ph = h_start; ph < h_end; ++ph) {
- for (int pw = w_start; pw < w_end; ++pw) {
- int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
- out_mat.col(out_offset) =
- out_mat.col(out_offset)
- .cwiseMax(in_mat.col(
- NodeOffset(b, h, w, input_height, input_width)));
- }
- }
- }
- }
- }
-
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < output_height; ++y) {
- for (int x = 0; x < output_width; ++x) {
- for (int c = 0; c < depth; ++c) {
- output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
- output_data[Offset(output_dims, c, x, y, b)]);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
- int stride_width, int stride_height,
- int pad_width, int pad_height, int filter_width, int filter_height,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- DCHECK_LE(output_activation_min, output_activation_max);
- if (Ac == FusedActivationFunctionType::kNone) {
- DCHECK_EQ(output_activation_min, 0);
- DCHECK_EQ(output_activation_max, 255);
- }
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
- for (int batch = 0; batch < batches; ++batch) {
- for (int out_y = 0; out_y < output_height; ++out_y) {
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
- const int filter_x_start = std::max(0, -in_x_origin);
- const int filter_x_end =
- std::min(filter_width, input_width - in_x_origin);
- const int filter_y_start = std::max(0, -in_y_origin);
- const int filter_y_end =
- std::min(filter_height, input_height - in_y_origin);
- // TODO: Add a dynamic buffer allocation path instead of hardcoded size.
- static constexpr int kAccBufferMaxSize = 2048;
- DCHECK_LE(depth, kAccBufferMaxSize);
- uint8 acc[kAccBufferMaxSize];
- memset(acc, 0, depth * sizeof(acc[0]));
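-        // The running max per channel is kept in uint8 and initialized to 0,
-        // the smallest possible uint8 value.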
- const uint8* input_ptr =
- input_data + input_dims.strides[1] * in_x_origin +
- input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch;
- for (int fy = filter_y_start; fy < filter_y_end; fy++) {
- const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
- filter_x_start * input_dims.strides[1];
- for (int fx = filter_x_start; fx < filter_x_end; fx++) {
- int channel = 0;
-#ifdef USE_NEON
- for (; channel <= depth - 16; channel += 16) {
- uint8x16_t acc_reg = vld1q_u8(acc + channel);
- uint8x16_t input_reg = vld1q_u8(input_row_ptr);
- input_row_ptr += 16;
- acc_reg = vmaxq_u8(acc_reg, input_reg);
- vst1q_u8(acc + channel, acc_reg);
- }
-
- for (; channel <= depth - 8; channel += 8) {
- uint8x8_t acc_reg = vld1_u8(acc + channel);
- uint8x8_t input_reg = vld1_u8(input_row_ptr);
- input_row_ptr += 8;
- acc_reg = vmax_u8(acc_reg, input_reg);
- vst1_u8(acc + channel, acc_reg);
- }
-#endif
- for (; channel < depth; ++channel) {
- acc[channel] = std::max(acc[channel], *input_row_ptr++);
- }
- }
- }
- uint8* output_ptr =
- output_data + Offset(output_dims, 0, out_x, out_y, batch);
- int channel = 0;
-#ifdef USE_NEON
- for (; channel <= depth - 16; channel += 16) {
- uint8x16_t a = vld1q_u8(acc + channel);
- a = vminq_u8(a, vdupq_n_u8(output_activation_max));
- a = vmaxq_u8(a, vdupq_n_u8(output_activation_min));
- vst1q_u8(output_ptr + channel, a);
- }
- for (; channel <= depth - 8; channel += 8) {
- uint8x8_t a = vld1_u8(acc + channel);
- a = vmin_u8(a, vdup_n_u8(output_activation_max));
- a = vmax_u8(a, vdup_n_u8(output_activation_min));
- vst1_u8(output_ptr + channel, a);
- }
-#endif
- for (; channel < depth; ++channel) {
- uint8 a = acc[channel];
- a = std::max<uint8>(a, output_activation_min);
- a = std::min<uint8>(a, output_activation_max);
- output_ptr[channel] = static_cast<uint8>(a);
- }
- }
- }
- }
-}
-
-template <FusedActivationFunctionType Ac>
-void L2Pool(const float* input_data, const Dims<4>& input_dims,
- int stride_width, int stride_height,
- int pad_width, int pad_height, int filter_width, int filter_height,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("L2Pool");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
-  // Actually carry out L2 Pool. Code is written in forward mode: we go through
-  // the input values once, and write to all the pooled regions that each value
-  // maps to.
- const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
- auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
- Eigen::VectorXf in_square(in_mat.rows());
- Eigen::VectorXf out_count(out_mat.cols());
- out_count.setZero();
- // Prefill the output to 0.
- out_mat.setZero();
- for (int b = 0; b < batches; ++b) {
- for (int h = 0; h < input_height; ++h) {
- for (int w = 0; w < input_width; ++w) {
- // (h_start, h_end) * (w_start, w_end) is the range that the input
- // vector projects to.
- const int hpad = h + pad_height;
- const int wpad = w + pad_width;
- const int h_start =
- (hpad < filter_height) ? 0 : (hpad - filter_height) / stride_height + 1;
- const int h_end = std::min(hpad / stride_height + 1, output_height);
- const int w_start =
- (wpad < filter_width) ? 0 : (wpad - filter_width) / stride_width + 1;
- const int w_end = std::min(wpad / stride_width + 1, output_width);
- // pre-compute square
- const int in_offset = w + input_width * (h + input_height * b);
- in_square =
- in_mat.col(in_offset).array() * in_mat.col(in_offset).array();
- // compute elementwise sum of squares
- for (int ph = h_start; ph < h_end; ++ph) {
- for (int pw = w_start; pw < w_end; ++pw) {
- const int out_offset = pw + output_width * (ph + output_height * b);
- out_mat.col(out_offset) += in_square;
- out_count(out_offset)++;
- }
- }
- }
- }
- }
-
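-  // Each output is the root mean square of its pooling window:
-  // sqrt(sum_of_squares / count).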
- out_count = out_count.array().inverse();
- out_mat =
- (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt();
-}
-
-inline void LocalResponseNormalization(const float* input_data,
- const Dims<4>& input_dims, int range,
- float bias, float alpha, float beta,
- float* output_data,
- const Dims<4>& output_dims) {
- /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
- /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
- /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
- /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
-
- const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
- auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
-
- // Carry out local response normalization, vector by vector.
-  // Since the data are stored column-major, a row-wise operation would
-  // probably not be memory-efficient anyway, so we do an explicit for loop
-  // over the columns.
- const int double_range = range * 2;
- Eigen::VectorXf padded_square(data_in.rows() + double_range);
- padded_square.setZero();
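-  // padded_square holds alpha * x^2 with 'range' zeros of padding on each
-  // side, so the sliding-window sum below needs no bounds checks.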
- for (int r = 0; r < data_in.cols(); ++r) {
- // Do local response normalization for data_in(:, r)
-    // First, compute the squares and store them in a buffer for repeated use.
- padded_square.block(range, 0, data_in.rows(), 1) =
- data_in.col(r).cwiseProduct(data_in.col(r)) * alpha;
-    // Then, compute the scales and write them to data_out.
- float accumulated_scale = 0;
- for (int i = 0; i < double_range; ++i) {
- accumulated_scale += padded_square(i);
- }
- for (int i = 0; i < data_in.rows(); ++i) {
- accumulated_scale += padded_square(i + double_range);
- data_out(i, r) = bias + accumulated_scale;
- accumulated_scale -= padded_square(i);
- }
- }
-
- // In a few cases, the pow computation could benefit from speedups.
- if (beta == 1) {
- data_out.array() = data_in.array() * data_out.array().inverse();
- } else if (beta == 0.5) {
- data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
- } else {
- data_out.array() = data_in.array() * data_out.array().pow(-beta);
- }
-}
-
-inline void Softmax(const float* input_data, const Dims<4>& input_dims,
- float beta, float* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Softmax");
- /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
- /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
- /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
- /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
-
- const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
- auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
- // Compute the exponential first, removing the max coefficient for numerical
- // stability.
- out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta;
- // We are separating out the exp function so that exp can be vectorized.
- out_mat = out_mat.array().exp();
- // Normalize to get the activations.
- Eigen::Array<float, 1, Eigen::Dynamic> scale =
- out_mat.array().colwise().sum().inverse();
- out_mat.array().rowwise() *= scale;
-}
-
-inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_beta_multiplier, int32 input_beta_left_shift,
- int diff_min, uint8* output_data,
- const Dims<4>& output_dims) {
- // The representation chosen for the input to the exp() function is Q5.26.
- // We need to leave extra space since values that we skip might be as large as
- // -32 before multiplying by input_beta_multiplier, and therefore as large as
- // -16 afterwards. Note that exp(-8) is definitely not insignificant to
- // accumulation, but exp(-16) definitely is.
- static const int kScaledDiffIntegerBits = 5;
- static const int kAccumulationIntegerBits = 12;
- using FixedPointScaledDiff =
- gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
- using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
- using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-
- gemmlowp::ScopedProfilingLabel label("Softmax");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-
- for (int b = 0; b < batches; ++b) {
- for (int x = 0; x < width; ++x) {
- for (int y = 0; y < height; ++y) {
- uint8 max_in_row = 0;
- for (int c = 0; c < depth; ++c) {
- max_in_row =
- std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
- }
-
- FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
- for (int c = 0; c < depth; ++c) {
- int32 input_diff =
- static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
- max_in_row;
- if (input_diff >= diff_min) {
- const int32 input_diff_rescaled =
- MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
- sum_of_exps =
- sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
- exp_on_negative_values(scaled_diff_f8));
- }
- }
-
- int32 fixed_sum_of_exps = sum_of_exps.raw();
- // TODO: Use a NEON intrinsic like vclzq_u32 instead.
- int headroom_plus_one =
- __builtin_clz(static_cast<uint32>(fixed_sum_of_exps));
- // This is the number of bits to the left of the binary point above 1.0.
- // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
- // no later adjustment will be needed.
- int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
- int32 shifted_sum_minus_one = static_cast<int32>(
- (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
- (static_cast<uint32>(1) << 31));
-
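-        // The shifted sum lies in [1, 2) as a Q0.31 value, so
-        // shifted_sum_minus_one is that value minus 1, in [0, 1), which is the
-        // domain expected by one_over_one_plus_x_for_x_in_0_1 below.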
- FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
- FixedPoint0::FromRaw(shifted_sum_minus_one));
-
- for (int c = 0; c < depth; ++c) {
- int32 input_diff =
- static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
- max_in_row;
- if (input_diff >= diff_min) {
- const int32 input_diff_rescaled =
- MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
- FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
- int32 unsat_output = gemmlowp::RoundingDivideByPOT(
- (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
-
- output_data[Offset(output_dims, c, x, y, b)] =
- std::max(std::min(unsat_output, 255), 0);
-
- } else {
- output_data[Offset(output_dims, c, x, y, b)] = 0;
- }
- }
- }
- }
- }
-}
-
-inline void Logistic(const float* input_data, const Dims<4>& input_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Logistic");
- auto input_map = MapAsVector(input_data, input_dims);
- auto output_map = MapAsVector(output_data, output_dims);
- output_map.array() =
- input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>());
-}
-
-inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_zero_point, int32 input_range_radius,
- int32 input_multiplier, int input_left_shift,
- uint8* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Logistic");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
- const int32 input_val_centered =
- static_cast<int32>(input_val_u8) - input_zero_point;
- uint8 output_val;
- if (input_val_centered < -input_range_radius) {
- output_val = 0;
- } else if (input_val_centered > input_range_radius) {
- output_val = 255;
- } else {
- const int32 input_val_rescaled =
- MultiplyByQuantizedMultiplierGreaterThanOne(
- input_val_centered, input_multiplier, input_left_shift);
- using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
- using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
- const FixedPoint4 input_val_f4 =
- FixedPoint4::FromRaw(input_val_rescaled);
- const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
- using gemmlowp::RoundingDivideByPOT;
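-            // output_val_f0 is a Q0.31 value in [0, 1); dividing by 2^23
-            // rescales it to [0, 256], and the 256 corner case is clamped to
-            // 255 below.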
- int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
- if (output_val_s32 == 256) {
- output_val_s32 = 255;
- }
- DCHECK_GE(output_val_s32, 0);
- DCHECK_LE(output_val_s32, 255);
- output_val = static_cast<uint8>(output_val_s32);
- }
- output_data[Offset(output_dims, c, x, y, b)] = output_val;
- }
- }
- }
- }
-}
-
-inline void Tanh(const float* input_data, const Dims<4>& input_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Tanh");
- auto input_map = MapAsVector(input_data, input_dims);
- auto output_map = MapAsVector(output_data, output_dims);
- output_map.array() = input_map.array().tanh();
-}
-
-inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
- int32 zero_point, double scale, float* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Dequantize");
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- int32 val = input_data[Offset(input_dims, c, x, y, b)];
- float result = static_cast<float>(scale * (val - zero_point));
- output_data[Offset(output_dims, c, x, y, b)] = result;
- }
- }
- }
- }
-}
-
-inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
- float rmin, float rmax, float* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("FakeQuant");
-
- // 0 should always be a representable value. Let's assume that the initial
- // min,max range contains 0.
- DCHECK_LE(rmin, 0.);
- DCHECK_GE(rmax, 0.);
-
- // Determine quantization parameters: zero_point, scale.
- using Integer = uint8;
- const Integer qmin = std::numeric_limits<Integer>::min();
- const Integer qmax = std::numeric_limits<Integer>::max();
- const float qmin_float = qmin;
- const float qmax_float = qmax;
- int32 zero_point = 0;
- float scale = 0.f;
- // If rmin==rmax, both must be zero per the above assertion,
- // so we are done.
- if (rmin != rmax) {
- // First determine the scale.
- scale = (rmax - rmin) / (qmax_float - qmin_float);
-
- // Zero-point computation.
- // First the initial floating-point computation. The zero-point can be
- // determined from solving an affine equation for any known pair
- // (real value, corresponding quantized value).
- // We know two such pairs: (rmin, qmin) and (rmax, qmax).
- // The arithmetic error on the zero point computed from either pair
- // will be roughly machine_epsilon * (sum of absolute values of terms)
- // so we want to use the variant that adds the smaller terms.
- const float zero_point_from_min = qmin_float - rmin / scale;
- const float zero_point_from_max = qmax_float - rmax / scale;
- const float zero_point_from_min_error =
- std::abs(qmin_float) + std::abs(rmin / scale);
- const float zero_point_from_max_error =
- std::abs(qmax_float) + std::abs(rmax / scale);
-
- const float zero_point_float =
- zero_point_from_min_error < zero_point_from_max_error
- ? zero_point_from_min
- : zero_point_from_max;
-
- // Now we need to nudge the zero point to be an integer
- // (our zero points are integer, and this is motivated by the requirement
- // to be able to represent the real value "0" exactly as a quantized value,
- // which is required in multiple places, for example in Im2col with SAME
- // padding).
- if (zero_point_float < qmin_float) {
- zero_point = qmin;
- } else if (zero_point_float > qmax_float) {
- zero_point = qmax;
- } else {
- zero_point = static_cast<int32>(std::round(zero_point_float));
- }
- // The zero point should always be in the range of quantized value,
- // [qmin, qmax].
- DCHECK_GE(zero_point, qmin);
- DCHECK_LE(zero_point, qmax);
- }
-
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
- const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
- const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- for (int c = 0; c < depth; ++c) {
- const float src_val = input_data[Offset(input_dims, c, x, y, b)];
- const float unclamped_quantized_val =
- std::round(zero_point + src_val / scale);
- const float quantized_val = std::min(
- qmax_float, std::max(qmin_float, unclamped_quantized_val));
- const float dst_val = scale * (quantized_val - zero_point);
- output_data[Offset(output_dims, c, x, y, b)] = dst_val;
- }
- }
- }
- }
-}
-
-template <typename SrcT, typename DstT>
-inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
- DstT* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Cast");
- auto input_map = MapAsVector(input_data, input_dims);
- auto output_map = MapAsVector(output_data, output_dims);
- output_map.array() = input_map.array().template cast<DstT>();
-}
-
-inline void Floor(const float* input_data, const Dims<4>& input_dims,
- float* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Floor");
- auto input_map = MapAsVector(input_data, input_dims);
- auto output_map = MapAsVector(output_data, output_dims);
- output_map.array() = Eigen::floor(input_map.array());
-}
-
-template <typename T>
-inline void Gather(const T* input_data, const Dims<4>& input_dims,
- const int32* coords_data, const Dims<4>& coords_dims,
- T* output_data, const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("Gather");
- DCHECK_EQ(RequiredBufferSizeForDims(output_dims),
- RequiredBufferSizeForDims(coords_dims));
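-  // This implementation gathers individual elements by flattened index: each
-  // entry of coords_data addresses one element of the flattened input buffer.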
- for (int i = 0; i < RequiredBufferSizeForDims(coords_dims); i++) {
- DCHECK_GE(coords_data[i], 0);
- DCHECK_LT(coords_data[i], RequiredBufferSizeForDims(input_dims));
- output_data[i] = input_data[coords_data[i]];
- }
-}
-
-inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
- const int32* output_size_data,
- const Dims<4>& output_size_dims, float* output_data,
- const Dims<4>& output_dims) {
- gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
- int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- int32 input_height = ArraySize(input_dims, 2);
- int32 input_width = ArraySize(input_dims, 1);
- int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-
- DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
- DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
- DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
- DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
- int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
- int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
- float height_scale = static_cast<float>(input_height) / output_height;
- float width_scale = static_cast<float>(input_width) / output_width;
-
- for (int b = 0; b < batches; ++b) {
- for (int y = 0; y < output_height; ++y) {
- float input_y = y * height_scale;
- int32 y0 = static_cast<int32>(input_y);
- int32 y1 = std::min(y0 + 1, input_height - 1);
- for (int x = 0; x < output_width; ++x) {
- float input_x = x * width_scale;
- int32 x0 = static_cast<int32>(input_x);
- int32 x1 = std::min(x0 + 1, input_width - 1);
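-        // Blend the four nearest input pixels, weighting each by the
-        // fractional distances (input_x - x0) and (input_y - y0).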
- for (int c = 0; c < depth; ++c) {
- float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
- (1 - (input_y - y0)) *
- (1 - (input_x - x0)) +
- input_data[Offset(input_dims, c, x0, y1, b)] *
- (input_y - y0) * (1 - (input_x - x0)) +
- input_data[Offset(input_dims, c, x1, y0, b)] *
- (1 - (input_y - y0)) * (input_x - x0) +
- input_data[Offset(input_dims, c, x1, y1, b)] *
- (input_y - y0) * (input_x - x0);
- output_data[Offset(output_dims, c, x, y, b)] = interpolation;
- }
- }
- }
- }
-}
-
-} // namespace optimized_ops
-} // namespace rt
-} // namespace nnfw
-
-#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
-#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
-#pragma GCC diagnostic pop
-#endif
-
-#endif // __NNFW_RT_OPTIMIZED_OPS_H__
diff --git a/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h b/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h
deleted file mode 100644
index bf659d0a3..000000000
--- a/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_TENSOR_UTILS_IMPL_H__
-#define __NNFW_RT_TENSOR_UTILS_IMPL_H__
-
-#include "ActivationFunctor.h"
-
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
-#endif // USE_NEON
-
-namespace nnfw {
-namespace rt {
-namespace tensor_utils {
-
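-// This header declares both the portable reference kernels and their
-// NEON-optimized counterparts.
-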
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector.
-void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
- int m_rows, int m_cols,
- const float* vector,
- int n_batch, float* result,
- int result_stride);
-void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
- int m_cols, const float* vector,
- int n_batch, float* result,
- int result_stride);
-
-// Cwise product of two vectors.
-void PortableVectorVectorCwiseProduct(const float* vector1,
- const float* vector2, int v_size,
- float* result);
-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
- int v_size, float* result);
-
-// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
-// assumption here is that the result array is initialized to valid values.
-void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
- const float* vector2,
- int v_size, float* result);
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
- const float* vector2, int v_size,
- float* result);
-
-// Dot product of two vectors.
-float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
- int v_size);
-
-// Dot product of two batch vectors.
-void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
- const float* vector2, int v_size,
- int n_batch, float* result,
- int result_stride);
-
-// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
-// operation, the assumption here is that the result array is initialized to
-// valid values.
-void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
- int v_size,
- const float* batch_vector,
- int n_batch,
- float* result);
-void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
- int v_size,
- const float* batch_vector,
- int n_batch, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void PortableSub1Vector(const float* vector, int v_size, float* result);
-void NeonSub1Vector(const float* vector, int v_size, float* result);
-
-// Clip elements of a vector using an abs_limit value.
-void PortableClipVector(const float* vector, int v_size, float abs_limit,
- float* result);
-void NeonClipVector(const float* vector, int v_size, float abs_limit,
- float* result);
-
-// Batch vector initialization with another vector.
-void PortableVectorBatchVectorAssign(const float* vector, int v_size,
- int n_batch, float* batch_vector);
-
-// Apply sigmoid to elements of a vector.
-void PortableApplySigmoidToVector(const float* vector, int v_size,
- float* result);
-
-// Apply activation function to elements of a vector.
-void PortableApplyActivationToVector(const float* vector, int v_size,
- ActivationFn activation,
- float* result);
-
-// Copy vector to another vector.
-void PortableCopyVector(const float* vector, int v_size, float* result);
-
-// Fill vector with 0.f.
-void PortableZeroVector(float* vector, int v_size);
-
-// Limit a float input f between +abs_limit and -abs_limit.
-float PortableClip(float f, float abs_limit);
-
-// Shift left a vector in place with v_size size.
-void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
-
-// Reduce-sum on a float input vector:
-// input_vector: float pointer to input vector.
-// input_stride: input vector stride.
-// output_vector: float pointer to output vector.
-// output_size: output vector size.
-// reduction_size: number of consecutive elements from input vector which are
-// added to get one element of output.
-void PortableReductionSumVector(const float* input_vector, int input_stride,
- float* output_vector, int output_size,
- int reduction_size);
-} // namespace tensor_utils
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_TENSOR_UTILS_IMPL_H__
diff --git a/runtimes/nn/common/operations/internal/tensor_utils.cc b/runtimes/nn/common/operations/internal/tensor_utils.cc
deleted file mode 100644
index 78275bb29..000000000
--- a/runtimes/nn/common/operations/internal/tensor_utils.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tensor_utils.h"
-
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
-#endif // USE_NEON
-
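-// Pick the NEON-accelerated kernels when NEON is available; otherwise fall
-// back to the portable reference implementations.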
-#ifdef USE_NEON
-#include "optimized/neon_tensor_utils.h"
-#else
-#include "reference/portable_tensor_utils.h"
-#endif // USE_NEON
diff --git a/runtimes/nn/common/operations/internal/tensor_utils.h b/runtimes/nn/common/operations/internal/tensor_utils.h
deleted file mode 100644
index df3d4e27b..000000000
--- a/runtimes/nn/common/operations/internal/tensor_utils.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_TENSOR_UTILS_H__
-#define __NNFW_RT_TENSOR_UTILS_H__
-
-#include "ActivationFunctor.h"
-
-namespace nnfw {
-namespace rt {
-namespace tensor_utils {
-
-// Limit a float input f between +abs_limit and -abs_limit.
-float Clip(float f, float abs_limit);
-
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector using a stride value provided in result_stride. 'result_stride' is
-// the number of elements between consecutive result values. For example,
-// result_stride = 1 will cause the output to look like this:
-// [O_1, O_2, ..., O_rows] in memory, but result_stride = 3 will cause it to be
-// arranged like this in memory: [O_1, x, x, O_2, x, x, ..., O_rows]
-void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
- int m_cols, const float* vector,
- int n_batch, float* result,
- int result_stride);
-
-// Cwise product of two vectors.
-void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
- int v_size, float* result);
-
-// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
-// assumption here is that the result array is initialized to valid values.
-void VectorVectorCwiseProductAccumulate(const float* vector1,
- const float* vector2, int v_size,
- float* result);
-
-// Dot product of two vectors.
-float VectorVectorDotProduct(const float* vector1, const float* vector2,
- int v_size);
-
-// Dot product of two batch vectors of size n_batch * v_size:
-// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
-// x_2_1, x_2_2, ..., x_2_vsize,
-// ...
-// x_nbatch_1,..., x_nbatch_vsize]
-// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
-// y_2_1, y_2_2, ..., y_2_vsize,
-// ...
-// y_nbatch_1,..., y_nbatch_vsize]
-// Then result will be a vector of n_batch size which will be saved with a
-// stride of result_stride in memory starting from 'result':
-// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
-// x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
-// ...
-// x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
-void BatchVectorBatchVectorDotProduct(const float* vector1,
- const float* vector2, int v_size,
- int n_batch, float* result,
- int result_stride);
-
-// Cwise product and accumulate of a vector and a batch-vector. Since it's a
-// MAC operation, the assumption here is that the result array is initialized
-// to valid values.
-void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
- const float* batch_vector,
- int n_batch, float* result);
-
-// Initialize a batch vector by copying the given vector into each of the
-// n_batch slots.
-void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
- float* batch_vector);
-
-// Apply sigmoid to elements of a vector.
-void ApplySigmoidToVector(const float* vector, int v_size, float* result);
-
-// Apply activation function to elements of a vector.
-void ApplyActivationToVector(const float* vector, int v_size,
- ActivationFn activation, float* result);
-
-// Copy vector to another vector.
-void CopyVector(const float* vector, int v_size, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void Sub1Vector(const float* vector, int v_size, float* result);
-
-// Fill vector with 0.f.
-void ZeroVector(float* vector, int v_size);
-
-// Clip elements of a vector using an abs_limit value.
-void ClipVector(const float* vector, int v_size, float abs_limit,
- float* result);
-
-// Shift a vector of size v_size left by one element in place, storing
-// shift_value in the last position.
-void VectorShiftLeft(float* vector, int v_size, float shift_value);
-
-// Reduce-sum on a float input vector:
-// input_vector: float pointer to input vector.
-// input_stride: input vector stride.
-// output_vector: float pointer to output vector.
-// output_size: output vector size.
-// reduction_size: number of consecutive elements from input vector which are
-// added to get one element of output.
-void ReductionSumVector(const float* input_vector, int input_stride,
- float* output_vector, int output_size,
- int reduction_size);
-} // namespace tensor_utils
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_TENSOR_UTILS_H__
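
The result_stride contract documented above for MatrixBatchVectorMultiplyAccumulate is easier to see in straight-line reference form. The sketch below is only an illustration of that layout, assuming a row-major matrix and batch vectors stored back to back; the "...Sketch" name is made up, and this is not the NNFW implementation, although it reproduces the expectations in tensor_utils_test.cc below.

// Reference sketch: accumulate matrix * batch-vector products into 'result',
// stepping 'result_stride' elements between consecutive outputs.
#include <cstdio>
#include <vector>

void MatrixBatchVectorMultiplyAccumulateSketch(const float* matrix, int m_rows,
                                               int m_cols, const float* vector,
                                               int n_batch, float* result,
                                               int result_stride) {
  float* out = result;
  for (int b = 0; b < n_batch; ++b) {
    const float* row = matrix;
    for (int r = 0; r < m_rows; ++r) {
      const float* v = vector + b * m_cols;
      float acc = 0.f;
      for (int c = 0; c < m_cols; ++c) {
        acc += *row++ * *v++;
      }
      *out += acc;           // accumulate into the existing result value
      out += result_stride;  // step to the next result slot
    }
  }
}

int main() {
  // Same data as MatrixBatchVectorMultiplyAccumulateTest in the test file below.
  const float matrix[12] = {1, 2, 3, 4, -1, -2, -3, -4, 1, -2, 3, -4};
  const float vector[8] = {1, -1, 1, -1, 2, -2, 2, -2};
  std::vector<float> result(6, 3.0f);
  MatrixBatchVectorMultiplyAccumulateSketch(matrix, 3, 4, vector, 2,
                                            result.data(), /*result_stride=*/1);
  for (float v : result) std::printf("%g ", v);  // 1 5 13 -1 7 23
  std::printf("\n");
  return 0;
}
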
diff --git a/runtimes/nn/common/operations/internal/tensor_utils_test.cc b/runtimes/nn/common/operations/internal/tensor_utils_test.cc
deleted file mode 100644
index b68982164..000000000
--- a/runtimes/nn/common/operations/internal/tensor_utils_test.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "gmock/gmock-matchers.h"
-#include "gtest/gtest.h"
-#include "tensor_utils.h"
-
-namespace nnfw {
-namespace rt {
-namespace tensor_utils {
-
-namespace {
-
-using ::testing::FloatNear;
-using ::testing::Matcher;
-
-std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
- float max_abs_error=1.e-6) {
- std::vector<Matcher<float>> matchers;
- matchers.reserve(values.size());
- for (const float& v : values) {
- matchers.emplace_back(FloatNear(v, max_abs_error));
- }
- return matchers;
-}
-
-} // anonymous namespace
-
-TEST(uKernels, ClipTest) {
- constexpr int kVectorSize = 10;
- constexpr float kAbsLimit = 2.0;
- static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
- -2.5, 3.0, -3.5, 4.0, -4.5};
- std::vector<float> output(kVectorSize);
- ClipVector(input, kVectorSize, kAbsLimit, output.data());
- EXPECT_THAT(output,
- ElementsAreArray(ArrayFloatNear(
- {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0})));
-}
-
-TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) {
- constexpr int kRow = 3;
- constexpr int kCol = 4;
- constexpr int kBatch = 2;
- static float matrix[kRow * kCol] = {1.0, 2.0, 3.0, 4.0, //
- -1.0, -2.0, -3.0, -4.0, //
- 1.0, -2.0, 3.0, -4.0};
- static float vector[kCol * kBatch] = {1.0, -1.0, 1.0, -1.0, //
- 2.0, -2.0, 2.0, -2.0};
- std::vector<float> output(kRow * kBatch);
- std::fill(output.begin(), output.end(), 3.0);
- MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
- output.data(), /*result_stride=*/1);
- EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({1., 5., 13., //
- -1., 7., 23.})));
-}
-
-TEST(uKernels, VectorVectorCwiseProductTest) {
- constexpr int kVectorSize = 10;
- static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
- -2.5, 3.0, -3.5, 4.0, -4.5};
- static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1,
- -0.1, 0.1, -0.1, 0.1, -0.1};
- std::vector<float> output(kVectorSize);
- VectorVectorCwiseProduct(input1, input2, kVectorSize, output.data());
- EXPECT_THAT(output,
- ElementsAreArray(ArrayFloatNear(
- {0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45})));
-}
-
-TEST(uKernels, VectorVectorCwiseProductAccumulateTest) {
- constexpr int kVectorSize = 10;
- static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
- -2.5, 3.0, -3.5, 4.0, -4.5};
- static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1,
- -0.1, 0.1, -0.1, 0.1, -0.1};
- std::vector<float> output(kVectorSize);
- std::fill(output.begin(), output.end(), 1.0);
- VectorVectorCwiseProductAccumulate(input1, input2, kVectorSize,
- output.data());
- EXPECT_THAT(output,
- ElementsAreArray(ArrayFloatNear(
- {1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45})));
-}
-
-TEST(uKernels, VectorBatchVectorAssignTest) {
- constexpr int kVectorSize = 5;
- constexpr int kBatchSize = 3;
- static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
- std::vector<float> output(kVectorSize * kBatchSize);
- VectorBatchVectorAssign(input, kVectorSize, kBatchSize, output.data());
- EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
- {0.0, -0.5, 1.0, -1.5, 2.0, 0.0, -0.5, 1.0, -1.5, 2.0,
- 0.0, -0.5, 1.0, -1.5, 2.0})));
-}
-
-TEST(uKernels, ApplySigmoidToVectorTest) {
- constexpr int kVectorSize = 5;
- static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
- std::vector<float> output(kVectorSize);
- ApplySigmoidToVector(input, kVectorSize, output.data());
- EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
- {0.5, 0.377541, 0.731059, 0.182426, 0.880797})));
-}
-
-TEST(uKernels, ApplyActivationToVectorTest) {
- constexpr int kVectorSize = 5;
- static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
- std::vector<float> output(kVectorSize);
- ApplyActivationToVector(input, kVectorSize, kActivationRelu, output.data());
- EXPECT_THAT(output,
- ElementsAreArray(ArrayFloatNear({0.0, 0.0, 1.0, 0.0, 2.0})));
-
- ApplyActivationToVector(input, kVectorSize, kActivationTanh, output.data());
- EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
- {0.0, -0.462117, 0.761594, -0.905148, 0.964028})));
-}
-
-TEST(uKernels, CopyVectorTest) {
- constexpr int kVectorSize = 5;
- static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
- std::vector<float> output(kVectorSize);
- CopyVector(input, kVectorSize, output.data());
- EXPECT_THAT(output,
- ElementsAreArray(ArrayFloatNear({0.0, -0.5, 1.0, -1.5, 2.0})));
-}
-
-TEST(uKernels, Sub1VectorTest) {
- constexpr int kVectorSize = 5;
- static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
- std::vector<float> output(kVectorSize);
- Sub1Vector(input, kVectorSize, output.data());
- EXPECT_THAT(output,
- ElementsAreArray(ArrayFloatNear({1.0, 1.5, 0.0, 2.5, -1.0})));
-}
-
-TEST(uKernels, ZeroVectorTest) {
- constexpr int kVectorSize = 5;
- std::vector<float> output(kVectorSize);
- ZeroVector(output.data(), kVectorSize);
- EXPECT_THAT(output,
- ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0})));
-}
-
-TEST(uKernels, BatchVectorBatchVectorDotProductTest) {
- constexpr int kVectorSize = 5;
- constexpr int kBatch = 2;
- static float input1[kVectorSize * kBatch] = {0.0, -0.5, 1.0, -1.5, 2.0,
- -2.5, 3.0, -3.5, 4.0, -4.5};
- static float input2[kVectorSize * kBatch] = {0.1, -0.1, 0.1, -0.1, 0.1,
- -0.1, 0.1, -0.1, 0.1, -0.1};
- std::vector<float> output(kBatch);
- BatchVectorBatchVectorDotProduct(input1, input2, kVectorSize, kBatch,
- output.data(), /*result_stride=*/1);
- EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({0.5, 1.75})));
-}
-
-TEST(uKernels, VectorShiftLeftTest) {
- constexpr int kVectorSize = 5;
- static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
- std::vector<float> result(kVectorSize);
- VectorShiftLeft(input, kVectorSize, 3.0);
- result.assign(input, input + kVectorSize);
- EXPECT_THAT(result,
- ElementsAreArray(ArrayFloatNear({-0.5, 1.0, -1.5, 2.0, 3.0})));
-}
-
-TEST(uKernels, ReductionSumVectorTest) {
- constexpr int kInputVectorSize = 10;
- constexpr int kOutputVectorSize = 5;
- constexpr int kReductionSize = 2;
- static float input[kInputVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
- 0.0, -0.5, 1.0, 1.0, 2.0};
- std::vector<float> result(kOutputVectorSize);
- ReductionSumVector(input,
- /*input_stride=*/1, result.data(), kOutputVectorSize,
- kReductionSize);
- EXPECT_THAT(result,
- ElementsAreArray(ArrayFloatNear({-0.5, -0.5, 2.0, 0.5, 3.0})));
-}
-
-} // namespace tensor_utils
-} // namespace rt
-} // namespace nnfw
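
The VectorShiftLeft and ReductionSumVector tests above pin down semantics that the header comments state only tersely. The following is a minimal portable sketch consistent with those expectations; the "...Sketch" names are illustrative and this is not the NNFW code.

#include <cstdio>

// Shift the vector left by one element in place and store shift_value last.
void VectorShiftLeftSketch(float* vector, int v_size, float shift_value) {
  for (int i = 0; i < v_size - 1; ++i) {
    vector[i] = vector[i + 1];
  }
  vector[v_size - 1] = shift_value;
}

// Sum reduction_size consecutive input elements (input_stride apart) into each
// output element. This sketch assigns the sums; the test above uses
// zero-initialized outputs, so assign vs. accumulate is indistinguishable here.
void ReductionSumVectorSketch(const float* input_vector, int input_stride,
                              float* output_vector, int output_size,
                              int reduction_size) {
  const float* in = input_vector;
  for (int o = 0; o < output_size; ++o) {
    float sum = 0.f;
    for (int r = 0; r < reduction_size; ++r) {
      sum += *in;
      in += input_stride;
    }
    output_vector[o] = sum;
  }
}

int main() {
  float v[5] = {0.0f, -0.5f, 1.0f, -1.5f, 2.0f};
  VectorShiftLeftSketch(v, 5, 3.0f);
  for (float x : v) std::printf("%g ", x);  // -0.5 1 -1.5 2 3
  std::printf("\n");

  const float in[10] = {0.0f, -0.5f, 1.0f, -1.5f, 2.0f,
                        0.0f, -0.5f, 1.0f, 1.0f, 2.0f};
  float out[5];
  ReductionSumVectorSketch(in, /*input_stride=*/1, out, 5, /*reduction_size=*/2);
  for (float x : out) std::printf("%g ", x);  // -0.5 -0.5 2 0.5 3
  std::printf("\n");
  return 0;
}
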
diff --git a/runtimes/nn/common/operations/internal/types.h b/runtimes/nn/common/operations/internal/types.h
deleted file mode 100644
index bd5880edd..000000000
--- a/runtimes/nn/common/operations/internal/types.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_TYPES_H__
-#define __NNFW_RT_TYPES_H__
-
-#include "compatibility.h"
-
-namespace nnfw {
-namespace rt {
-
-enum class FusedActivationFunctionType { kNone, kRelu6, kRelu1, kRelu };
-
-template <int N>
-struct Dims {
- int sizes[N];
- int strides[N];
-};
-
-struct Shape;
-
-inline Dims<4> convertShapeToDims(const Shape& shape) {
- Dims<4> dims;
- for (int i=0; i<4; i++) {
- dims.sizes[i] = 1;
- }
-
- if (shape.dimensions.size() == 1) {
- dims.sizes[0] = (int)getSizeOfDimension(shape, 0);
- } else {
- for (int i=0; i<4; i++) {
- int src = (int)shape.dimensions.size()-i-1;
- if (src >= 0) {
- dims.sizes[i] = (int)getSizeOfDimension(shape, src);
- }
- }
- }
-
- dims.strides[0] = 1;
- for (int i = 1; i<4; i++) {
- dims.strides[i] = dims.strides[i-1] * dims.sizes[i-1];
- }
- return dims;
-}
-
-inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
- DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
- DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
- DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
- DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
- return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
- i3 * dims.strides[3];
-}
-
-// Get array size, DCHECKing that the dim index is in range.
-template <int N>
-int ArraySize(const Dims<N>& array, int index) {
- DCHECK(index >= 0 && index < N);
- return array.sizes[index];
-}
-
-// Get common array size, DCHECKing that they all agree.
-template <typename ArrayType1, typename ArrayType2>
-int MatchingArraySize(const ArrayType1& array1, int index1,
- const ArrayType2& array2, int index2) {
- DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
- return ArraySize(array1, index1);
-}
-
-template <typename ArrayType1, typename ArrayType2, typename... Args>
-int MatchingArraySize(const ArrayType1& array1, int index1,
- const ArrayType2& array2, int index2, Args... args) {
- DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
- return MatchingArraySize(array1, index1, args...);
-}
-
-inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
- int max_offset = 0;
- for (int i = 0; i < 4; i++) {
- max_offset += (dims.sizes[i] - 1) * dims.strides[i];
- }
- return max_offset + 1;
-}
-
-template <int N>
-bool IsPackedWithoutStrides(const Dims<N>& dims) {
- int expected_stride = 1;
- for (int d = 0; d < N; d++) {
- if (dims.strides[d] != expected_stride) return false;
- expected_stride *= dims.sizes[d];
- }
- return true;
-}
-
-} // namespace rt
-} // namespace nnfw
-
-#endif // __NNFW_RT_TYPES_H__
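
To make the Dims<4> conventions above concrete: convertShapeToDims stores the innermost dimension first and derives packed strides, so Offset and RequiredBufferSizeForDims follow directly from that layout. The sketch below re-declares a minimal Dims/Offset pair (DCHECKs replaced with assert, the Shape plumbing omitted) and walks through a [batch=2, height=3, width=4, depth=5] tensor; it is a worked example under those assumptions, not the runtime code.

#include <cassert>
#include <cstdio>

template <int N>
struct Dims {
  int sizes[N];
  int strides[N];
};

inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
  assert(i0 >= 0 && i0 < dims.sizes[0]);
  assert(i1 >= 0 && i1 < dims.sizes[1]);
  assert(i2 >= 0 && i2 < dims.sizes[2]);
  assert(i3 >= 0 && i3 < dims.sizes[3]);
  return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
         i3 * dims.strides[3];
}

int main() {
  // convertShapeToDims reverses a [2, 3, 4, 5] shape, so
  // sizes = {depth, width, height, batch} = {5, 4, 3, 2}.
  Dims<4> dims;
  const int sizes[4] = {5, 4, 3, 2};
  for (int i = 0; i < 4; ++i) dims.sizes[i] = sizes[i];
  dims.strides[0] = 1;
  for (int i = 1; i < 4; ++i) {
    dims.strides[i] = dims.strides[i - 1] * dims.sizes[i - 1];
  }

  // Packed strides: {1, 5, 20, 60}.
  std::printf("strides: %d %d %d %d\n", dims.strides[0], dims.strides[1],
              dims.strides[2], dims.strides[3]);

  // Element (depth=2, width=3, height=1, batch=1) lives at 2 + 15 + 20 + 60.
  std::printf("offset: %d\n", Offset(dims, 2, 3, 1, 1));  // 97

  // For a packed layout the required buffer size equals the element count: 120.
  int max_offset = 0;
  for (int i = 0; i < 4; ++i) {
    max_offset += (dims.sizes[i] - 1) * dims.strides[i];
  }
  std::printf("buffer size: %d\n", max_offset + 1);
  return 0;
}
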