1 files changed, 0 insertions, 792 deletions
diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h
deleted file mode 100644
index 5c05bf20f..000000000
--- a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (C) 2017 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__
-#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__
-
-#include "gemmlowp.h"
-#include "../common.h"
-#include "../types.h"
-
-namespace nnfw {
-namespace rt {
-namespace optimized_ops {
-
-// Implementation of float DepthwiseConv
-
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-struct FloatDepthwiseConvKernel {};
-
-#ifdef USE_NEON
-
-template <>
-struct FloatDepthwiseConvKernel<false, 8, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x4_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vld1q_f32(filter_ptr + 4 * i);
-    }
-    int outp = 0;
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the inputs
-      float32x4_t input[4];
-      for (int i = 0; i < 4; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 16;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
-      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
-      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
-      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      float32x4_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 8;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<false, 2, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    const float32x2_t filters = vld1_f32(filter_ptr);
-    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
-    int outp = 0;
-    // Handle 8 output pixels at a time.
-    for (; outp <= num_output_pixels - 8; outp += 8) {
-      // Load the inputs
-      float32x4_t input[4];
-      for (int i = 0; i < 4; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 16;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle 4 output pixels at a time.
-    for (; outp <= num_output_pixels - 4; outp += 4) {
-      // Load the inputs
-      float32x4_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 8;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the inputs
-      const float32x4_t input = vld1q_f32(input_ptr);
-      input_ptr += 4;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
-      // Multiply-accumulate
-      acc = vmlaq_f32(acc, input, filters_dup2);
-      // Store the accumulators back to acc_buffer
-      vst1q_f32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-    // Handle 1 output pixel at a time
-    for (; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      const float32x2_t input = vld1_f32(input_ptr);
-      input_ptr += 2;
-      // Load the accumulators from acc_buffer
-      float32x2_t acc = vld1_f32(acc_buffer_ptr);
-      // Multiply-accumulate
-      acc = vmla_f32(acc, input, filters);
-      // Store the accumulators back to acc_buffer
-      vst1_f32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 2;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 16 input channels at a time.
-      for (; ic <= input_depth - 16; ic += 16) {
-        // Load the filters
-        float32x4_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 16;
-        // Load the inputs
-        float32x4_t input[4];
-        for (int i = 0; i < 4; i++) {
-          input[i] = vld1q_f32(local_input_ptr + 4 * i);
-        }
-        local_input_ptr += 16;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
-        }
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      // Handle 4 input channels at a time.
-      for (; ic <= input_depth - 4; ic += 4) {
-        // Load the filters
-        float32x4_t filter;
-        filter = vld1q_f32(local_filter_ptr);
-        local_filter_ptr += 4;
-        // Load the inputs
-        float32x4_t input;
-        input = vld1q_f32(local_input_ptr);
-        local_input_ptr += 4;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc;
-        acc = vld1q_f32(acc_buffer_ptr);
-        // Multiply-accumulate
-        acc = vmlaq_f32(acc, input, filter);
-        // Store the accumulators back to acc_buffer
-        vst1q_f32(acc_buffer_ptr, acc);
-        acc_buffer_ptr += 4;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        const float input_val = *local_input_ptr++;
-        const float filter_val = *local_filter_ptr++;
-        *acc_buffer_ptr++ += filter_val * input_val;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 8> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 2 input channels at a time.
-      for (; ic <= input_depth - 2; ic += 2) {
-        // Load the filters
-        float32x4_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 16;
-        // Load the inputs
-        const float32x2_t input = vld1_f32(local_input_ptr);
-        local_input_ptr += 2;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
-        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
-        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
-        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        // Load the filters
-        float32x4_t filter[2];
-        for (int i = 0; i < 2; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 8;
-        // Load the inputs
-        const float input_val = *local_input_ptr++;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[2];
-        for (int i = 0; i < 2; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        for (int i = 0; i < 2; i++) {
-          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
-        }
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 2; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 8;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 8 input channels at a time.
-      for (; ic <= input_depth - 8; ic += 8) {
-        // Load the filters
-        float32x4_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 16;
-        // Load the inputs
-        float32x4x2_t input_dup2[2];
-        for (int i = 0; i < 2; i++) {
-          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
-          input_dup2[i] = vzipq_f32(input, input);
-        }
-        local_input_ptr += 8;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
-        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
-        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
-        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      // Handle 4 input channels at a time.
-      for (; ic <= input_depth - 4; ic += 4) {
-        // Load the filters
-        float32x2_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
-        }
-        local_filter_ptr += 8;
-        // Load the inputs
-        const float32x4_t input = vld1q_f32(local_input_ptr);
-        local_input_ptr += 4;
-        // Load the accumulators from acc_buffer
-        float32x2_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
-        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
-        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
-        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
-        }
-        acc_buffer_ptr += 8;
-      }
-      // Handle 2 input channels at a time.
-      for (; ic <= input_depth - 2; ic += 2) {
-        // Load the filters
-        const float32x4_t filter = vld1q_f32(local_filter_ptr);
-        local_filter_ptr += 4;
-        // Load the inputs
-        const float32x2_t input = vld1_f32(local_input_ptr);
-        local_input_ptr += 2;
-        // Load the accumulators from acc_buffer
-        float32x2_t acc[2];
-        for (int i = 0; i < 2; i++) {
-          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
-        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 2; i++) {
-          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
-        }
-        acc_buffer_ptr += 4;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        // Load the inputs
-        const float input_val = *local_input_ptr++;
-        // Multiply-accumulate
-        for (int i = 0; i < 2; i++) {
-          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
-        }
-        local_filter_ptr += 2;
-        acc_buffer_ptr += 2;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 1, 8> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the filters
-      float32x4_t filter[2];
-      for (int i = 0; i < 2; i++) {
-        filter[i] = vld1q_f32(filter_ptr + 4 * i);
-      }
-      // Load the inputs
-      const float input_val = *input_ptr;
-      input_ptr += input_ptr_increment;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 16> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      for (int ic = 0; ic < input_depth; ic++) {
-        // Load the filters
-        float32x4_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 16;
-        // Load the inputs
-        const float input_val = *local_input_ptr++;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
-        }
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-#endif
-
-// Accumulates the effect of one row of the filter, on a segment of one row
-// of the output, accessing the corresponding one row of the input.
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
-                                const float* input_data, int pad_width,
-                                int depth_multiplier, int filter_width,
-                                const float* filter_data,
-                                int out_x_buffer_start, int out_x_buffer_end,
-                                int output_depth, float* acc_buffer) {
-#ifdef GEMMLOWP_PROFILING
-  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
-#endif
-  // Sanity check parameters. This is important in particular to ensure
-  // that we keep the number of template instantiations minimal, so we don't
-  // increase binary size unnecessarily.
-  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
-  static_assert(kFixedInputDepth || kAllowStrided, "");
-  DCHECK(stride == 1 || kAllowStrided);
-  if (kFixedInputDepth) {
-    DCHECK_EQ(input_depth, kFixedInputDepth);
-  }
-  if (kFixedDepthMultiplier) {
-    DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
-  }
-  DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  const int input_ptr_increment = stride * input_depth;
-  const float* filter_base_ptr = filter_data;
-  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-    // For the current (filter_x, filter_y) point in the filter,
-    // compute the boundaries of the corresponding output row segment.
-    int out_x_loop_start_unclampled = 0;
-    int out_x_loop_end_unclampled = 0;
-    if (kAllowStrided) {
-      if (stride == 2) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 1) / 2;
-      } else if (stride == 4) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 3) / 4;
-      } else {
-        out_x_loop_start_unclampled =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
-      }
-    } else {
-      out_x_loop_start_unclampled = pad_width - filter_x;
-      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
-    }
-    // The kernel will have to iterate on the segment of the
-    // output row that starts at out_x_loop_start and out_x_loop_end.
-    const int out_x_loop_start =
-        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
-    const int out_x_loop_end =
-        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
-
-    float* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
-    const float* input_ptr = input_data + in_x_origin * input_depth;
-    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
-    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
-                             kFixedDepthMultiplier>::Run(num_output_pixels,
-                                                         input_depth,
-                                                         depth_multiplier,
-                                                         input_ptr,
-                                                         input_ptr_increment,
-                                                         filter_base_ptr,
-                                                         acc_buffer_ptr);
-    filter_base_ptr += output_depth;
-  }
-}
-
-// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
-inline void FloatDepthwiseConvAccumRowGeneric(
-    int stride, int input_depth, int input_width, const float* input_data,
-    int pad_width, int depth_multiplier, int filter_width,
-    const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
-    int output_depth, float* acc_buffer) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
-  const float* filter_base_ptr = filter_data;
-  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-    const int out_x_loop_start = std::max(
-        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
-    const int out_x_loop_end =
-        std::min(out_x_buffer_end,
-                 (pad_width + input_width - filter_x + stride - 1) / stride);
-
-    float* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
-    const float* input_ptr = input_data + in_x_origin * input_depth;
-    const int input_ptr_increment = (stride - 1) * input_depth;
-    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
-      const float* filter_ptr = filter_base_ptr;
-      for (int ic = 0; ic < input_depth; ++ic) {
-        const float input_val = *input_ptr++;
-        for (int m = 0; m < depth_multiplier; m++) {
-          const float filter_val = *filter_ptr++;
-          *acc_buffer_ptr++ += filter_val * input_val;
-        }
-      }
-      input_ptr += input_ptr_increment;
-    }
-    filter_base_ptr += output_depth;
-  }
-}
-
-// Initializes the accumulator buffer with bias values.
-inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
-                                       const float* bias_data,
-                                       float* acc_buffer) {
-  for (int i = 0; i < num_output_pixels; i++) {
-    memcpy(acc_buffer + i * output_depth, bias_data,
-           sizeof(acc_buffer[0]) * output_depth);
-  }
-}
-
-template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
-                   const float* filter_data, const Dims<4>& filter_dims,
-                   const float* bias_data, const Dims<4>& bias_dims,
-                   int stride_width, int stride_height,
-                   int pad_width, int pad_height, int depth_multiplier,
-                   float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_width = ArraySize(input_dims, 1);
-  const int input_depth = ArraySize(input_dims, 0);
-  const int filter_height = ArraySize(filter_dims, 2);
-  const int filter_width = ArraySize(filter_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-#if 0 // TODO-NNRT : Check if assertion is needed, output depth some times not equal to input * depthmultiplier
-  DCHECK(output_depth == input_depth * depth_multiplier);
-#endif
-
-  static const int kAccBufferMaxSize = 1024;
-  float acc_buffer[kAccBufferMaxSize];
-  DCHECK_GE(kAccBufferMaxSize, output_depth);
-  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
-  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
-  DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize);
-  DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
-  DCHECK_GE(kOutputPixelsInAccBuffer, 1);
-
-  // row_accum_func will point to the core accumulation function to be used
-  // for this DepthwiseConv op.
-  auto* row_accum_func = FloatDepthwiseConvAccumRowGeneric;
-
-  const int kMaxFixedDepthMultiplier = 16;
-  int fixed_depth_multiplier = 0;
-  if (depth_multiplier <= kMaxFixedDepthMultiplier) {
-    fixed_depth_multiplier = depth_multiplier;
-  }
-  // kMaxUnrolling is the max number of output values that we aim to handle
-  // in one unrolled iteration of the inner loop. For practical performance
-  // reasons, it is limited by the number of available registers. We could
-  // fine-tune it depending on the architecture, but that's not worth doing
-  // since this whole code is not very optimized to begin with. The
-  // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit
-  // vector registers.
-  const int kMaxUnrolling = 8;
-  int fixed_input_depth = 0;
-  if (fixed_depth_multiplier &&
-      input_depth * fixed_depth_multiplier <= kMaxUnrolling) {
-    fixed_input_depth = input_depth;
-  }
-#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
-                                        FIXED_DEPTH_MULTIPLIER)           \
-  if ((stride_width == 1 || ALLOW_STRIDED) &&                             \
-      fixed_input_depth == FIXED_INPUT_DEPTH &&                           \
-      fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                 \
-    row_accum_func =                                                      \
-        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
-                                   FIXED_DEPTH_MULTIPLIER>;               \
-  }
-
-#ifdef USE_NEON
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
-#endif  // USE_NEON
-
-#undef TFMINI_USE_DEPTHWISECONV_KERNEL
-
-  // Now that we have determined row_accum_func, we can start work.
-  float* output_ptr = output_data;
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      const int in_y_origin = (out_y * stride_height) - pad_height;
-      const int filter_y_start = std::max(0, -in_y_origin);
-      const int filter_y_end =
-          std::min(filter_height, input_height - in_y_origin);
-      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
-           out_x_buffer_start += kOutputPixelsInAccBuffer) {
-        const int out_x_buffer_end = std::min(
-            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
-        // We call a 'pixel' a group of activation that share all but the
-        // 'depth'/'channel' coordinate. num_output_pixels is the number of
-        // output pixels that we will accumulate in this loop iteration.
-        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
-        // Initialize our local accumulator with the bias values, so we don't
-        // have to add them later.
-        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
-                                   acc_buffer);
-        // Accumulation loop. Most of the time should be spent in here.
-        for (int filter_y = filter_y_start; filter_y < filter_y_end;
-             ++filter_y) {
-          const int in_y = in_y_origin + filter_y;
-          row_accum_func(stride_width, input_depth, input_width,
-                         input_data + in_y * input_dims.strides[2] +
-                             b * input_dims.strides[3],
-                         pad_width, depth_multiplier, filter_width,
-                         filter_data + filter_y * filter_dims.strides[2],
-                         out_x_buffer_start, out_x_buffer_end, output_depth,
-                         acc_buffer);
-        }
-        // Finished accumulating. Now store to destination.
-        const int num_output_values = output_depth * num_output_pixels;
-        int i = 0;
-#ifdef USE_NEON
-        // Handle 16 values at a time
-        for (; i <= num_output_values - 16; i += 16) {
-          float32x4_t acc[4];
-          for (int k = 0; k < 4; k++) {
-            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
-          }
-          if (Ac == FusedActivationFunctionType::kRelu) {
-            for (int k = 0; k < 4; k++) {
-              acc[k] = vmaxq_f32(vdupq_n_f32(0.f), acc[k]);
-            }
-          } else if (Ac == FusedActivationFunctionType::kRelu6) {
-            for (int k = 0; k < 4; k++) {
-              acc[k] = vmaxq_f32(vdupq_n_f32(0.f),
-                                 vminq_f32(vdupq_n_f32(6.f), acc[k]));
-            }
-          } else if (Ac == FusedActivationFunctionType::kRelu1) {
-            for (int k = 0; k < 4; k++) {
-              acc[k] = vmaxq_f32(vdupq_n_f32(-1.f),
-                                 vminq_f32(vdupq_n_f32(1.f), acc[k]));
-            }
-          }
-          for (int k = 0; k < 4; k++) {
-            vst1q_f32(output_ptr + 4 * k, acc[k]);
-          }
-          output_ptr += 16;
-        }
-        // Handle 4 values at a time
-        for (; i <= num_output_values - 4; i += 4) {
-          float32x4_t acc = vld1q_f32(acc_buffer + i);
-          if (Ac == FusedActivationFunctionType::kRelu) {
-            acc = vmaxq_f32(vdupq_n_f32(0.f), acc);
-          } else if (Ac == FusedActivationFunctionType::kRelu6) {
-            acc = vmaxq_f32(vdupq_n_f32(0.f), vminq_f32(vdupq_n_f32(6.f), acc));
-          } else if (Ac == FusedActivationFunctionType::kRelu1) {
-            acc =
-                vmaxq_f32(vdupq_n_f32(-1.f), vminq_f32(vdupq_n_f32(1.f), acc));
-          }
-          vst1q_f32(output_ptr, acc);
-          output_ptr += 4;
-        }
-#endif
-        // Handle leftover values, one by one. This is very slow.
-        for (; i < num_output_values; i++) {
-          float acc = acc_buffer[i];
-          if (Ac == FusedActivationFunctionType::kRelu) {
-            acc = std::max(0.f, acc);
-          } else if (Ac == FusedActivationFunctionType::kRelu6) {
-            acc = std::max(0.f, std::min(6.f, acc));
-          } else if (Ac == FusedActivationFunctionType::kRelu1) {
-            acc = std::max(-1.f, std::min(1.f, acc));
-          }
-          *output_ptr++ = acc;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace optimized_ops
-}  // namespace rt
-}  // namespace nnfw
-
-
-#endif  // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__