1 files changed, 0 insertions, 1640 deletions
diff --git a/compute/ncnn/src/layer/binaryop.cc b/compute/ncnn/src/layer/binaryop.cc
deleted file mode 100644
index a09d55f78..000000000
--- a/compute/ncnn/src/layer/binaryop.cc
+++ /dev/null
@@ -1,1640 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#include "ncnn/layer/binaryop.h"
-#include <math.h>
-#include <algorithm>
-#include <functional>
-#include <sys/time.h>
-
-#if __ARM_NEON
-#include <arm_neon.h>
-#include "arm/neon_mathfun.h"
-#endif // __ARM_NEON
-
-namespace nnfw
-{
-namespace ncnn
-{
-
-template <typename Op> static int binary_op(const Mat &a, const Mat &b, Mat &c)
-{
-  Op op;
-
-  int w = a.w;
-  int h = a.h;
-  int channels = a.c;
-  int size = w * h;
-
-  int w1 = b.w;
-  int h1 = b.h;
-  int channels1 = b.c;
-  int size1 = w1 * h1;
-
-  if (a.dims == 3)
-  {
-    c.create(w, h, channels);
-    if (c.empty())
-      return -100;
-
-    if (b.dims == 3)
-    {
-      if (b.w == 1 && b.h == 1)
-      {
-
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = a.channel(q);
-          const float *ptr1 = b.channel(q);
-          float *outptr = c.channel(q);
-
-          float tt = *ptr1;
-          for (int i = 0; i < size; i++)
-          {
-            outptr[i] = op(ptr[i], tt);
-          }
-        }
-
-        return 0;
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = a.channel(q);
-        const float *ptr1 = b.channel(q);
-        float *outptr = c.channel(q);
-
-        for (int i = 0; i < size; i++)
-        {
-          outptr[i] = op(ptr[i], ptr1[i]);
-        }
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 2)
-    {
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = a.channel(q);
-        const float *ptr1 = (const float *)b + h * q;
-        float *outptr = c.channel(q);
-
-        for (int y = 0; y < h; y++)
-        {
-          const float b0 = ptr1[y];
-          for (int x = 0; x < w; x++)
-          {
-            outptr[x] = op(ptr[x], b0);
-          }
-
-          ptr += w;
-          outptr += w;
-        }
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 1)
-    {
-      if (b.w == 1)
-      {
-        const float b0 = b[0];
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = a.channel(q);
-          float *outptr = c.channel(q);
-
-          for (int i = 0; i < size; i++)
-          {
-            outptr[i] = op(ptr[i], b0);
-          }
-        }
-
-        return 0;
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = a.channel(q);
-        const float b0 = b[q];
-        float *outptr = c.channel(q);
-
-        for (int i = 0; i < size; i++)
-        {
-          outptr[i] = op(ptr[i], b0);
-        }
-      }
-
-      return 0;
-    }
-  }
-  else if (a.dims == 2)
-  {
-    if (b.dims == 3)
-    {
-      c.create(w1, h1, channels1);
-      if (c.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float *ptr = (const float *)a + h1 * q;
-        const float *ptr1 = b.channel(q);
-        float *outptr = c.channel(q);
-
-        for (int y = 0; y < h1; y++)
-        {
-          const float a0 = ptr[y];
-          for (int x = 0; x < w1; x++)
-          {
-            outptr[x] = op(a0, ptr1[x]);
-          }
-
-          ptr1 += w1;
-          outptr += w1;
-        }
-      }
-
-      return 0;
-    }
-
-    c.create(w, h);
-    if (c.empty())
-      return -100;
-
-    if (b.dims == 2)
-    {
-      for (int i = 0; i < size; i++)
-      {
-        c[i] = op(a[i], b[i]);
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 1)
-    {
-      c.create(w, h);
-      if (c.empty())
-        return -100;
-
-      if (b.w == 1)
-      {
-        const float b0 = b[0];
-        for (int i = 0; i < size; i++)
-        {
-          c[i] = op(a[i], b0);
-        }
-
-        return 0;
-      }
-
-      const float *ptr = a;
-      float *outptr = c;
-
-      for (int y = 0; y < h; y++)
-      {
-        const float b0 = b[y];
-        for (int x = 0; x < w; x++)
-        {
-          outptr[x] = op(ptr[x], b0);
-        }
-
-        ptr += w;
-        outptr += w;
-      }
-
-      return 0;
-    }
-  }
-  else if (a.dims == 1)
-  {
-    if (a.w == 1)
-    {
-      if (b.dims == 3)
-      {
-        c.create(w1, h1, channels1);
-        if (c.empty())
-          return -100;
-
-        const float a0 = a[0];
-#pragma omp parallel for
-        for (int q = 0; q < channels1; q++)
-        {
-          const float *ptr1 = b.channel(q);
-          float *outptr = c.channel(q);
-
-          for (int i = 0; i < size1; i++)
-          {
-            outptr[i] = op(a0, ptr1[i]);
-          }
-        }
-
-        return 0;
-      }
-
-      if (b.dims == 2)
-      {
-        c.create(w1, h1);
-        if (c.empty())
-          return -100;
-
-        const float a0 = a[0];
-        for (int i = 0; i < size1; i++)
-        {
-          c[i] = op(a0, b[i]);
-        }
-
-        return 0;
-      }
-
-      if (b.dims == 1)
-      {
-        c.create(w1);
-        if (c.empty())
-          return -100;
-
-        const float a0 = a[0];
-        for (int i = 0; i < size1; i++)
-        {
-          c[i] = op(a0, b[i]);
-        }
-
-        return 0;
-      }
-    }
-
-    if (b.dims == 3)
-    {
-      c.create(w1, h1, channels1);
-      if (c.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = a[q];
-        const float *ptr1 = b.channel(q);
-        float *outptr = c.channel(q);
-
-        for (int i = 0; i < size1; i++)
-        {
-          outptr[i] = op(a0, ptr1[i]);
-        }
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 2)
-    {
-      c.create(w1, h1);
-      if (c.empty())
-        return -100;
-
-      const float *ptr1 = b;
-      float *outptr = c;
-
-      for (int y = 0; y < h1; y++)
-      {
-        const float a0 = a[y];
-        for (int x = 0; x < w1; x++)
-        {
-          outptr[x] = op(a0, ptr1[x]);
-        }
-
-        ptr1 += w1;
-        outptr += w1;
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 1)
-    {
-      c.create(w);
-      if (c.empty())
-        return -100;
-
-      if (b.w == 1)
-      {
-        const float b0 = b[0];
-        for (int i = 0; i < size; i++)
-        {
-          c[i] = op(a[i], b0);
-        }
-
-        return 0;
-      }
-
-      for (int i = 0; i < size; i++)
-      {
-        c[i] = op(a[i], b[i]);
-      }
-    }
-  }
-
-  return 0;
-}
-
-template <typename Op> static int binary_op_scalar_inplace(Mat &a, float b)
-{
-  Op op;
-
-  int w = a.w;
-  int h = a.h;
-  int channels = a.c;
-  int size = w * h;
-
-#pragma omp parallel for
-  for (int q = 0; q < channels; q++)
-  {
-    float *ptr = a.channel(q);
-
-    for (int i = 0; i < size; i++)
-    {
-      ptr[i] = op(ptr[i], b);
-    }
-  }
-
-  return 0;
-}
-
-template <typename T> struct binary_op_max : std::binary_function<T, T, T>
-{
-  T operator()(const T &x, const T &y) const { return std::max(x, y); }
-};
-
-template <typename T> struct binary_op_min : std::binary_function<T, T, T>
-{
-  T operator()(const T &x, const T &y) const { return std::min(x, y); }
-};
-
-template <typename T> struct binary_op_pow : std::binary_function<T, T, T>
-{
-  T operator()(const T &x, const T &y) const { return pow(x, y); }
-};
-
-template <typename T> struct binary_op_SquaredDifference : std::binary_function<T, T, T>
-{
-  T operator()(const T &x, const T &y) const { return pow((x - y), 2); }
-};
-
-int ncnn_binary_op(const BinaryOpParam &param, const Mat &bottom_blob, const Mat &bottom_blob1,
-                   Mat &top_blob)
-{
-  int ret = 0;
-  auto op_type = param.op_type;
-  // auto b = param.b;
-
-  // Only support add operation, none broadcasting
-  // Other case, need to remove internal memory allocation and check correctness
-  if (op_type != BinaryOp::Operation_ADD)
-  {
-    throw std::runtime_error{"NYI: Only support ADD operation"};
-  }
-  if (bottom_blob.dims != bottom_blob1.dims)
-  {
-    throw std::runtime_error{"NYI: Cannot use broadcasting"};
-  }
-
-// printf("-------------------BinaryOp---------------\n");
-
-// printf("op_type = %d, ", op_type);
-// printf("in1: (%d, %d, %d), dims = %d, ", bottom_blob.w, bottom_blob.h, bottom_blob.c,
-// bottom_blob.dims);
-// printf("in2: (%d, %d, %d), dims = %d\n", bottom_blob1.w, bottom_blob1.h, bottom_blob1.c,
-// bottom_blob1.dims);
-
-#if __ARM_NEON
-  int w = bottom_blob.w;
-  int h = bottom_blob.h;
-  int channels = bottom_blob.c;
-  int size = w * h;
-
-  int w1 = bottom_blob1.w;
-  int h1 = bottom_blob1.h;
-  int channels1 = bottom_blob1.c;
-  int size1 = w1 * h1;
-
-  if (op_type == BinaryOp::Operation_ADD)
-  {
-    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
-    {
-      // Fix for nnfw: disable allocation for output
-      // top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
-      {
-
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-#if __ARM_NEON
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *out = const_cast<float *>(outptr);
-          float tt = *ptr1;
-
-          float32x4_t _p2 = vdupq_n_f32(tt);
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-
-            _p1 = vaddq_f32(_p1, _p2);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = (*in1 + tt);
-            in1++;
-            out++;
-          }
-
-#else
-          float tt = *ptr1;
-          for (int i = 0; i < size; i++)
-          {
-            outptr[i] = (ptr[i] + tt);
-          }
-#endif
-        }
-
-        ret = 0;
-      }
-      else
-      {
-        if (size * bottom_blob.elemsize % 16 != 0)
-        {
-          throw std::runtime_error{"Unmatched alignment"};
-        }
-
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *in2 = const_cast<float *>(ptr1);
-          float *out = const_cast<float *>(outptr);
-
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-            float32x4_t _p2 = vld1q_f32(in2);
-
-            _p1 = vaddq_f32(_p1, _p2);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            in2 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = *in1 + *in2;
-            in1++;
-            in2++;
-            out++;
-          }
-        }
-      }
-    }
-    else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
-    {
-      top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1)
-      {
-        ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
-        // return ret;
-        goto out;
-      }
-      float *pt = (float *)bottom_blob1.data;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float b0 = pt[q];
-        float *outptr = top_blob.channel(q);
-
-        int nn = size >> 2;
-        int remain = size - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vld1q_f32(in1);
-          float32x4_t _p2 = vdupq_n_f32(b0);
-
-          _p1 = vaddq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (*in1 + b0);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w1, h1, channels1);
-      if (top_blob.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = bottom_blob[q];
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        int nn = size1 >> 2;
-        int remain = size1 - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr1);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vdupq_n_f32(a0);
-          float32x4_t _p2 = vld1q_f32(in1);
-
-          _p1 = vaddq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (a0 + *in1);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else
-      ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
-  }
-
-#if 0 // Disable operation except Operation_ADD
-
-  if (op_type == BinaryOp::Operation_SUB)
-  {
-    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w, h, channels);
-
-      if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
-      {
-
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-#if __ARM_NEON
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *out = const_cast<float *>(outptr);
-          float tt = *ptr1;
-
-          float32x4_t _p2 = vdupq_n_f32(tt);
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-
-            _p1 = vsubq_f32(_p1, _p2);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = (*in1 - tt);
-            in1++;
-            out++;
-          }
-
-#else
-          float tt = *ptr1;
-          for (int i = 0; i < size; i++)
-          {
-            outptr[i] = (ptr[i] - tt);
-          }
-#endif
-        }
-
-        ret = 0;
-      }
-      else
-      {
-        top_blob.create(w, h, channels);
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *in2 = const_cast<float *>(ptr1);
-          float *out = const_cast<float *>(outptr);
-
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-            float32x4_t _p2 = vld1q_f32(in2);
-
-            _p1 = vsubq_f32(_p1, _p2);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            in2 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = *in1 - *in2;
-            in1++;
-            in2++;
-            out++;
-          }
-        }
-      }
-    }
-    else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
-    {
-      top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1)
-      {
-        ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
-        // return ret;
-        goto out;
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float b0 = bottom_blob1[q];
-        float *outptr = top_blob.channel(q);
-
-        int nn = size >> 2;
-        int remain = size - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vld1q_f32(in1);
-          float32x4_t _p2 = vdupq_n_f32(b0);
-
-          _p1 = vsubq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (*in1 - b0);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w1, h1, channels1);
-      if (top_blob.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = bottom_blob[q];
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        int nn = size1 >> 2;
-        int remain = size1 - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr1);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vdupq_n_f32(a0);
-          float32x4_t _p2 = vld1q_f32(in1);
-
-          _p1 = vsubq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (a0 - *in1);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else
-      ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
-  }
-
-  if (op_type == BinaryOp::Operation_MUL)
-  {
-    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w, h, channels);
-
-      if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
-      {
-
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-#if __ARM_NEON
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *out = const_cast<float *>(outptr);
-          float tt = *ptr1;
-
-          float32x4_t _p2 = vdupq_n_f32(tt);
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-
-            _p1 = vmulq_f32(_p1, _p2);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = (*in1 * tt);
-            in1++;
-            out++;
-          }
-
-#else
-          float tt = *ptr1;
-          for (int i = 0; i < size; i++)
-          {
-            outptr[i] = (ptr[i] * tt);
-          }
-#endif
-        }
-
-        ret = 0;
-      }
-      else
-      {
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *in2 = const_cast<float *>(ptr1);
-          float *out = const_cast<float *>(outptr);
-
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-            float32x4_t _p2 = vld1q_f32(in2);
-
-            _p1 = vmulq_f32(_p1, _p2);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            in2 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = *in1 * *in2;
-            in1++;
-            in2++;
-            out++;
-          }
-        }
-      }
-    }
-    else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
-    {
-      top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1)
-      {
-        ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
-        // return ret;
-        goto out;
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float b0 = bottom_blob1[q];
-        float *outptr = top_blob.channel(q);
-
-        int nn = size >> 2;
-        int remain = size - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vld1q_f32(in1);
-          float32x4_t _p2 = vdupq_n_f32(b0);
-
-          _p1 = vmulq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (*in1 * b0);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w1, h1, channels1);
-      if (top_blob.empty())
-        return -100;
-
-      if (bottom_blob.w != bottom_blob1.c)
-      {
-        ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
-        goto out;
-      }
-
-      float *pt = (float *)bottom_blob.data;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = pt[q];
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        int nn = size1 >> 2;
-        int remain = size1 - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr1);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vdupq_n_f32(a0);
-          float32x4_t _p2 = vld1q_f32(in1);
-
-          _p1 = vmulq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (a0 * *in1);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else
-      ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
-  }
-
-  if (op_type == BinaryOp::Operation_DIV)
-  {
-    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
-      {
-
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-#if __ARM_NEON
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *out = const_cast<float *>(outptr);
-          float tt = *ptr1;
-
-          float32x4_t _p2 = vdupq_n_f32(tt);
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-
-            float32x4_t _p3 = vrecpeq_f32(_p2);
-            _p3 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
-            _p1 = vmulq_f32(_p1, _p3);
-
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = (*in1 / tt);
-            in1++;
-            out++;
-          }
-
-#else
-          float tt = *ptr1;
-          for (int i = 0; i < size; i++)
-          {
-            outptr[i] = (ptr[i] / tt);
-          }
-#endif
-        }
-
-        // return 0;
-        goto out;
-      }
-      else
-      {
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *in2 = const_cast<float *>(ptr1);
-          float *out = const_cast<float *>(outptr);
-
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-            float32x4_t _p2 = vld1q_f32(in2);
-
-            float32x4_t _p3 = vrecpeq_f32(_p2);
-            _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
-            _p1 = vmulq_f32(_p1, _p2);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            in2 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = *in1 / *in2;
-            in1++;
-            in2++;
-            out++;
-          }
-        }
-      }
-    }
-    else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
-    {
-      top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1)
-      {
-        ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
-        // return ret;
-        goto out;
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float b0 = bottom_blob1[q];
-        float *outptr = top_blob.channel(q);
-
-        int nn = size >> 2;
-        int remain = size - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vld1q_f32(in1);
-          float32x4_t _p2 = vdupq_n_f32(b0);
-
-          //_p1 = vsubq_f32(_p1, _p2);
-          float32x4_t _p3 = vrecpeq_f32(_p2);
-          _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
-          _p1 = vmulq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (*in1 / b0);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w1, h1, channels1);
-      if (top_blob.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = bottom_blob[q];
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        int nn = size1 >> 2;
-        int remain = size1 - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr1);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vdupq_n_f32(a0);
-          float32x4_t _p2 = vld1q_f32(in1);
-
-          //_p1 = vsubq_f32(_p1, _p2);
-          float32x4_t _p3 = vrecpeq_f32(_p2);
-          _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3);
-          _p1 = vmulq_f32(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (a0 / *in1);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else
-      ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
-  }
-
-  if (op_type == BinaryOp::Operation_MAX)
-    ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_MIN)
-    ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_POW)
-  {
-    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w, h, channels);
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        int nn = size >> 2;
-        int remain = size - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr);
-        float *in2 = const_cast<float *>(ptr1);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vld1q_f32(in1);
-          float32x4_t _p2 = vld1q_f32(in2);
-
-          _p1 = pow_ps(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          in2 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = pow(*in1, *in2);
-          in1++;
-          in2++;
-          out++;
-        }
-      }
-    }
-    else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
-    {
-      top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1)
-      {
-        ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
-        // return ret;
-        goto out;
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float b0 = bottom_blob1[q];
-        float *outptr = top_blob.channel(q);
-
-        int nn = size >> 2;
-        int remain = size - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vld1q_f32(in1);
-          float32x4_t _p2 = vdupq_n_f32(b0);
-
-          _p1 = pow_ps(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = pow(*in1, b0);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w1, h1, channels1);
-      if (top_blob.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = bottom_blob[q];
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        int nn = size1 >> 2;
-        int remain = size1 - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr1);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vdupq_n_f32(a0);
-          float32x4_t _p2 = vld1q_f32(in1);
-
-          _p1 = pow_ps(_p1, _p2);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = pow(a0, *in1);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else
-      ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
-  }
-
-  if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
-  {
-    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w, h, channels);
-
-      if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
-      {
-
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-#if __ARM_NEON
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *out = const_cast<float *>(outptr);
-          float tt = *ptr1;
-
-          float32x4_t _p2 = vdupq_n_f32(tt);
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-
-            _p1 = vsubq_f32(_p1, _p2);
-            _p1 = vmulq_f32(_p1, _p1);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            float t2 = *in1 - tt;
-            *out = t2 * t2;
-            in1++;
-            out++;
-          }
-
-#else
-          float tt = *ptr1;
-          for (int i = 0; i < size; i++)
-          {
-            float t2 = (ptr[i] - tt);
-            outptr[i] = t2 * t2;
-          }
-#endif
-        }
-
-        ret = 0;
-      }
-      else
-      {
-#pragma omp parallel for
-        for (int q = 0; q < channels; q++)
-        {
-          const float *ptr = bottom_blob.channel(q);
-          const float *ptr1 = bottom_blob1.channel(q);
-          float *outptr = top_blob.channel(q);
-
-          int nn = size >> 2;
-          int remain = size - (nn << 2);
-
-          float *in1 = const_cast<float *>(ptr);
-          float *in2 = const_cast<float *>(ptr1);
-          float *out = const_cast<float *>(outptr);
-
-          for (; nn > 0; nn--)
-          {
-            float32x4_t _p1 = vld1q_f32(in1);
-            float32x4_t _p2 = vld1q_f32(in2);
-
-            _p1 = vsubq_f32(_p1, _p2);
-            _p1 = vmulq_f32(_p1, _p1);
-            vst1q_f32(out, _p1);
-            in1 += 4;
-            in2 += 4;
-            out += 4;
-          }
-          for (; remain > 0; remain--)
-          {
-            *out = (*in1 - *in2) * (*in1 - *in2);
-            in1++;
-            in2++;
-            out++;
-          }
-        }
-      }
-    }
-    else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
-    {
-      top_blob.create(w, h, channels);
-      if (bottom_blob1.w == 1)
-      {
-        ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
-        // return ret;
-        goto out;
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float b0 = bottom_blob1[q];
-        float *outptr = top_blob.channel(q);
-
-        int nn = size >> 2;
-        int remain = size - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vld1q_f32(in1);
-          float32x4_t _p2 = vdupq_n_f32(b0);
-
-          _p1 = vsubq_f32(_p1, _p2);
-          _p1 = vmulq_f32(_p1, _p1);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (*in1 - b0) * (*in1 - b0);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
-    {
-      top_blob.create(w1, h1, channels1);
-      if (top_blob.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = bottom_blob[q];
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        int nn = size1 >> 2;
-        int remain = size1 - (nn << 2);
-
-        float *in1 = const_cast<float *>(ptr1);
-        float *out = const_cast<float *>(outptr);
-
-        for (; nn > 0; nn--)
-        {
-          float32x4_t _p1 = vdupq_n_f32(a0);
-          float32x4_t _p2 = vld1q_f32(in1);
-
-          _p1 = vsubq_f32(_p1, _p2);
-          _p1 = vmulq_f32(_p1, _p1);
-          vst1q_f32(out, _p1);
-          in1 += 4;
-          out += 4;
-        }
-        for (; remain > 0; remain--)
-        {
-          *out = (a0 - *in1) * (a0 - *in1);
-          in1++;
-          out++;
-        }
-      }
-    }
-    else
-      ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
-  }
-
-#endif // 0 (Disable operation except Operation_ADD)
-
-#else
-
-  if (op_type == BinaryOp::Operation_ADD)
-    ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_SUB)
-    ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_MUL)
-    ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_DIV)
-    ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_MAX)
-    ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_MIN)
-    ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_POW)
-    ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
-  if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
-    ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
-#endif
-
-/*
-for (int p = 0; p < top_blob.c && p < 5; p++)
-{
-    float* outptr = top_blob.channel(p);
-    printf("channel: %d\n", p);
-    for (int i = 0; i < 1; i++)
-    {
-        for (int j = 0; j < 5; j++)
-        {
-            printf("%f ", outptr[j]);
-        }
-        printf("\n");
-        outptr += top_blob.w;
-    }
-}
-printf("----------------------------\n");
-*/
-
-out:
-  return ret;
-}
-
-// Scalar in-place binary operation.
-//
-// Combines every element of bottom_top_blob with the scalar param.b using the
-// operation selected by param.op_type; the work itself is delegated to
-// binary_op_scalar_inplace<Op>().
-//
-// Returns whatever binary_op_scalar_inplace() returns for a handled operation,
-// or 0 when op_type matches none of the operations listed below (the blob is
-// then left as it is).
-int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_top_blob)
-{
-  auto scalar = param.b;
-
-  switch (param.op_type)
-  {
-    case BinaryOp::Operation_ADD:
-      return binary_op_scalar_inplace<std::plus<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_SUB:
-      return binary_op_scalar_inplace<std::minus<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_MUL:
-      return binary_op_scalar_inplace<std::multiplies<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_DIV:
-      return binary_op_scalar_inplace<std::divides<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_MAX:
-      return binary_op_scalar_inplace<binary_op_max<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_MIN:
-      return binary_op_scalar_inplace<binary_op_min<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_POW:
-      return binary_op_scalar_inplace<binary_op_pow<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_SQUAREDDIFFERENCE:
-      return binary_op_scalar_inplace<binary_op_SquaredDifference<float>>(bottom_top_blob, scalar);
-
-    default:
-      // No matching operation: nothing is computed and 0 is reported.
-      return 0;
-  }
-}
-
-// Two-blob in-place binary operation: bottom_top_blob = bottom_blob (op) bottom_top_blob.
-//
-// Only Operation_ADD is handled, and the sum is only computed when both blobs
-// have dims == 3; it is written back into bottom_top_blob channel by channel.
-// The iteration extent (channel count and w * h) is taken from bottom_blob.
-//
-// Returns -1 for any operation other than Operation_ADD, otherwise 0 (also
-// when the dims do not match the handled case and nothing is written).
-int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_blob, Mat &bottom_top_blob)
-{
-  if (param.op_type != BinaryOp::Operation_ADD)
-    return -1;
-
-  if (bottom_blob.dims != 3 || bottom_top_blob.dims != 3)
-    return 0;
-
-  const int channels = bottom_blob.c;
-  const int plane = bottom_blob.w * bottom_blob.h;
-
-#pragma omp parallel for
-  for (int q = 0; q < channels; q++)
-  {
-    float *src = bottom_blob.channel(q);
-    float *acc = bottom_top_blob.channel(q);
-    int i = 0;
-
-#if __ARM_NEON
-    // Four floats per step; the scalar loop below handles the leftover tail.
-    const int vec_end = (plane >> 2) << 2;
-    for (; i < vec_end; i += 4)
-    {
-      const float32x4_t lhs = vld1q_f32(src + i);
-      const float32x4_t rhs = vld1q_f32(acc + i);
-      vst1q_f32(acc + i, vaddq_f32(lhs, rhs));
-    }
-#endif
-
-    for (; i < plane; i++)
-    {
-      acc[i] = src[i] + acc[i];
-    }
-  }
-
-  return 0;
-}
-
-} // namespace ncnn
-} // namespace nnfw