summaryrefslogtreecommitdiff
path: root/compute/ncnn/src/layer/binaryop.cc
diff options
context:
space:
mode:
Diffstat (limited to 'compute/ncnn/src/layer/binaryop.cc')
-rw-r--r--compute/ncnn/src/layer/binaryop.cc1640
1 files changed, 0 insertions, 1640 deletions
diff --git a/compute/ncnn/src/layer/binaryop.cc b/compute/ncnn/src/layer/binaryop.cc
deleted file mode 100644
index a09d55f78..000000000
--- a/compute/ncnn/src/layer/binaryop.cc
+++ /dev/null
@@ -1,1640 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#include "ncnn/layer/binaryop.h"
-#include <math.h>
-#include <algorithm>
-#include <functional>
-#include <sys/time.h>
-
-#if __ARM_NEON
-#include <arm_neon.h>
-#include "arm/neon_mathfun.h"
-#endif // __ARM_NEON
-
-namespace nnfw
-{
-namespace ncnn
-{
-
-// Writes out[i] = op(in[i], rhs) for every i in [0, n).
-template <typename Op>
-static void binary_op_rhs_scalar(const Op &op, const float *in, float rhs, float *out, int n)
-{
-  for (int i = 0; i < n; i++)
-  {
-    out[i] = op(in[i], rhs);
-  }
-}
-
-// Writes out[i] = op(lhs, in[i]) for every i in [0, n).
-template <typename Op>
-static void binary_op_lhs_scalar(const Op &op, float lhs, const float *in, float *out, int n)
-{
-  for (int i = 0; i < n; i++)
-  {
-    out[i] = op(lhs, in[i]);
-  }
-}
-
-// Writes out[i] = op(lhs[i], rhs[i]) for every i in [0, n).
-template <typename Op>
-static void binary_op_elementwise(const Op &op, const float *lhs, const float *rhs, float *out,
-                                  int n)
-{
-  for (int i = 0; i < n; i++)
-  {
-    out[i] = op(lhs[i], rhs[i]);
-  }
-}
-
-// Generic (non-SIMD) binary operation c = Op(a, b) over float blobs.
-//
-// The output c is (re)created by this function. Which operand is broadcast is
-// decided purely from a.dims / b.dims and, for some combinations, from whether
-// the operand's w (and h) equal 1:
-//   a 3-D, b 3-D : element-wise, or one b value per channel when b is 1x1xC
-//   a 3-D, b 2-D : b supplies one value per (channel, row) of a
-//   a 3-D, b 1-D : b supplies one value overall (b.w == 1) or one per channel
-//   a 2-D, b 3-D : a supplies one value per (channel, row) of b
-//   a 2-D, b 2-D : element-wise
-//   a 2-D, b 1-D : b supplies one value overall (b.w == 1) or one per row
-//   a 1-D, b 3-D : a supplies one value overall (a.w == 1) or one per channel
-//   a 1-D, b 2-D : a supplies one value overall (a.w == 1) or one per row
-//   a 1-D, b 1-D : element-wise, with a or b broadcast when its w == 1
-// No check is made that the operand extents actually agree; the loops simply
-// index both operands with the extents listed above.
-//
-// Returns 0 on success (including dims combinations not listed above, for which
-// nothing is computed) and -100 when c is empty after create().
-template <typename Op> static int binary_op(const Mat &a, const Mat &b, Mat &c)
-{
-  Op op;
-
-  const int w = a.w;
-  const int h = a.h;
-  const int channels = a.c;
-  const int size = w * h;
-
-  const int w1 = b.w;
-  const int h1 = b.h;
-  const int channels1 = b.c;
-  const int size1 = w1 * h1;
-
-  if (a.dims == 3)
-  {
-    c.create(w, h, channels);
-    if (c.empty())
-      return -100;
-
-    if (b.dims == 3)
-    {
-      // b holding a single value per channel is broadcast over the whole plane.
-      const bool b_per_channel = (b.w == 1 && b.h == 1);
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = a.channel(q);
-        const float *ptr1 = b.channel(q);
-        float *outptr = c.channel(q);
-
-        if (b_per_channel)
-        {
-          binary_op_rhs_scalar(op, ptr, *ptr1, outptr, size);
-        }
-        else
-        {
-          binary_op_elementwise(op, ptr, ptr1, outptr, size);
-        }
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 2)
-    {
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = a.channel(q);
-        // b is read as h consecutive values per channel of a.
-        const float *ptr1 = (const float *)b + h * q;
-        float *outptr = c.channel(q);
-
-        for (int y = 0; y < h; y++)
-        {
-          binary_op_rhs_scalar(op, ptr + y * w, ptr1[y], outptr + y * w, w);
-        }
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 1)
-    {
-      const bool b_is_scalar = (b.w == 1);
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = a.channel(q);
-        const float b0 = b_is_scalar ? b[0] : b[q];
-        float *outptr = c.channel(q);
-
-        binary_op_rhs_scalar(op, ptr, b0, outptr, size);
-      }
-
-      return 0;
-    }
-  }
-  else if (a.dims == 2)
-  {
-    if (b.dims == 3)
-    {
-      c.create(w1, h1, channels1);
-      if (c.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        // a is read as h1 consecutive values per channel of b.
-        const float *ptr = (const float *)a + h1 * q;
-        const float *ptr1 = b.channel(q);
-        float *outptr = c.channel(q);
-
-        for (int y = 0; y < h1; y++)
-        {
-          binary_op_lhs_scalar(op, ptr[y], ptr1 + y * w1, outptr + y * w1, w1);
-        }
-      }
-
-      return 0;
-    }
-
-    // Shared by the 2-D and 1-D cases below; no second allocation is needed.
-    c.create(w, h);
-    if (c.empty())
-      return -100;
-
-    if (b.dims == 2)
-    {
-      const float *ptr = a;
-      const float *ptr1 = b;
-      float *outptr = c;
-
-      binary_op_elementwise(op, ptr, ptr1, outptr, size);
-
-      return 0;
-    }
-
-    if (b.dims == 1)
-    {
-      const float *ptr = a;
-      float *outptr = c;
-
-      if (b.w == 1)
-      {
-        binary_op_rhs_scalar(op, ptr, b[0], outptr, size);
-
-        return 0;
-      }
-
-      // One b value per row of a.
-      for (int y = 0; y < h; y++)
-      {
-        binary_op_rhs_scalar(op, ptr + y * w, b[y], outptr + y * w, w);
-      }
-
-      return 0;
-    }
-  }
-  else if (a.dims == 1)
-  {
-    const bool a_is_scalar = (a.w == 1);
-
-    if (b.dims == 3)
-    {
-      c.create(w1, h1, channels1);
-      if (c.empty())
-        return -100;
-
-#pragma omp parallel for
-      for (int q = 0; q < channels1; q++)
-      {
-        const float a0 = a_is_scalar ? a[0] : a[q];
-        const float *ptr1 = b.channel(q);
-        float *outptr = c.channel(q);
-
-        binary_op_lhs_scalar(op, a0, ptr1, outptr, size1);
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 2)
-    {
-      c.create(w1, h1);
-      if (c.empty())
-        return -100;
-
-      const float *ptr1 = b;
-      float *outptr = c;
-
-      if (a_is_scalar)
-      {
-        binary_op_lhs_scalar(op, a[0], ptr1, outptr, size1);
-
-        return 0;
-      }
-
-      // One a value per row of b.
-      for (int y = 0; y < h1; y++)
-      {
-        binary_op_lhs_scalar(op, a[y], ptr1 + y * w1, outptr + y * w1, w1);
-      }
-
-      return 0;
-    }
-
-    if (b.dims == 1)
-    {
-      if (a_is_scalar)
-      {
-        c.create(w1);
-        if (c.empty())
-          return -100;
-
-        const float *ptr1 = b;
-        float *outptr = c;
-
-        binary_op_lhs_scalar(op, a[0], ptr1, outptr, size1);
-
-        return 0;
-      }
-
-      c.create(w);
-      if (c.empty())
-        return -100;
-
-      const float *ptr = a;
-      const float *ptr1 = b;
-      float *outptr = c;
-
-      if (b.w == 1)
-      {
-        binary_op_rhs_scalar(op, ptr, b[0], outptr, size);
-
-        return 0;
-      }
-
-      binary_op_elementwise(op, ptr, ptr1, outptr, size);
-
-      return 0;
-    }
-  }
-
-  // Unsupported dims combination: nothing is computed.
-  return 0;
-}
-
-// Replaces every element x of `a` with Op(x, b).
-//
-// The blob is walked channel by channel (a.c channels), each channel being
-// treated as a.w * a.h consecutive floats starting at a.channel(q).
-// Always returns 0.
-template <typename Op> static int binary_op_scalar_inplace(Mat &a, float b)
-{
-  Op functor;
-
-  const int num_channels = a.c;
-  const int plane_size = a.w * a.h;
-
-#pragma omp parallel for
-  for (int ch = 0; ch < num_channels; ch++)
-  {
-    float *data = a.channel(ch);
-
-    int idx = 0;
-    while (idx < plane_size)
-    {
-      data[idx] = functor(data[idx], b);
-      ++idx;
-    }
-  }
-
-  return 0;
-}
-
-// Function object returning the larger of its two arguments.
-// The member typedefs are spelled out here instead of being inherited from
-// std::binary_function, which is deprecated in C++11 and removed in C++17.
-template <typename T> struct binary_op_max
-{
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef T result_type;
-
-  T operator()(const T &x, const T &y) const { return std::max(x, y); }
-};
-
-// Function object returning the smaller of its two arguments.
-// The member typedefs are spelled out here instead of being inherited from
-// std::binary_function, which is deprecated in C++11 and removed in C++17.
-template <typename T> struct binary_op_min
-{
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef T result_type;
-
-  T operator()(const T &x, const T &y) const { return std::min(x, y); }
-};
-
-// Function object returning x raised to the power y (via pow from <math.h>).
-// The member typedefs are spelled out here instead of being inherited from
-// std::binary_function, which is deprecated in C++11 and removed in C++17.
-template <typename T> struct binary_op_pow
-{
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef T result_type;
-
-  T operator()(const T &x, const T &y) const { return pow(x, y); }
-};
-
-// Function object returning (x - y) squared.
-// The square is a single multiplication rather than a pow() call, matching
-// the (a - b) * (a - b) form used by the NEON kernels in this file.
-// The member typedefs are spelled out here instead of being inherited from
-// std::binary_function, which is deprecated in C++11 and removed in C++17.
-template <typename T> struct binary_op_SquaredDifference
-{
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef T result_type;
-
-  T operator()(const T &x, const T &y) const
-  {
-    const T diff = x - y;
-    return diff * diff;
-  }
-};
-
-#if __ARM_NEON
-// Computes out[i] = in[i] + scalar for i in [0, n). Whole groups of four floats
-// go through NEON; the remaining n % 4 elements are handled one by one.
-static void binary_op_add_scalar_neon(const float *in, float scalar, float *out, int n)
-{
-  const float32x4_t _scalar = vdupq_n_f32(scalar);
-
-  int nn = n >> 2;
-  int remain = n - (nn << 2);
-
-  for (; nn > 0; nn--)
-  {
-    vst1q_f32(out, vaddq_f32(vld1q_f32(in), _scalar));
-    in += 4;
-    out += 4;
-  }
-  for (; remain > 0; remain--)
-  {
-    *out = *in + scalar;
-    in++;
-    out++;
-  }
-}
-
-// Computes out[i] = in1[i] + in2[i] for i in [0, n). Whole groups of four floats
-// go through NEON; the remaining n % 4 elements are handled one by one.
-static void binary_op_add_arrays_neon(const float *in1, const float *in2, float *out, int n)
-{
-  int nn = n >> 2;
-  int remain = n - (nn << 2);
-
-  for (; nn > 0; nn--)
-  {
-    vst1q_f32(out, vaddq_f32(vld1q_f32(in1), vld1q_f32(in2)));
-    in1 += 4;
-    in2 += 4;
-    out += 4;
-  }
-  for (; remain > 0; remain--)
-  {
-    *out = *in1 + *in2;
-    in1++;
-    in2++;
-    out++;
-  }
-}
-#endif // __ARM_NEON
-
-// Computes top_blob = bottom_blob (op) bottom_blob1.
-//
-// Only Operation_ADD between blobs of equal `dims` is accepted; anything else
-// throws std::runtime_error before any work is done.
-//
-// With NEON, the 3-D + 3-D case writes straight into top_blob WITHOUT allocating
-// it, so the caller must pass an already-allocated output. When bottom_blob1 is
-// 1x1xC its single value per channel is broadcast; otherwise the addition is
-// element-wise and w * h * elemsize must be a multiple of 16 bytes, else
-// std::runtime_error("Unmatched alignment") is thrown. Every other accepted
-// case is forwarded to the generic binary_op<>, which allocates top_blob itself.
-//
-// Returns 0 on the NEON 3-D path, otherwise the value returned by binary_op<>
-// (0 on success, -100 when the output is empty after allocation).
-int ncnn_binary_op(const BinaryOpParam &param, const Mat &bottom_blob, const Mat &bottom_blob1,
-                   Mat &top_blob)
-{
-  int ret = 0;
-  auto op_type = param.op_type;
-
-  // Only support add operation, none broadcasting
-  // Other case, need to remove internal memory allocation and check correctness
-  if (op_type != BinaryOp::Operation_ADD)
-  {
-    throw std::runtime_error{"NYI: Only support ADD operation"};
-  }
-  if (bottom_blob.dims != bottom_blob1.dims)
-  {
-    throw std::runtime_error{"NYI: Cannot use broadcasting"};
-  }
-
-#if __ARM_NEON
-  // op_type is known to be Operation_ADD from here on (see the guard above).
-  const int w = bottom_blob.w;
-  const int h = bottom_blob.h;
-  const int channels = bottom_blob.c;
-  const int size = w * h;
-
-  const int w1 = bottom_blob1.w;
-  const int h1 = bottom_blob1.h;
-  const int channels1 = bottom_blob1.c;
-  const int size1 = w1 * h1;
-
-  if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
-  {
-    // Fix for nnfw: disable allocation for output
-    // top_blob.create(w, h, channels);
-    if (bottom_blob1.w == 1 && bottom_blob1.h == 1)
-    {
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        // One value of bottom_blob1 per channel, added to the whole plane.
-        binary_op_add_scalar_neon(ptr, *ptr1, outptr, size);
-      }
-    }
-    else
-    {
-      if ((size * bottom_blob.elemsize) % 16 != 0)
-      {
-        throw std::runtime_error{"Unmatched alignment"};
-      }
-
-#pragma omp parallel for
-      for (int q = 0; q < channels; q++)
-      {
-        const float *ptr = bottom_blob.channel(q);
-        const float *ptr1 = bottom_blob1.channel(q);
-        float *outptr = top_blob.channel(q);
-
-        binary_op_add_arrays_neon(ptr, ptr1, outptr, size);
-      }
-    }
-  }
-  // NOTE: the two mixed-dims branches below cannot be reached while the
-  // "Cannot use broadcasting" guard above rejects blobs of different dims.
-  else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1)
-  {
-    top_blob.create(w, h, channels);
-    if (bottom_blob1.w == 1)
-    {
-      return binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
-    }
-    const float *pt = (float *)bottom_blob1.data;
-
-#pragma omp parallel for
-    for (int q = 0; q < channels; q++)
-    {
-      const float *ptr = bottom_blob.channel(q);
-      const float b0 = pt[q];
-      float *outptr = top_blob.channel(q);
-
-      binary_op_add_scalar_neon(ptr, b0, outptr, size);
-    }
-  }
-  else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3)
-  {
-    top_blob.create(w1, h1, channels1);
-    if (top_blob.empty())
-      return -100;
-
-#pragma omp parallel for
-    for (int q = 0; q < channels1; q++)
-    {
-      const float a0 = bottom_blob[q];
-      const float *ptr1 = bottom_blob1.channel(q);
-      float *outptr = top_blob.channel(q);
-
-      // Addition is commutative, so the scalar helper covers a0 + x as well.
-      binary_op_add_scalar_neon(ptr1, a0, outptr, size1);
-    }
-  }
-  else
-  {
-    ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
-  }
-
-#else
-
-  if (op_type == BinaryOp::Operation_ADD)
-    ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_SUB)
-    ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_MUL)
-    ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_DIV)
-    ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_MAX)
-    ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_MIN)
-    ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob);
-
-  if (op_type == BinaryOp::Operation_POW)
-    ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
-  if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
-    ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
-#endif
-
-  return ret;
-}
-
-// Scalar in-place form: applies the operation selected by param.op_type between
-// every element of bottom_top_blob and the scalar param.b, overwriting the blob.
-// Returns whatever binary_op_scalar_inplace<> returns for a recognised
-// operation, or 0 without touching the blob for any other op_type value.
-int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_top_blob)
-{
-  const auto scalar = param.b;
-
-  switch (param.op_type)
-  {
-    case BinaryOp::Operation_ADD:
-      return binary_op_scalar_inplace<std::plus<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_SUB:
-      return binary_op_scalar_inplace<std::minus<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_MUL:
-      return binary_op_scalar_inplace<std::multiplies<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_DIV:
-      return binary_op_scalar_inplace<std::divides<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_MAX:
-      return binary_op_scalar_inplace<binary_op_max<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_MIN:
-      return binary_op_scalar_inplace<binary_op_min<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_POW:
-      return binary_op_scalar_inplace<binary_op_pow<float>>(bottom_top_blob, scalar);
-
-    case BinaryOp::Operation_SQUAREDDIFFERENCE:
-      return binary_op_scalar_inplace<binary_op_SquaredDifference<float>>(bottom_top_blob, scalar);
-
-    default:
-      return 0;
-  }
-}
-
-// Two-blob in-place form: bottom_top_blob += bottom_blob, channel by channel.
-//
-// Returns -1 when param.op_type is not Operation_ADD. Otherwise returns 0; the
-// addition is only carried out when both blobs have dims == 3, and any other
-// dims combination leaves bottom_top_blob untouched.
-// The channel count and plane size (w * h) are taken from bottom_blob.
-int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_blob, Mat &bottom_top_blob)
-{
-  if (param.op_type != BinaryOp::Operation_ADD)
-  {
-    return -1;
-  }
-
-  if (bottom_blob.dims != 3 || bottom_top_blob.dims != 3)
-  {
-    return 0;
-  }
-
-  const int channel_count = bottom_blob.c;
-  const int plane = bottom_blob.w * bottom_blob.h;
-
-#pragma omp parallel for
-  for (int ch = 0; ch < channel_count; ch++)
-  {
-    float *src = bottom_blob.channel(ch);
-    // The accumulator is both the second operand and the destination.
-    float *acc = bottom_top_blob.channel(ch);
-
-    int done = 0;
-
-#if __ARM_NEON
-    // Four floats per step; whatever is left over falls through to the tail loop.
-    const int quads = plane >> 2;
-    for (int k = 0; k < quads; k++)
-    {
-      const float32x4_t _lhs = vld1q_f32(src + done);
-      const float32x4_t _rhs = vld1q_f32(acc + done);
-      vst1q_f32(acc + done, vaddq_f32(_lhs, _rhs));
-      done += 4;
-    }
-#endif
-
-    for (; done < plane; done++)
-    {
-      acc[done] = src[done] + acc[done];
-    }
-  }
-
-  return 0;
-}
-
-} // namespace ncnn
-} // namespace nnfw