diff options
Diffstat (limited to 'compute/ncnn/src/layer/binaryop.cc')
-rw-r--r-- | compute/ncnn/src/layer/binaryop.cc | 1640 |
1 files changed, 0 insertions, 1640 deletions
diff --git a/compute/ncnn/src/layer/binaryop.cc b/compute/ncnn/src/layer/binaryop.cc deleted file mode 100644 index a09d55f78..000000000 --- a/compute/ncnn/src/layer/binaryop.cc +++ /dev/null @@ -1,1640 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#include "ncnn/layer/binaryop.h" -#include <math.h> -#include <algorithm> -#include <functional> -#include <sys/time.h> - -#if __ARM_NEON -#include <arm_neon.h> -#include "arm/neon_mathfun.h" -#endif // __ARM_NEON - -namespace nnfw -{ -namespace ncnn -{ - -template <typename Op> static int binary_op(const Mat &a, const Mat &b, Mat &c) -{ - Op op; - - int w = a.w; - int h = a.h; - int channels = a.c; - int size = w * h; - - int w1 = b.w; - int h1 = b.h; - int channels1 = b.c; - int size1 = w1 * h1; - - if (a.dims == 3) - { - c.create(w, h, channels); - if (c.empty()) - return -100; - - if (b.dims == 3) - { - if (b.w == 1 && b.h == 1) - { - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = a.channel(q); - const float *ptr1 = b.channel(q); - float *outptr = c.channel(q); - - float tt = *ptr1; - for (int i = 0; i < size; i++) - { - outptr[i] = op(ptr[i], tt); - } - } - - return 0; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = a.channel(q); - const float *ptr1 = b.channel(q); - float *outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - outptr[i] = op(ptr[i], ptr1[i]); - } - } - - return 0; - } - - if (b.dims == 2) - { -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = a.channel(q); - const float *ptr1 = (const float *)b + h * q; - float *outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - const float b0 = ptr1[y]; - for (int x = 0; x < w; x++) - { - outptr[x] = op(ptr[x], b0); - } - - ptr += w; - outptr += w; - } - } - - return 0; - } - - if (b.dims == 1) - { - if (b.w == 1) - { - const float b0 = b[0]; -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = a.channel(q); - float *outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - outptr[i] = op(ptr[i], b0); - } - } - - return 0; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = a.channel(q); - const float b0 = b[q]; - float *outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - outptr[i] = op(ptr[i], b0); - } - } - - return 0; - } - } - else if (a.dims == 2) - { - if (b.dims == 3) - { - c.create(w1, h1, channels1); - if (c.empty()) - return -100; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float *ptr = (const float *)a + h1 * q; - const float *ptr1 = b.channel(q); - float *outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - const float a0 = ptr[y]; - for (int x = 0; x < w1; x++) - { - outptr[x] = op(a0, ptr1[x]); - } - - ptr1 += w1; - outptr += w1; - } - } - - return 0; - } - - c.create(w, h); - if (c.empty()) - return -100; - - if (b.dims == 2) - { - for (int i = 0; i < size; i++) - { - c[i] = op(a[i], b[i]); - } - - return 0; - } - - if (b.dims == 1) - { - c.create(w, h); - if (c.empty()) - return -100; - - if (b.w == 1) - { - const float b0 = b[0]; - for (int i = 0; i < size; i++) - { - c[i] = op(a[i], b0); - } - - return 0; - } - - const float *ptr = a; - float *outptr = c; - - for (int y = 0; y < h; y++) - { - const float b0 = b[y]; - for (int x = 0; x < w; x++) - { - outptr[x] = op(ptr[x], b0); - } - - ptr += w; - outptr += w; - } - - return 0; - } - } - else if (a.dims == 1) - { - if (a.w == 1) - { - if (b.dims == 3) - { - c.create(w1, h1, channels1); - if (c.empty()) - return -100; - - const float a0 = a[0]; -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float *ptr1 = b.channel(q); - float *outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - outptr[i] = op(a0, ptr1[i]); - } - } - - return 0; - } - - if (b.dims == 2) - { - c.create(w1, h1); - if (c.empty()) - return -100; - - const float a0 = a[0]; - for (int i = 0; i < size1; i++) - { - c[i] = op(a0, b[i]); - } - - return 0; - } - - if (b.dims == 1) - { - c.create(w1); - if (c.empty()) - return -100; - - const float a0 = a[0]; - for (int i = 0; i < size1; i++) - { - c[i] = op(a0, b[i]); - } - - return 0; - } - } - - if (b.dims == 3) - { - c.create(w1, h1, channels1); - if (c.empty()) - return -100; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float a0 = a[q]; - const float *ptr1 = b.channel(q); - float *outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - outptr[i] = op(a0, ptr1[i]); - } - } - - return 0; - } - - if (b.dims == 2) - { - c.create(w1, h1); - if (c.empty()) - return -100; - - const float *ptr1 = b; - float *outptr = c; - - for (int y = 0; y < h1; y++) - { - const float a0 = a[y]; - for (int x = 0; x < w1; x++) - { - outptr[x] = op(a0, ptr1[x]); - } - - ptr1 += w1; - outptr += w1; - } - - return 0; - } - - if (b.dims == 1) - { - c.create(w); - if (c.empty()) - return -100; - - if (b.w == 1) - { - const float b0 = b[0]; - for (int i = 0; i < size; i++) - { - c[i] = op(a[i], b0); - } - - return 0; - } - - for (int i = 0; i < size; i++) - { - c[i] = op(a[i], b[i]); - } - } - } - - return 0; -} - -template <typename Op> static int binary_op_scalar_inplace(Mat &a, float b) -{ - Op op; - - int w = a.w; - int h = a.h; - int channels = a.c; - int size = w * h; - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - float *ptr = a.channel(q); - - for (int i = 0; i < size; i++) - { - ptr[i] = op(ptr[i], b); - } - } - - return 0; -} - -template <typename T> struct binary_op_max : std::binary_function<T, T, T> -{ - T operator()(const T &x, const T &y) const { return std::max(x, y); } -}; - -template <typename T> struct binary_op_min : std::binary_function<T, T, T> -{ - T operator()(const T &x, const T &y) const { return std::min(x, y); } -}; - -template <typename T> struct binary_op_pow : std::binary_function<T, T, T> -{ - T operator()(const T &x, const T &y) const { return pow(x, y); } -}; - -template <typename T> struct binary_op_SquaredDifference : std::binary_function<T, T, T> -{ - T operator()(const T &x, const T &y) const { return pow((x - y), 2); } -}; - -int ncnn_binary_op(const BinaryOpParam ¶m, const Mat &bottom_blob, const Mat &bottom_blob1, - Mat &top_blob) -{ - int ret = 0; - auto op_type = param.op_type; - // auto b = param.b; - - // Only support add operation, none broadcasting - // Other case, need to remove internal memory allocation and check correctness - if (op_type != BinaryOp::Operation_ADD) - { - throw std::runtime_error{"NYI: Only support ADD operation"}; - } - if (bottom_blob.dims != bottom_blob1.dims) - { - throw std::runtime_error{"NYI: Cannot use broadcasting"}; - } - -// printf("-------------------BinaryOp---------------\n"); - -// printf("op_type = %d, ", op_type); -// printf("in1: (%d, %d, %d), dims = %d, ", bottom_blob.w, bottom_blob.h, bottom_blob.c, -// bottom_blob.dims); -// printf("in2: (%d, %d, %d), dims = %d\n", bottom_blob1.w, bottom_blob1.h, bottom_blob1.c, -// bottom_blob1.dims); - -#if __ARM_NEON - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - int w1 = bottom_blob1.w; - int h1 = bottom_blob1.h; - int channels1 = bottom_blob1.c; - int size1 = w1 * h1; - - if (op_type == BinaryOp::Operation_ADD) - { - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { - // Fix for nnfw: disable allocation for output - // top_blob.create(w, h, channels); - if (bottom_blob1.w == 1 && bottom_blob1.h == 1) - { - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - float tt = *ptr1; - - float32x4_t _p2 = vdupq_n_f32(tt); - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - - _p1 = vaddq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 + tt); - in1++; - out++; - } - -#else - float tt = *ptr1; - for (int i = 0; i < size; i++) - { - outptr[i] = (ptr[i] + tt); - } -#endif - } - - ret = 0; - } - else - { - if (size * bottom_blob.elemsize % 16 != 0) - { - throw std::runtime_error{"Unmatched alignment"}; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *in2 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vld1q_f32(in2); - - _p1 = vaddq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - in2 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = *in1 + *in2; - in1++; - in2++; - out++; - } - } - } - } - else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) - { - top_blob.create(w, h, channels); - if (bottom_blob1.w == 1) - { - ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob); - // return ret; - goto out; - } - float *pt = (float *)bottom_blob1.data; - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float b0 = pt[q]; - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vdupq_n_f32(b0); - - _p1 = vaddq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 + b0); - in1++; - out++; - } - } - } - else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) - { - top_blob.create(w1, h1, channels1); - if (top_blob.empty()) - return -100; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float a0 = bottom_blob[q]; - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size1 >> 2; - int remain = size1 - (nn << 2); - - float *in1 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vdupq_n_f32(a0); - float32x4_t _p2 = vld1q_f32(in1); - - _p1 = vaddq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (a0 + *in1); - in1++; - out++; - } - } - } - else - ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob); - } - -#if 0 // Disable operation except Operation_ADD - - if (op_type == BinaryOp::Operation_SUB) - { - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { - top_blob.create(w, h, channels); - - if (bottom_blob1.w == 1 && bottom_blob1.h == 1) - { - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - float tt = *ptr1; - - float32x4_t _p2 = vdupq_n_f32(tt); - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - - _p1 = vsubq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 - tt); - in1++; - out++; - } - -#else - float tt = *ptr1; - for (int i = 0; i < size; i++) - { - outptr[i] = (ptr[i] - tt); - } -#endif - } - - ret = 0; - } - else - { - top_blob.create(w, h, channels); -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *in2 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vld1q_f32(in2); - - _p1 = vsubq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - in2 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = *in1 - *in2; - in1++; - in2++; - out++; - } - } - } - } - else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) - { - top_blob.create(w, h, channels); - if (bottom_blob1.w == 1) - { - ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob); - // return ret; - goto out; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float b0 = bottom_blob1[q]; - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vdupq_n_f32(b0); - - _p1 = vsubq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 - b0); - in1++; - out++; - } - } - } - else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) - { - top_blob.create(w1, h1, channels1); - if (top_blob.empty()) - return -100; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float a0 = bottom_blob[q]; - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size1 >> 2; - int remain = size1 - (nn << 2); - - float *in1 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vdupq_n_f32(a0); - float32x4_t _p2 = vld1q_f32(in1); - - _p1 = vsubq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (a0 - *in1); - in1++; - out++; - } - } - } - else - ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob); - } - - if (op_type == BinaryOp::Operation_MUL) - { - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { - top_blob.create(w, h, channels); - - if (bottom_blob1.w == 1 && bottom_blob1.h == 1) - { - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - float tt = *ptr1; - - float32x4_t _p2 = vdupq_n_f32(tt); - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - - _p1 = vmulq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 * tt); - in1++; - out++; - } - -#else - float tt = *ptr1; - for (int i = 0; i < size; i++) - { - outptr[i] = (ptr[i] * tt); - } -#endif - } - - ret = 0; - } - else - { -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *in2 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vld1q_f32(in2); - - _p1 = vmulq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - in2 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = *in1 * *in2; - in1++; - in2++; - out++; - } - } - } - } - else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) - { - top_blob.create(w, h, channels); - if (bottom_blob1.w == 1) - { - ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob); - // return ret; - goto out; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float b0 = bottom_blob1[q]; - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vdupq_n_f32(b0); - - _p1 = vmulq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 * b0); - in1++; - out++; - } - } - } - else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) - { - top_blob.create(w1, h1, channels1); - if (top_blob.empty()) - return -100; - - if (bottom_blob.w != bottom_blob1.c) - { - ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob); - goto out; - } - - float *pt = (float *)bottom_blob.data; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float a0 = pt[q]; - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size1 >> 2; - int remain = size1 - (nn << 2); - - float *in1 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vdupq_n_f32(a0); - float32x4_t _p2 = vld1q_f32(in1); - - _p1 = vmulq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (a0 * *in1); - in1++; - out++; - } - } - } - else - ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob); - } - - if (op_type == BinaryOp::Operation_DIV) - { - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { - top_blob.create(w, h, channels); - if (bottom_blob1.w == 1 && bottom_blob1.h == 1) - { - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - float tt = *ptr1; - - float32x4_t _p2 = vdupq_n_f32(tt); - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - - float32x4_t _p3 = vrecpeq_f32(_p2); - _p3 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); - _p1 = vmulq_f32(_p1, _p3); - - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 / tt); - in1++; - out++; - } - -#else - float tt = *ptr1; - for (int i = 0; i < size; i++) - { - outptr[i] = (ptr[i] / tt); - } -#endif - } - - // return 0; - goto out; - } - else - { -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *in2 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vld1q_f32(in2); - - float32x4_t _p3 = vrecpeq_f32(_p2); - _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); - _p1 = vmulq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - in2 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = *in1 / *in2; - in1++; - in2++; - out++; - } - } - } - } - else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) - { - top_blob.create(w, h, channels); - if (bottom_blob1.w == 1) - { - ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob); - // return ret; - goto out; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float b0 = bottom_blob1[q]; - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vdupq_n_f32(b0); - - //_p1 = vsubq_f32(_p1, _p2); - float32x4_t _p3 = vrecpeq_f32(_p2); - _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); - _p1 = vmulq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 / b0); - in1++; - out++; - } - } - } - else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) - { - top_blob.create(w1, h1, channels1); - if (top_blob.empty()) - return -100; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float a0 = bottom_blob[q]; - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size1 >> 2; - int remain = size1 - (nn << 2); - - float *in1 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vdupq_n_f32(a0); - float32x4_t _p2 = vld1q_f32(in1); - - //_p1 = vsubq_f32(_p1, _p2); - float32x4_t _p3 = vrecpeq_f32(_p2); - _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); - _p1 = vmulq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (a0 / *in1); - in1++; - out++; - } - } - } - else - ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob); - } - - if (op_type == BinaryOp::Operation_MAX) - ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_MIN) - ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_POW) - { - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { - top_blob.create(w, h, channels); -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *in2 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vld1q_f32(in2); - - _p1 = pow_ps(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - in2 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = pow(*in1, *in2); - in1++; - in2++; - out++; - } - } - } - else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) - { - top_blob.create(w, h, channels); - if (bottom_blob1.w == 1) - { - ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob); - // return ret; - goto out; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float b0 = bottom_blob1[q]; - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vdupq_n_f32(b0); - - _p1 = pow_ps(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = pow(*in1, b0); - in1++; - out++; - } - } - } - else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) - { - top_blob.create(w1, h1, channels1); - if (top_blob.empty()) - return -100; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float a0 = bottom_blob[q]; - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size1 >> 2; - int remain = size1 - (nn << 2); - - float *in1 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vdupq_n_f32(a0); - float32x4_t _p2 = vld1q_f32(in1); - - _p1 = pow_ps(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = pow(a0, *in1); - in1++; - out++; - } - } - } - else - ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob); - } - - if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE) - { - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { - top_blob.create(w, h, channels); - - if (bottom_blob1.w == 1 && bottom_blob1.h == 1) - { - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - float tt = *ptr1; - - float32x4_t _p2 = vdupq_n_f32(tt); - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - - _p1 = vsubq_f32(_p1, _p2); - _p1 = vmulq_f32(_p1, _p1); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - float t2 = *in1 - tt; - *out = t2 * t2; - in1++; - out++; - } - -#else - float tt = *ptr1; - for (int i = 0; i < size; i++) - { - float t2 = (ptr[i] - tt); - outptr[i] = t2 * t2; - } -#endif - } - - ret = 0; - } - else - { -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *in2 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vld1q_f32(in2); - - _p1 = vsubq_f32(_p1, _p2); - _p1 = vmulq_f32(_p1, _p1); - vst1q_f32(out, _p1); - in1 += 4; - in2 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 - *in2) * (*in1 - *in2); - in1++; - in2++; - out++; - } - } - } - } - else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) - { - top_blob.create(w, h, channels); - if (bottom_blob1.w == 1) - { - ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob); - // return ret; - goto out; - } - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const float *ptr = bottom_blob.channel(q); - const float b0 = bottom_blob1[q]; - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vdupq_n_f32(b0); - - _p1 = vsubq_f32(_p1, _p2); - _p1 = vmulq_f32(_p1, _p1); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (*in1 - b0) * (*in1 - b0); - in1++; - out++; - } - } - } - else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) - { - top_blob.create(w1, h1, channels1); - if (top_blob.empty()) - return -100; - -#pragma omp parallel for - for (int q = 0; q < channels1; q++) - { - const float a0 = bottom_blob[q]; - const float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size1 >> 2; - int remain = size1 - (nn << 2); - - float *in1 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vdupq_n_f32(a0); - float32x4_t _p2 = vld1q_f32(in1); - - _p1 = vsubq_f32(_p1, _p2); - _p1 = vmulq_f32(_p1, _p1); - vst1q_f32(out, _p1); - in1 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = (a0 - *in1) * (a0 - *in1); - in1++; - out++; - } - } - } - else - ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob); - } - -#endif // 0 (Disable operation except Operation_ADD) - -#else - - if (op_type == BinaryOp::Operation_ADD) - ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_SUB) - ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_MUL) - ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_DIV) - ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_MAX) - ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_MIN) - ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob); - - if (op_type == BinaryOp::Operation_POW) - ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob); - if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE) - ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob); -#endif - -/* -for (int p = 0; p < top_blob.c && p < 5; p++) -{ - float* outptr = top_blob.channel(p); - printf("channel: %d\n", p); - for (int i = 0; i < 1; i++) - { - for (int j = 0; j < 5; j++) - { - printf("%f ", outptr[j]); - } - printf("\n"); - outptr += top_blob.w; - } -} -printf("----------------------------\n"); -*/ - -out: - return ret; -} - -int ncnn_binary_op_inplace(const BinaryOpParam ¶m, Mat &bottom_top_blob) -{ - auto op_type = param.op_type; - auto b = param.b; - - // printf("-------------------BinaryOp-----forward_inplace----------\n"); - if (op_type == BinaryOp::Operation_ADD) - return binary_op_scalar_inplace<std::plus<float>>(bottom_top_blob, b); - - if (op_type == BinaryOp::Operation_SUB) - return binary_op_scalar_inplace<std::minus<float>>(bottom_top_blob, b); - - if (op_type == BinaryOp::Operation_MUL) - return binary_op_scalar_inplace<std::multiplies<float>>(bottom_top_blob, b); - - if (op_type == BinaryOp::Operation_DIV) - return binary_op_scalar_inplace<std::divides<float>>(bottom_top_blob, b); - - if (op_type == BinaryOp::Operation_MAX) - return binary_op_scalar_inplace<binary_op_max<float>>(bottom_top_blob, b); - - if (op_type == BinaryOp::Operation_MIN) - return binary_op_scalar_inplace<binary_op_min<float>>(bottom_top_blob, b); - - if (op_type == BinaryOp::Operation_POW) - return binary_op_scalar_inplace<binary_op_pow<float>>(bottom_top_blob, b); - - if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE) - return binary_op_scalar_inplace<binary_op_SquaredDifference<float>>(bottom_top_blob, b); - - return 0; -} - -int ncnn_binary_op_inplace(const BinaryOpParam ¶m, Mat &bottom_blob, Mat &bottom_top_blob) -{ - int ret = 0; - - Mat &bottom_blob1 = bottom_top_blob; - Mat &top_blob = bottom_top_blob; - auto op_type = param.op_type; - - if (op_type == BinaryOp::Operation_ADD) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - -// Unused variables -// int w1 = bottom_blob1.w; -// int h1 = bottom_blob1.h; -// int channels1 = bottom_blob1.c; -// int size1 = w1 * h1; - -#if __ARM_NEON - - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - float *ptr = bottom_blob.channel(q); - float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - int nn = size >> 2; - int remain = size - (nn << 2); - - float *in1 = const_cast<float *>(ptr); - float *in2 = const_cast<float *>(ptr1); - float *out = const_cast<float *>(outptr); - - for (; nn > 0; nn--) - { - float32x4_t _p1 = vld1q_f32(in1); - float32x4_t _p2 = vld1q_f32(in2); - - _p1 = vaddq_f32(_p1, _p2); - vst1q_f32(out, _p1); - in1 += 4; - in2 += 4; - out += 4; - } - for (; remain > 0; remain--) - { - *out = *in1 + *in2; - in1++; - in2++; - out++; - } - } - } -#else - if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) - { -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - float *ptr = bottom_blob.channel(q); - float *ptr1 = bottom_blob1.channel(q); - float *outptr = top_blob.channel(q); - - for (int i = 0; i < size; i++) - { - outptr[i] = ptr[i] + ptr1[i]; - } - } - return 0; - } -#endif - } - else - { - return -1; - } - return ret; -} - -} // namespace ncnn -} // namespace ncnn |