diff options
Diffstat (limited to 'compute/ncnn/src/mat.cc')
-rw-r--r-- | compute/ncnn/src/mat.cc | 940 |
1 files changed, 0 insertions, 940 deletions
diff --git a/compute/ncnn/src/mat.cc b/compute/ncnn/src/mat.cc deleted file mode 100644 index 568378ef7..000000000 --- a/compute/ncnn/src/mat.cc +++ /dev/null @@ -1,940 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ncnn/mat.h" - -#if __ARM_NEON -#include <arm_neon.h> -#endif // __ARM_NEON - -// Fix for nnfw: comment out cpu.h -//#include "cpu.h" - -namespace nnfw -{ -namespace ncnn -{ - -void Mat::substract_mean_normalize(const float *mean_vals, const float *norm_vals) -{ - int size = w * h; - - if (mean_vals && !norm_vals) - { -// substract mean only -#pragma omp parallel for - for (int q = 0; q < c; q++) - { - float *ptr = channel(q); // data + cstep * q; - const float mean = mean_vals[q]; - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); -#else - int remain = size; -#endif // __ARM_NEON - -#if __ARM_NEON -#if __aarch64__ - if (nn > 0) - { - asm volatile("dup v1.4s, %w4 \n" - "0: \n" - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.4s}, [%1] \n" - "fsub v0.4s, v0.4s, v1.4s \n" - "subs %w0, %w0, #1 \n" - "st1 {v0.4s}, [%1], #16 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(ptr) // %1 - : "0"(nn), "1"(ptr), - "r"(mean) // %4 - : "cc", "memory", "v0", "v1"); - } -#else - if (nn > 0) - { - asm volatile("vdup.f32 q1, %4 \n" - "0: \n" - "pld [%1, #128] \n" - "vld1.f32 {d0-d1}, [%1 :128] \n" - "vsub.f32 q0, q0, q1 \n" - "subs %0, #1 \n" - "vst1.f32 {d0-d1}, [%1 :128]! \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(ptr) // %1 - : "0"(nn), "1"(ptr), - "r"(mean) // %4 - : "cc", "memory", "q0", "q1"); - } -#endif // __aarch64__ -#endif // __ARM_NEON - for (; remain > 0; remain--) - { - *ptr -= mean; - ptr++; - } - } - } - else if (!mean_vals && norm_vals) - { -// normalize only -#pragma omp parallel for - for (int q = 0; q < c; q++) - { - float *ptr = channel(q); // data + cstep * q; - const float norm = norm_vals[q]; - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); -#else - int remain = size; -#endif // __ARM_NEON - -#if __ARM_NEON -#if __aarch64__ - if (nn > 0) - { - asm volatile("dup v1.4s, %w4 \n" - "0: \n" - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.4s}, [%1] \n" - "fmul v0.4s, v0.4s, v1.4s \n" - "subs %w0, %w0, #1 \n" - "st1 {v0.4s}, [%1], #16 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(ptr) // %1 - : "0"(nn), "1"(ptr), - "r"(norm) // %4 - : "cc", "memory", "v0", "v1"); - } -#else - if (nn > 0) - { - asm volatile("vdup.f32 q1, %4 \n" - "0: \n" - "pld [%1, #128] \n" - "vld1.f32 {d0-d1}, [%1 :128] \n" - "vmul.f32 q0, q0, q1 \n" - "subs %0, #1 \n" - "vst1.f32 {d0-d1}, [%1 :128]! \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(ptr) // %1 - : "0"(nn), "1"(ptr), - "r"(norm) // %4 - : "cc", "memory", "q0", "q1"); - } -#endif // __aarch64__ -#endif // __ARM_NEON - for (; remain > 0; remain--) - { - *ptr *= norm; - ptr++; - } - } - } - else if (mean_vals && norm_vals) - { -// substract mean and normalize -#pragma omp parallel for - for (int q = 0; q < c; q++) - { - float *ptr = channel(q); // data + cstep * q; - const float mean = mean_vals[q]; - const float norm = norm_vals[q]; - -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); -#else - int remain = size; -#endif // __ARM_NEON - -#if __ARM_NEON -#if __aarch64__ - if (nn > 0) - { - asm volatile("dup v1.4s, %w4 \n" - "dup v2.4s, %w5 \n" - "0: \n" - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.4s}, [%1] \n" - "fsub v0.4s, v0.4s, v1.4s \n" - "fmul v0.4s, v0.4s, v2.4s \n" - "subs %w0, %w0, #1 \n" - "st1 {v0.4s}, [%1], #16 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(ptr) // %1 - : "0"(nn), "1"(ptr), - "r"(mean), // %4 - "r"(norm) // %5 - : "cc", "memory", "v0", "v1", "v2"); - } -#else - if (nn > 0) - { - asm volatile("vdup.f32 q1, %4 \n" - "vdup.f32 q2, %5 \n" - "0: \n" - "pld [%1, #128] \n" - "vld1.f32 {d0-d1}, [%1 :128] \n" - "vsub.f32 q0, q0, q1 \n" - "vmul.f32 q0, q0, q2 \n" - "subs %0, #1 \n" - "vst1.f32 {d0-d1}, [%1 :128]! \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(ptr) // %1 - : "0"(nn), "1"(ptr), - "r"(mean), // %4 - "r"(norm) // %5 - : "cc", "memory", "q0", "q1", "q2"); - } -#endif // __aarch64__ -#endif // __ARM_NEON - for (; remain > 0; remain--) - { - *ptr = (*ptr - mean) * norm; - ptr++; - } - } - } -} - -// convert half precision floating point to float -static float half2float(unsigned short value) -{ - // 1 : 5 : 10 - unsigned short sign = (value & 0x8000) >> 15; - unsigned short exponent = (value & 0x7c00) >> 10; - unsigned short significand = value & 0x03FF; - - // fprintf(stderr, "%d %d %d\n", sign, exponent, significand); - - // 1 : 8 : 23 - union { - unsigned int u; - float f; - } tmp; - if (exponent == 0) - { - if (significand == 0) - { - // zero - tmp.u = (sign << 31); - } - else - { - // denormal - exponent = 0; - // find non-zero bit - while ((significand & 0x200) == 0) - { - significand <<= 1; - exponent++; - } - significand <<= 1; - significand &= 0x3FF; - tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); - } - } - else if (exponent == 0x1F) - { - // infinity or NaN - tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13); - } - else - { - // normalized - tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); - } - - return tmp.f; -} - -Mat Mat::from_float16(const unsigned short *data, int size) -{ - Mat m(size); - if (m.empty()) - return m; - - float *ptr = m; //.data; - -#if __ARM_NEON && (__ARM_FP & 2) - // Fix for nnfw: Alway support vfpv4 - // int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0; - int nn = size >> 2; - int remain = size - (nn << 2); -#else - int remain = size; -#endif // __ARM_NEON - -#if __ARM_NEON && (__ARM_FP & 2) -#if __aarch64__ - if (nn > 0) - { - asm volatile("0: \n" - "ld1 {v0.4h}, [%1], #8 \n" - "fcvtl v1.4s, v0.4h \n" - "subs %w0, %w0, #1 \n" - "st1 {v1.4s}, [%2], #16 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(data), // %1 - "=r"(ptr) // %2 - : "0"(nn), "1"(data), "2"(ptr) - : "cc", "memory", "v0", "v1"); - } -#else - if (nn > 0) - { - asm volatile("0: \n" - "pld [%1, #64] \n" - "vld1.s16 {d0}, [%1 :64]! \n" - "vcvt.f32.f16 q1, d0 \n" - "subs %0, #1 \n" - "vst1.f32 {d2-d3}, [%2 :128]! \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(data), // %1 - "=r"(ptr) // %2 - : "0"(nn), "1"(data), "2"(ptr) - : "cc", "memory", "q0", "q1"); - } -#endif // __aarch64__ -#endif // __ARM_NEON - for (; remain > 0; remain--) - { - *ptr = half2float(*data); - - data++; - ptr++; - } - - return m; -} - -static void copy_make_border_image(const Mat &src, Mat &dst, int top, int left, int type, float v) -{ - int w = dst.w; - int h = dst.h; - - const float *ptr = src; //.data; - float *outptr = dst; //.data; - - if (type == BORDER_CONSTANT) - { - int y = 0; - // fill top - for (; y < top; y++) - { - int x = 0; - for (; x < w; x++) - { - outptr[x] = v; - } - outptr += w; - } - // fill center - for (; y < (top + src.h); y++) - { - int x = 0; - for (; x < left; x++) - { - outptr[x] = v; - } - if (src.w < 12) - { - for (; x < (left + src.w); x++) - { - outptr[x] = ptr[x - left]; - } - } - else - { - memcpy(outptr + left, ptr, src.w * sizeof(float)); - x += src.w; - } - for (; x < w; x++) - { - outptr[x] = v; - } - ptr += src.w; - outptr += w; - } - // fill bottom - for (; y < h; y++) - { - int x = 0; - for (; x < w; x++) - { - outptr[x] = v; - } - outptr += w; - } - } - else if (type == BORDER_REPLICATE) - { - int y = 0; - // fill top - for (; y < top; y++) - { - int x = 0; - for (; x < left; x++) - { - outptr[x] = ptr[0]; - } - if (src.w < 12) - { - for (; x < (left + src.w); x++) - { - outptr[x] = ptr[x - left]; - } - } - else - { - memcpy(outptr + left, ptr, src.w * sizeof(float)); - x += src.w; - } - for (; x < w; x++) - { - outptr[x] = ptr[src.w - 1]; - } - outptr += w; - } - // fill center - for (; y < (top + src.h); y++) - { - int x = 0; - for (; x < left; x++) - { - outptr[x] = ptr[0]; - } - if (src.w < 12) - { - for (; x < (left + src.w); x++) - { - outptr[x] = ptr[x - left]; - } - } - else - { - memcpy(outptr + left, ptr, src.w * sizeof(float)); - x += src.w; - } - for (; x < w; x++) - { - outptr[x] = ptr[src.w - 1]; - } - ptr += src.w; - outptr += w; - } - // fill bottom - ptr -= src.w; - for (; y < h; y++) - { - int x = 0; - for (; x < left; x++) - { - outptr[x] = ptr[0]; - } - if (src.w < 12) - { - for (; x < (left + src.w); x++) - { - outptr[x] = ptr[x - left]; - } - } - else - { - memcpy(outptr + left, ptr, src.w * sizeof(float)); - x += src.w; - } - for (; x < w; x++) - { - outptr[x] = ptr[src.w - 1]; - } - outptr += w; - } - } -} - -#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) -static void copy_make_border_image_inplace(const Mat &src, Mat &dst, int top, int left, int type, - float v) -{ - int w = dst.w; - int h = dst.h; - - const float *ptr = src; - float *outptr = dst; - - if (type == BORDER_CONSTANT) - { - // fill bottom - int y = src.h + top; - outptr += y * w; - for (; y < h; y++) - { - int x = 0; - for (; x < w; x++) - { - outptr[x] = v; - } - outptr += w; - } - - // fill center - y = src.h + top - 1; - outptr = dst; - outptr += y * w; - ptr += (src.h - 1) * src.w; - - for (; y >= top; y--) - { - int x = left + src.w; - for (; x < w; x++) - { - outptr[x] = v; - } - - x = left + src.w - 1; - - for (; x >= left; x--) - { - outptr[x] = ptr[x - left]; - } - - for (x = 0; x < left; x++) - { - outptr[x] = v; - } - ptr -= src.w; - outptr -= w; - } - - // fill top - y = 0; - outptr = dst; - for (; y < top; y++) - { - int x = 0; - for (; x < w; x++) - { - outptr[x] = v; - } - outptr += w; - } - } -} -#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ - -void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type, - float v) -{ - int w = src.w + left + right; - int h = src.h + top + bottom; - - if (w == src.w && h == src.h) - { - dst = src; - return; - } - - if (src.dims == 2) - { - dst.create(w, h); - if (dst.empty()) - return; - copy_make_border_image(src, dst, top, left, type, v); - } - else if (src.dims == 3) - { - int channels = src.c; - dst.create(w, h, channels); - if (dst.empty()) - return; - - if (src.data != dst.data) - { -// unroll image channel -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const Mat m = src.channel(q); - Mat borderm = dst.channel(q); - - copy_make_border_image(m, borderm, top, left, type, v); - } - } - else - { -#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) - for (int q = channels - 1; q >= 0; q--) - { - Mat m = src.channel(q); - Mat borderm = dst.channel(q); - copy_make_border_image_inplace(m, borderm, top, left, type, v); - } -#else -// unroll image channel -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const Mat m = src.channel(q); - Mat borderm = dst.channel(q); - - copy_make_border_image(m, borderm, top, left, type, v); - } -#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ - } - } -} - -static void copy_cut_border_image(const Mat &src, Mat &dst, int top, int left) -{ - int w = dst.w; - int h = dst.h; - - const float *ptr = src.row(top) + left; //.data + src.w * top + left; - float *outptr = dst; //.data; - - for (int y = 0; y < h; y++) - { - if (w < 12) - { - for (int x = 0; x < w; x++) - { - outptr[x] = ptr[x]; - } - } - else - { - memcpy(outptr, ptr, w * sizeof(float)); - } - outptr += w; - ptr += src.w; - } -} - -void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right) -{ - int w = src.w - left - right; - int h = src.h - top - bottom; - -#ifndef _MEMORY_TO_TIME_ - if (w == src.w && h == src.h) - { - dst = src; - return; - } -#endif - - if (src.dims == 2) - { - dst.create(w, h); - if (dst.empty()) - return; - - copy_cut_border_image(src, dst, top, left); - } - else if (src.dims == 3) - { - int channels = src.c; - - dst.create(w, h, channels); - if (dst.empty()) - return; - -#if !defined(_MEMORY_TO_TIME_) || !defined(_TIME_TO_MEMORY_) -// unroll image channel -#pragma omp parallel for -#endif - for (int q = 0; q < channels; q++) - { - const Mat m = src.channel(q); - Mat cutm = dst.channel(q); - - copy_cut_border_image(m, cutm, top, left); - } - } -} - -static void resize_bilinear_image(const Mat &src, Mat &dst, int w, int h) -{ - double scale_x = (double)src.w / w; - double scale_y = (double)src.h / h; - - int *buf = new int[w + h + w * 2 + h * 2]; - - int *xofs = buf; // new int[w]; - int *yofs = buf + w; // new int[h]; - - float *alpha = (float *)(buf + w + h); // new float[w * 2]; - float *beta = (float *)(buf + w + h + w * 2); // new float[h * 2]; - - float fx; - float fy; - int sx; - int sy; - - for (int dx = 0; dx < w; dx++) - { - fx = (float)((dx + 0.5) * scale_x - 0.5); - sx = fx; // cvFloor(fx); - fx -= sx; - - if (sx >= src.w - 1) - { - sx = src.w - 2; - fx = 1.f; - } - - xofs[dx] = sx; - - alpha[dx * 2] = 1.f - fx; - alpha[dx * 2 + 1] = fx; - } - - for (int dy = 0; dy < h; dy++) - { - fy = (float)((dy + 0.5) * scale_y - 0.5); - sy = fy; // cvFloor(fy); - fy -= sy; - - if (sy >= src.h - 1) - { - sy = src.h - 2; - fy = 1.f; - } - - yofs[dy] = sy; - - beta[dy * 2] = 1.f - fy; - beta[dy * 2 + 1] = fy; - } - - // loop body - Mat rowsbuf0(w + 1); - Mat rowsbuf1(w + 1); - float *rows0 = rowsbuf0; - float *rows1 = rowsbuf1; - - int prev_sy1 = -1; - - for (int dy = 0; dy < h; dy++) - { - int sy = yofs[dy]; - - if (sy == prev_sy1) - { - // hresize one row - float *rows0_old = rows0; - rows0 = rows1; - rows1 = rows0_old; - const float *S1 = src.row(sy + 1); - - const float *alphap = alpha; - float *rows1p = rows1; - int dx = 0; -#if __ARM_NEON - for (; dx + 1 < w; dx += 2) - { - int sx = xofs[dx]; - int sxn = xofs[dx + 1]; - const float *S1p = S1 + sx; - const float *S1np = S1 + sxn; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _S1 = vld1_f32(S1p); - float32x2_t _S1n = vld1_f32(S1np); - - float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); - float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } -#endif // __ARM_NEON - for (; dx < w; dx++) - { - int sx = xofs[dx]; - const float *S1p = S1 + sx; - - float a0 = alphap[0]; - float a1 = alphap[1]; - rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; - - alphap += 2; - } - } - else - { - // hresize two rows - const float *S0 = src.row(sy); - const float *S1 = src.row(sy + 1); - - const float *alphap = alpha; - float *rows0p = rows0; - float *rows1p = rows1; - int dx = 0; -#if __ARM_NEON - for (; dx + 1 < w; dx += 2) - { - int sx = xofs[dx]; - int sxn = xofs[dx + 1]; - const float *S0p = S0 + sx; - const float *S1p = S1 + sx; - const float *S0np = S0 + sxn; - const float *S1np = S1 + sxn; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _S0 = vld1_f32(S0p); - float32x2_t _S1 = vld1_f32(S1p); - float32x2_t _S0n = vld1_f32(S0np); - float32x2_t _S1n = vld1_f32(S1np); - - float32x4_t _S0S0n = vcombine_f32(_S0, _S0n); - float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); - float32x4_t _ms0 = vmulq_f32(_S0S0n, _a); - float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - - vst1_f32(rows0p + dx, _rows0); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } -#endif // __ARM_NEON - for (; dx < w; dx++) - { - int sx = xofs[dx]; - const float *S0p = S0 + sx; - const float *S1p = S1 + sx; - - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; - rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; - - alphap += 2; - } - } - - prev_sy1 = sy + 1; - - // vresize - float b0 = beta[0]; - float b1 = beta[1]; - - float *rows0p = rows0; - float *rows1p = rows1; - float *Dp = dst.row(dy); - -#if __ARM_NEON - int nn = w >> 3; -#else - int nn = 0; -#endif - int remain = w - (nn << 3); - -#if __ARM_NEON - float32x4_t _b0 = vdupq_n_f32(b0); - float32x4_t _b1 = vdupq_n_f32(b1); - for (; nn > 0; nn--) - { - float32x4_t _rows0 = vld1q_f32(rows0p); - float32x4_t _rows1 = vld1q_f32(rows1p); - - float32x4_t _D = vmulq_f32(_rows0, _b0); - _D = vmlaq_f32(_D, _rows1, _b1); - - vst1q_f32(Dp, _D); - - float32x4_t _rows0n = vld1q_f32(rows0p + 4); - float32x4_t _rows1n = vld1q_f32(rows1p + 4); - - float32x4_t _Dn = vmulq_f32(_rows0n, _b0); - _Dn = vmlaq_f32(_Dn, _rows1n, _b1); - - vst1q_f32(Dp + 4, _Dn); - - Dp += 8; - rows0p += 8; - rows1p += 8; - } -#endif // __ARM_NEON - for (; remain; --remain) - { - // D[x] = rows0[x]*b0 + rows1[x]*b1; - *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; - } - - beta += 2; - } - - delete[] buf; -} - -void resize_bilinear(const Mat &src, Mat &dst, int w, int h) -{ - if (w == src.w && h == src.h) - { - dst = src; - return; - } - - if (src.dims == 2) - { - dst.create(w, h); - if (dst.empty()) - return; - - resize_bilinear_image(src, dst, w, h); - } - else if (src.dims == 3) - { - int channels = src.c; - - dst.create(w, h, channels); - if (dst.empty()) - return; - -// unroll image channel -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { - const Mat m = src.channel(q); - Mat resizem = dst.channel(q); - - resize_bilinear_image(m, resizem, w, h); - } - } -} - -} // namespace ncnn -} // namespace nnfw |