diff options
Diffstat (limited to 'compute/ncnn/src/mat.cc')
-rw-r--r-- | compute/ncnn/src/mat.cc | 940 |
1 file changed, 940 insertions, 0 deletions
diff --git a/compute/ncnn/src/mat.cc b/compute/ncnn/src/mat.cc new file mode 100644 index 000000000..568378ef7 --- /dev/null +++ b/compute/ncnn/src/mat.cc @@ -0,0 +1,940 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ncnn/mat.h" + +#if __ARM_NEON +#include <arm_neon.h> +#endif // __ARM_NEON + +// Fix for nnfw: comment out cpu.h +//#include "cpu.h" + +namespace nnfw +{ +namespace ncnn +{ + +void Mat::substract_mean_normalize(const float *mean_vals, const float *norm_vals) +{ + int size = w * h; + + if (mean_vals && !norm_vals) + { +// substract mean only +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float mean = mean_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean) // %4 + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "0: \n" + "pld [%1, #128] \n" + 
"vld1.f32 {d0-d1}, [%1 :128] \n" + "vsub.f32 q0, q0, q1 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean) // %4 + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr -= mean; + ptr++; + } + } + } + else if (!mean_vals && norm_vals) + { +// normalize only +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float norm = norm_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fmul v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(norm) // %4 + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vmul.f32 q0, q0, q1 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(norm) // %4 + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr *= norm; + ptr++; + } + } + } + else if (mean_vals && norm_vals) + { +// substract mean and normalize +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float mean = mean_vals[q]; + const float norm = norm_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "dup v2.4s, %w5 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "fmul v0.4s, v0.4s, v2.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean), // %4 + "r"(norm) // %5 + : "cc", "memory", "v0", "v1", "v2"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "vdup.f32 q2, %5 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vsub.f32 q0, q0, q1 \n" + "vmul.f32 q0, q0, q2 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean), // %4 + "r"(norm) // %5 + : "cc", "memory", "q0", "q1", "q2"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr = (*ptr - mean) * norm; + ptr++; + } + } + } +} + +// convert half precision floating point to float +static float half2float(unsigned short value) +{ + // 1 : 5 : 10 + unsigned short sign = (value & 0x8000) >> 15; + unsigned short exponent = (value & 0x7c00) >> 10; + unsigned short significand = value & 0x03FF; + + // fprintf(stderr, "%d %d %d\n", sign, exponent, significand); + + // 1 : 8 : 23 + union { + unsigned int u; + float f; + } tmp; + if (exponent == 0) + { + if (significand == 0) + { + // zero + tmp.u = (sign << 31); + } + else + { + // denormal + exponent = 0; + // find non-zero bit + while ((significand & 0x200) == 0) + { + significand <<= 1; + exponent++; + } + significand <<= 1; + significand &= 0x3FF; + tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); + } + } + else if (exponent == 0x1F) + { + // infinity or NaN + tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13); + } + else + { + // normalized + tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); + } + + return tmp.f; +} + +Mat Mat::from_float16(const unsigned short *data, int size) +{ + Mat m(size); + if (m.empty()) + return m; + + float *ptr = m; //.data; + +#if __ARM_NEON && (__ARM_FP & 2) + // Fix for nnfw: Alway support vfpv4 + // int nn = cpu_support_arm_vfpv4() ? 
size >> 2 : 0; + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON && (__ARM_FP & 2) +#if __aarch64__ + if (nn > 0) + { + asm volatile("0: \n" + "ld1 {v0.4h}, [%1], #8 \n" + "fcvtl v1.4s, v0.4h \n" + "subs %w0, %w0, #1 \n" + "st1 {v1.4s}, [%2], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(data), // %1 + "=r"(ptr) // %2 + : "0"(nn), "1"(data), "2"(ptr) + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("0: \n" + "pld [%1, #64] \n" + "vld1.s16 {d0}, [%1 :64]! \n" + "vcvt.f32.f16 q1, d0 \n" + "subs %0, #1 \n" + "vst1.f32 {d2-d3}, [%2 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(data), // %1 + "=r"(ptr) // %2 + : "0"(nn), "1"(data), "2"(ptr) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr = half2float(*data); + + data++; + ptr++; + } + + return m; +} + +static void copy_make_border_image(const Mat &src, Mat &dst, int top, int left, int type, float v) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src; //.data; + float *outptr = dst; //.data; + + if (type == BORDER_CONSTANT) + { + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + // fill center + for (; y < (top + src.h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = v; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = v; + } + ptr += src.w; + outptr += w; + } + // fill bottom + for (; y < h; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + } + else if (type == BORDER_REPLICATE) + { + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + 
{ + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + outptr += w; + } + // fill center + for (; y < (top + src.h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + ptr += src.w; + outptr += w; + } + // fill bottom + ptr -= src.w; + for (; y < h; y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + outptr += w; + } + } +} + +#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) +static void copy_make_border_image_inplace(const Mat &src, Mat &dst, int top, int left, int type, + float v) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src; + float *outptr = dst; + + if (type == BORDER_CONSTANT) + { + // fill bottom + int y = src.h + top; + outptr += y * w; + for (; y < h; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + + // fill center + y = src.h + top - 1; + outptr = dst; + outptr += y * w; + ptr += (src.h - 1) * src.w; + + for (; y >= top; y--) + { + int x = left + src.w; + for (; x < w; x++) + { + outptr[x] = v; + } + + x = left + src.w - 1; + + for (; x >= left; x--) + { + outptr[x] = ptr[x - left]; + } + + for (x = 0; x < left; x++) + { + outptr[x] = v; + } + ptr -= src.w; + outptr -= w; + } + + // fill top + y = 0; + outptr = dst; + for (; y < top; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + 
outptr += w; + } + } +} +#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ + +void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type, + float v) +{ + int w = src.w + left + right; + int h = src.h + top + bottom; + + if (w == src.w && h == src.h) + { + dst = src; + return; + } + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + copy_make_border_image(src, dst, top, left, type, v); + } + else if (src.dims == 3) + { + int channels = src.c; + dst.create(w, h, channels); + if (dst.empty()) + return; + + if (src.data != dst.data) + { +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat borderm = dst.channel(q); + + copy_make_border_image(m, borderm, top, left, type, v); + } + } + else + { +#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) + for (int q = channels - 1; q >= 0; q--) + { + Mat m = src.channel(q); + Mat borderm = dst.channel(q); + copy_make_border_image_inplace(m, borderm, top, left, type, v); + } +#else +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat borderm = dst.channel(q); + + copy_make_border_image(m, borderm, top, left, type, v); + } +#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ + } + } +} + +static void copy_cut_border_image(const Mat &src, Mat &dst, int top, int left) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src.row(top) + left; //.data + src.w * top + left; + float *outptr = dst; //.data; + + for (int y = 0; y < h; y++) + { + if (w < 12) + { + for (int x = 0; x < w; x++) + { + outptr[x] = ptr[x]; + } + } + else + { + memcpy(outptr, ptr, w * sizeof(float)); + } + outptr += w; + ptr += src.w; + } +} + +void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right) +{ + int w = src.w - left - right; + int h = src.h - top - bottom; + +#ifndef _MEMORY_TO_TIME_ + if (w == src.w 
&& h == src.h) + { + dst = src; + return; + } +#endif + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + + copy_cut_border_image(src, dst, top, left); + } + else if (src.dims == 3) + { + int channels = src.c; + + dst.create(w, h, channels); + if (dst.empty()) + return; + +#if !defined(_MEMORY_TO_TIME_) || !defined(_TIME_TO_MEMORY_) +// unroll image channel +#pragma omp parallel for +#endif + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat cutm = dst.channel(q); + + copy_cut_border_image(m, cutm, top, left); + } + } +} + +static void resize_bilinear_image(const Mat &src, Mat &dst, int w, int h) +{ + double scale_x = (double)src.w / w; + double scale_y = (double)src.h / h; + + int *buf = new int[w + h + w * 2 + h * 2]; + + int *xofs = buf; // new int[w]; + int *yofs = buf + w; // new int[h]; + + float *alpha = (float *)(buf + w + h); // new float[w * 2]; + float *beta = (float *)(buf + w + h + w * 2); // new float[h * 2]; + + float fx; + float fy; + int sx; + int sy; + + for (int dx = 0; dx < w; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = fx; // cvFloor(fx); + fx -= sx; + + if (sx >= src.w - 1) + { + sx = src.w - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } + + for (int dy = 0; dy < h; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = fy; // cvFloor(fy); + fy -= sy; + + if (sy >= src.h - 1) + { + sy = src.h - 2; + fy = 1.f; + } + + yofs[dy] = sy; + + beta[dy * 2] = 1.f - fy; + beta[dy * 2 + 1] = fy; + } + + // loop body + Mat rowsbuf0(w + 1); + Mat rowsbuf1(w + 1); + float *rows0 = rowsbuf0; + float *rows1 = rowsbuf1; + + int prev_sy1 = -1; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // hresize one row + float *rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float *S1 = src.row(sy + 1); + + const float *alphap = alpha; + float *rows1p = rows1; + int dx = 0; +#if 
__ARM_NEON + for (; dx + 1 < w; dx += 2) + { + int sx = xofs[dx]; + int sxn = xofs[dx + 1]; + const float *S1p = S1 + sx; + const float *S1np = S1 + sxn; + + float32x4_t _a = vld1q_f32(alphap); + float32x2_t _S1 = vld1_f32(S1p); + float32x2_t _S1n = vld1_f32(S1np); + + float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); + float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); + float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); + + vst1_f32(rows1p + dx, _rows1); + + alphap += 4; + } +#endif // __ARM_NEON + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float *S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + else + { + // hresize two rows + const float *S0 = src.row(sy); + const float *S1 = src.row(sy + 1); + + const float *alphap = alpha; + float *rows0p = rows0; + float *rows1p = rows1; + int dx = 0; +#if __ARM_NEON + for (; dx + 1 < w; dx += 2) + { + int sx = xofs[dx]; + int sxn = xofs[dx + 1]; + const float *S0p = S0 + sx; + const float *S1p = S1 + sx; + const float *S0np = S0 + sxn; + const float *S1np = S1 + sxn; + + float32x4_t _a = vld1q_f32(alphap); + float32x2_t _S0 = vld1_f32(S0p); + float32x2_t _S1 = vld1_f32(S1p); + float32x2_t _S0n = vld1_f32(S0np); + float32x2_t _S1n = vld1_f32(S1np); + + float32x4_t _S0S0n = vcombine_f32(_S0, _S0n); + float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); + float32x4_t _ms0 = vmulq_f32(_S0S0n, _a); + float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); + float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); + float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); + + vst1_f32(rows0p + dx, _rows0); + vst1_f32(rows1p + dx, _rows1); + + alphap += 4; + } +#endif // __ARM_NEON + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float *S0p = S0 + sx; + const float *S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * 
a1; + + alphap += 2; + } + } + + prev_sy1 = sy + 1; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + + float *rows0p = rows0; + float *rows1p = rows1; + float *Dp = dst.row(dy); + +#if __ARM_NEON + int nn = w >> 3; +#else + int nn = 0; +#endif + int remain = w - (nn << 3); + +#if __ARM_NEON + float32x4_t _b0 = vdupq_n_f32(b0); + float32x4_t _b1 = vdupq_n_f32(b1); + for (; nn > 0; nn--) + { + float32x4_t _rows0 = vld1q_f32(rows0p); + float32x4_t _rows1 = vld1q_f32(rows1p); + + float32x4_t _D = vmulq_f32(_rows0, _b0); + _D = vmlaq_f32(_D, _rows1, _b1); + + vst1q_f32(Dp, _D); + + float32x4_t _rows0n = vld1q_f32(rows0p + 4); + float32x4_t _rows1n = vld1q_f32(rows1p + 4); + + float32x4_t _Dn = vmulq_f32(_rows0n, _b0); + _Dn = vmlaq_f32(_Dn, _rows1n, _b1); + + vst1q_f32(Dp + 4, _Dn); + + Dp += 8; + rows0p += 8; + rows1p += 8; + } +#endif // __ARM_NEON + for (; remain; --remain) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + beta += 2; + } + + delete[] buf; +} + +void resize_bilinear(const Mat &src, Mat &dst, int w, int h) +{ + if (w == src.w && h == src.h) + { + dst = src; + return; + } + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + + resize_bilinear_image(src, dst, w, h); + } + else if (src.dims == 3) + { + int channels = src.c; + + dst.create(w, h, channels); + if (dst.empty()) + return; + +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat resizem = dst.channel(q); + + resize_bilinear_image(m, resizem, w, h); + } + } +} + +} // namespace ncnn +} // namespace nnfw |