Diffstat (limited to 'runtimes/libs/srcn/src/direct_conv_colmajor.cc')
-rw-r--r-- | runtimes/libs/srcn/src/direct_conv_colmajor.cc | 5872
1 file changed, 5872 insertions, 0 deletions
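For orientation, the new file implements a direct (im2col-free) convolution for channel-contiguous ("column-major") tensors, hand-vectorized with AArch64 NEON inline assembly. Below is a minimal scalar sketch of the computation those kernels perform, assuming the layouts implied by the indexing in the diff: input [h][w][inch] with channels contiguous per pixel, weights [kernel_h*kernel_w][inch][outch], output [outh][outw][outch], and zero padding given by pad_top/pad_left. The function direct_conv_ref and its parameter list are illustrative only and are not part of the patch.

/*
 * Illustrative scalar reference (assumption: layouts inferred from the
 * indexing in direct_conv_l below; not part of the patch itself).
 */
static void direct_conv_ref(const float *in, const float *ker, float *out,
                            int w, int h, int inch, int outw, int outh, int outch,
                            int kernel_w, int kernel_h, int stride,
                            int pad_top, int pad_left)
{
  for (int p = 0; p < outh; p++)
    for (int q = 0; q < outw; q++)
      for (int oc = 0; oc < outch; oc++)
      {
        float sum = 0.f;
        for (int m = 0; m < kernel_w * kernel_h; m++)
        {
          // Input coordinates for kernel tap m at output position (p, q).
          const int y = p * stride + m / kernel_w - pad_top;
          const int x = q * stride + m % kernel_w - pad_left;
          if (y < 0 || y >= h || x < 0 || x >= w)
            continue; // zero padding: taps outside the input contribute nothing
          for (int i = 0; i < inch; i++)
            sum += in[(y * w + x) * inch + i] * ker[(m * inch + i) * outch + oc];
        }
        out[(p * outw + q) * outch + oc] = sum;
      }
}

The assembly in the diff computes the same sums, but accumulates kernel position by kernel position into the cleared output and processes four output columns and four output channels per inner-loop iteration.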
diff --git a/runtimes/libs/srcn/src/direct_conv_colmajor.cc b/runtimes/libs/srcn/src/direct_conv_colmajor.cc new file mode 100644 index 000000000..394ea6d58 --- /dev/null +++ b/runtimes/libs/srcn/src/direct_conv_colmajor.cc @@ -0,0 +1,5872 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include <stdlib.h> +#include <arm_neon.h> +#include "srcn/conv_type.h" + +namespace nnfw +{ +namespace srcn +{ + +#if __aarch64__ +static void direct_conv_l(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img0 = + bottom_blob.data + (m / kernel_w - pad_top) * w * inch + (m % kernel_w - pad_left) * inch; + +#ifdef _OPENMP +#pragma omp parallel for +#endif // _OPENMP + for (int p = 0; p < outh; p++) + { + float *out0 = top_blob.data + p * outw * outch; + + // clear output + if (m == 0) + { + for (int j = 0; j < outw * outch; j++) + { + *(out0 + j) = 0.f; + } + } + + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + const float *img1 = img0 + p * w * inch * _stride; + + int q = 0; + for (; q + 3 < outw; /*q += 4*/) + { + if (padding) + { + if (((q + 3) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 4; + img1 += inch * _stride * 4; + q += 4; + continue; + } + else if ((q + 3) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + int delta = (pad_left - m % kernel_w) / _stride - q; + delta += (pad_left - m % kernel_w) % _stride ? 
1 : 0; + out0 += outch * delta; + img1 += inch * _stride * delta; + q += delta; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *_x2 = img1 + inch * _stride * 2; + const float *_x3 = img1 + inch * _stride * 3; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + register float32x4_t rx2 asm("v16") = vld1q_f32(_x2); + register float32x4_t rx3 asm("v17") = vld1q_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + 
"1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n), [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", 
"v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v30.2s, v8.2s, %[rx2].s[2]\n" + "fmla v31.2s, v8.2s, %[rx3].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + "fmla v30.2s, v9.2s, %[rx2].s[3]\n" + "fmla v31.2s, v9.2s, %[rx3].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15", "v30", + "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x2 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x2 + 3)); + + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x3 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x3 + 3)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + _x2 += 4; + _x3 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, 
v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", 
"memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)); + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + _x2 += 2; + _x3 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_dup_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_dup_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, 
[%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + *outptr2 += (*kernel0) * (*_x2); + *outptr3 += (*kernel0) * (*_x3); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + _x0 += 1; + _x1 += 1; + _x2 += 1; + _x3 += 1; + } + + img1 += inch * 4 * _stride; + out0 += outch * 4; + q += 4; + } + + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + 
{ + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 
{v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "st1 
{v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, 
v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 
{v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + + float *outptr0 = out0; + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla 
v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), 
[outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} + +static void direct_conv_s(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int p = 0; p < outh; p++) + { + const float *img0 = bottom_blob.data + (p * _stride - pad_top) * w * inch; + float *out = top_blob.data + p * outw * outch; + + // clear output + for (int j = 0; j < outw * outch; j++) + { + *(out + j) = 0.f; + } + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + float *out0 = out; + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img1 = img0 + (m / kernel_w) * w * inch + (m % kernel_w - pad_left) * inch; + + int q = 0; + for (; q + 3 < outw; /*q += 4*/) + { + if (padding) + { + if (((q + 3) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 4; + img1 += inch * _stride * 4; + q += 4; + continue; + } + else if ((q + 3) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + int delta = (pad_left - m % kernel_w) / _stride - q; + delta += (pad_left - m % kernel_w) % _stride ? 
1 : 0; + out0 += outch * delta; + img1 += inch * _stride * delta; + q += delta; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *_x2 = img1 + inch * _stride * 2; + const float *_x3 = img1 + inch * _stride * 3; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + register float32x4_t rx2 asm("v16") = vld1q_f32(_x2); + register float32x4_t rx3 asm("v17") = vld1q_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + 
"1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n), [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", 
"v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v30.2s, v8.2s, %[rx2].s[2]\n" + "fmla v31.2s, v8.2s, %[rx3].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + "fmla v30.2s, v9.2s, %[rx2].s[3]\n" + "fmla v31.2s, v9.2s, %[rx3].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15", "v30", + "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x2 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x2 + 3)); + + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x3 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x3 + 3)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + _x2 += 4; + _x3 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, 
v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", 
"memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)); + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + _x2 += 2; + _x3 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_dup_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_dup_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, 
[%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + *outptr2 += (*kernel0) * (*_x2); + *outptr3 += (*kernel0) * (*_x3); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + _x0 += 1; + _x1 += 1; + _x2 += 1; + _x3 += 1; + } + + img1 += inch * 4 * _stride; + out0 += outch * 4; + q += 4; + } + + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + 
{ + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 
{v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "st1 
{v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, 
v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 
{v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + + float *outptr0 = out0; + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla 
v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), 
[outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} + +#else // __aarch64__ +static void direct_conv_l(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img0 = + bottom_blob.data + (m / kernel_w - pad_top) * w * inch + (m % kernel_w - pad_left) * inch; + +#ifdef _OPENMP +#pragma omp parallel for +#endif // _OPENMP + for (int p = 0; p < outh; p++) + { + float *out0 = top_blob.data + p * outw * outch; + // clear output. + if (m == 0) + { + for (int j = 0; j < outw * outch; j++) + { + *(out0 + j) = 0.f; + } + } + + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + const float *img1 = img0 + p * w * inch * _stride; + + int q = 0; + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("q5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + 
"vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, 
%[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d30, d12, %e[rx1][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d30, d14, %e[rx1][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d30, d16, %f[rx1][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + "vmla.f32 d30, d18, %f[rx1][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "cmp 
%[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + "vmla.f32 d30, d14, %P[rx1][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + 
"vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + bottom_blob.w) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, 
[r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add 
%[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], 
#16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14" + +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} + +static void direct_conv_s(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + +#ifdef _OPENMP +#pragma omp parallel for +#endif // _OPENMP + for (int p = 0; p < outh; p++) + { + const float *img0 = bottom_blob.data + (p * _stride - pad_top) * w * inch; + float *out = top_blob.data + p * outw * outch; + + // clear output. 
+ for (int j = 0; j < outw * outch; j++) + { + *(out + j) = 0.f; + } + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + float *out0 = out; + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img1 = img0 + (m / kernel_w) * w * inch + (m % kernel_w - pad_left) * inch; + + int q = 0; + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("q5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, 
%f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d30, d12, %e[rx1][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d30, d14, %e[rx1][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d30, d16, %f[rx1][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + "vmla.f32 d30, d18, %f[rx1][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = 
vld1_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + "vmla.f32 d30, d14, %P[rx1][1]\n" + + "vst1.f32 {d28}, 
[%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, 
[%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add 
r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, 
[r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", 
"memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} +#endif // __aarch64__ + +void direct_conv_colmajor(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &kernel, const convParams_t ¶ms, int num_threads) +{ + omp_set_num_threads(num_threads); + + if (bottom_blob.c * top_blob.c < 256 * 256) + { + direct_conv_s(bottom_blob, top_blob, kernel, params.stride_w, params.padding, params.pad_h, + params.pad_w); + return; + } + + direct_conv_l(bottom_blob, top_blob, kernel, params.stride_w, params.padding, params.pad_h, + params.pad_w); +} + +} // namespace srcn +} // namespace nnfw |