diff options
Diffstat (limited to 'compute/ncnn/src/layer/instance_norm.cc')
-rw-r--r-- | compute/ncnn/src/layer/instance_norm.cc | 371 |
1 file changed, 0 insertions, 371 deletions
diff --git a/compute/ncnn/src/layer/instance_norm.cc b/compute/ncnn/src/layer/instance_norm.cc deleted file mode 100644 index 08c3f2c23..000000000 --- a/compute/ncnn/src/layer/instance_norm.cc +++ /dev/null @@ -1,371 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#include "ncnn/layer/instance_norm.h" -#ifdef _OPENMP -#include <omp.h> -#endif - -#include <math.h> -#include "ncnn/mat.h" -#ifdef __ARM_NEON -#include <arm_neon.h> -#endif // __ARM_NEON - -namespace nnfw -{ -namespace ncnn -{ - -void ncnn_instance_norm_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, - int channels, float eps) -{ - // x = (x - mean) / (sqrt(var) + eps) * gamma + beta - - int w = in_mat.w; - int h = in_mat.h; - int size = w * h; -#ifdef __ARM_NEON - int nn = size >> 2; - int left4 = size & 3; -#endif - -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { -#ifdef __ARM_NEON - float *in_ptr = in_mat.channel(q); - float *out_ptr = out_mat.channel(q); - float32x4_t _sum = vdupq_n_f32(0.f); - float32x4_t _sq_sum = vdupq_n_f32(0.f); - for (int n = nn; n > 0; n--) - { - float32x4_t _p = vld1q_f32(in_ptr); - _sum = vaddq_f32(_sum, _p); - _p = vmulq_f32(_p, _p); - _sq_sum = vaddq_f32(_sq_sum, _p); - in_ptr += 4; - } - float sum = vgetq_lane_f32(_sum, 0) + vgetq_lane_f32(_sum, 1); - sum += vgetq_lane_f32(_sum, 2); - sum += vgetq_lane_f32(_sum, 3); - float sqsum = vgetq_lane_f32(_sq_sum, 0) + vgetq_lane_f32(_sq_sum, 1); - sqsum += vgetq_lane_f32(_sq_sum, 2); - sqsum += vgetq_lane_f32(_sq_sum, 3); - - for (int left = left4; left > 0; left--) - { - sum += *in_ptr; - sqsum += (*in_ptr) * (*in_ptr); - in_ptr++; - } - - float mean = sum / size; - float var = sqsum / size - mean * mean; - float gamma = gamma_mat[q]; - float beta = beta_mat[q]; - float a = gamma / (sqrt(var + eps)); - float b = -mean * a + beta; - - in_ptr = in_mat.channel(q); - float32x4_t _a = vdupq_n_f32(a); - float32x4_t _b = vdupq_n_f32(b); - for (int n = nn; n > 0; n--) - { - float32x4_t _p = vld1q_f32(in_ptr); - _p = vmulq_f32(_p, _a); - _p = vaddq_f32(_p, _b); - vst1q_f32(out_ptr, _p); - in_ptr += 4; - out_ptr += 4; - } - for (int left = left4; left > 0; left--) - { - *out_ptr = (*in_ptr) * a + b; - in_ptr++; - out_ptr++; - } -#else - float *in_ptr = 
in_mat.channel(q); - float *out_ptr = out_mat.channel(q); - // mean and var - float sum = 0.f; - float sqsum = 0.f; - for (int i = 0; i < size; i++) - { - sum += in_ptr[i]; - sqsum += in_ptr[i] * in_ptr[i]; - } - float mean = sum / size; - float var = sqsum / size - mean * mean; - - float gamma = gamma_mat[q]; - float beta = beta_mat[q]; - - float a = gamma / (sqrt(var + eps)); - float b = -mean * a + beta; - for (int i = 0; i < size; i++) - { - out_ptr[i] = in_ptr[i] * a + b; - } -#endif - } -} - -void ncnn_instance_norm_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, - int /*channels*/, float eps) -{ - // Treat CHW layout as HWC layout - int h = in_mat.c; - int w = in_mat.h; - int c = in_mat.w; - - int size = w * h; - int total = size * c; - - float sum[c] = {}; - float sqsum[c] = {}; - - float mean[c] = {}; - float var[c] = {}; - float a[c] = {}; - float b[c] = {}; - - float *in_ptr = in_mat.channel(0); - float *out_ptr = out_mat.channel(0); - -#pragma omp parallel for reduction(+ : sum, sqsum) schedule(guided) - for (int i = 0; i < total; i += c) - { - for (int j = 0; j < c; j++) - { - sum[j] += in_ptr[i + j]; - sqsum[j] += in_ptr[i + j] * in_ptr[i + j]; - } - } - - for (int i = 0; i < c; i++) - { - mean[i] = sum[i] / size; - var[i] = sqsum[i] / size - mean[i] * mean[i]; - a[i] = gamma_mat[i] / (sqrt(var[i] + eps)); - b[i] = -mean[i] * a[i] + beta_mat[i]; - } - -#pragma omp parallel for schedule(guided) - for (int i = 0; i < total; i += c) - { - for (int j = 0; j < c; j++) - { - out_ptr[i + j] = in_ptr[i + j] * a[j] + b[j]; - } - } -} - -void ncnn_instance_norm_with_relu_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, - int channels, float eps, float /*slope*/) -{ - int w = in_mat.w; - int h = in_mat.h; - int size = w * h; -#ifdef __ARM_NEON - int nn = size >> 2; - int left4 = size & 3; -#endif -#pragma omp parallel for - for (int q = 0; q < channels; q++) - { -#ifdef __ARM_NEON - float *in_ptr = in_mat.channel(q); - float 
*out_ptr = out_mat.channel(q); - float32x4_t _sum = vdupq_n_f32(0.f); - float32x4_t _sq_sum = vdupq_n_f32(0.f); - for (int n = nn; n > 0; n--) - { - float32x4_t _p = vld1q_f32(in_ptr); - _sum = vaddq_f32(_sum, _p); - _p = vmulq_f32(_p, _p); - _sq_sum = vaddq_f32(_sq_sum, _p); - in_ptr += 4; - } - // float sum = - // vgetq_lane_f32(_sum,0)+vgetq_lane_f32(_sum,1)+vgetq_lane_f32(_sum,2)+vgetq_lane_f32(_sum,3); - // float sqsum = vgetq_lane_f32(_sq_sum,0)+vgetq_lane_f32(_sq_sum,1)+ - // vgetq_lane_f32(_sq_sum,2)+vgetq_lane_f32(_sq_sum,3); - float sum = vgetq_lane_f32(_sum, 0) + vgetq_lane_f32(_sum, 1); - sum += vgetq_lane_f32(_sum, 2); - sum += vgetq_lane_f32(_sum, 3); - float sqsum = vgetq_lane_f32(_sq_sum, 0) + vgetq_lane_f32(_sq_sum, 1); - sqsum += vgetq_lane_f32(_sq_sum, 2); - sqsum += vgetq_lane_f32(_sq_sum, 3); - for (int left = left4; left > 0; left--) - { - sum += *in_ptr; - sqsum += (*in_ptr) * (*in_ptr); - in_ptr++; - } - - float mean = sum / size; - float var = sqsum / size - mean * mean; - float gamma = gamma_mat[q]; - float beta = beta_mat[q]; - float a = gamma / (sqrt(var + eps)); - float b = -mean * a + beta; - // TODO:slop is not used here , only for RELU which slop is always = 0; - in_ptr = in_mat.channel(q); - float32x4_t _a = vdupq_n_f32(a); - float32x4_t _b = vdupq_n_f32(b); - float32x4_t _zero = vdupq_n_f32(0.f); - for (int n = nn; n > 0; n--) - { - float32x4_t _p = vld1q_f32(in_ptr); - _p = vmulq_f32(_p, _a); - _p = vaddq_f32(_p, _b); - _p = vmaxq_f32(_p, _zero); - vst1q_f32(out_ptr, _p); - in_ptr += 4; - out_ptr += 4; - } - for (int left = left4; left > 0; left--) - { - int temp = (*in_ptr) * a + b; - *out_ptr = temp > 0 ? 
temp : 0; - in_ptr++; - out_ptr++; - } -#else - float *in_ptr = in_mat.channel(q); - float *out_ptr = out_mat.channel(q); - - // mean and var - float sum = 0.f; - float sqsum = 0.f; - for (int i = 0; i < size; i++) - { - sum += in_ptr[i]; - sqsum += in_ptr[i] * in_ptr[i]; - } - float mean = sum / size; - float var = sqsum / size - mean * mean; - - float gamma = gamma_mat[q]; - float beta = beta_mat[q]; - - float a = gamma / (sqrt(var + eps)); - float b = -mean * a + beta; - - if (slope == 0.f) - { - for (int i = 0; i < size; i++) - { - float temp = in_ptr[i] * a + b; - out_ptr[i] = temp > 0 ? temp : 0; - } - } - else - { - for (int i = 0; i < size; i++) - { - float temp = in_ptr[i] * a + b; - out_ptr[i] = temp > 0 ? temp : temp * slope; - } - } -#endif - } -} - -void ncnn_instance_norm_with_relu_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, - int /*channels*/, float eps, float slope) -{ - // Treat CHW layout as HWC layout - int h = in_mat.c; - int w = in_mat.h; - int c = in_mat.w; - - int size = w * h; - int total = size * c; - - float sum[c] = {}; - float sqsum[c] = {}; - - float mean[c] = {}; - float var[c] = {}; - float a[c] = {}; - float b[c] = {}; - - float *in_ptr = in_mat.channel(0); - float *out_ptr = out_mat.channel(0); - -#pragma omp parallel for reduction(+ : sum, sqsum) schedule(guided) - for (int i = 0; i < total; i += c) - { - for (int j = 0; j < c; j++) - { - sum[j] += in_ptr[i + j]; - sqsum[j] += in_ptr[i + j] * in_ptr[i + j]; - } - } - - for (int i = 0; i < c; i++) - { - mean[i] = sum[i] / size; - var[i] = sqsum[i] / size - mean[i] * mean[i]; - a[i] = gamma_mat[i] / (sqrt(var[i] + eps)); - b[i] = -mean[i] * a[i] + beta_mat[i]; - } - - if (slope == 0.f) - { -#pragma omp parallel for schedule(guided) - for (int i = 0; i < total; i += c) - { - for (int j = 0; j < c; j++) - { - float temp = in_ptr[i + j] * a[j] + b[j]; - out_ptr[i + j] = temp > 0 ? 
temp : 0; - } - } - } - else - { -#pragma omp parallel for schedule(guided) - for (int i = 0; i < total; i += c) - { - for (int j = 0; j < c; j++) - { - float temp = in_ptr[i + j] * a[j] + b[j]; - out_ptr[i + j] = temp > 0 ? temp : temp * slope; - } - } - } -} - -} // namespace ncnn - -} // namespace nnfw |