From 0c24f3754ba1108e51fb0366ccb70f8c18da6397 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Fri, 22 Feb 2019 11:15:11 -0800 Subject: Revert D14181620: [caffe2/int8] optimize max pool 2d Differential Revision: D14181620 Original commit changeset: ffc6c4412bd1 fbshipit-source-id: 4391703164a672c9a8daecb24a46578765df67c6 --- caffe2/quantization/server/pool_dnnlowp_op.cc | 66 +++++++------------- caffe2/quantization/server/pool_dnnlowp_op_avx2.cc | 70 ---------------------- caffe2/quantization/server/pool_dnnlowp_op_avx2.h | 26 -------- 3 files changed, 21 insertions(+), 141 deletions(-) delete mode 100644 caffe2/quantization/server/pool_dnnlowp_op_avx2.cc delete mode 100644 caffe2/quantization/server/pool_dnnlowp_op_avx2.h (limited to 'caffe2') diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc index 0dda848bed..bbf6026113 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op.cc +++ b/caffe2/quantization/server/pool_dnnlowp_op.cc @@ -3,7 +3,6 @@ #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" #include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h" #include "caffe2/quantization/server/op_wrapper.h" -#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h" #include "caffe2/utils/eigen_utils.h" namespace caffe2 { @@ -583,55 +582,32 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase { } break; case 2: - if (is_same::value) { #ifdef _OPENMP #pragma omp parallel for #endif - for (int n = 0; n < X.dim32(0); ++n) { - max_pool_avx2( - reinterpret_cast(Xdata), - n, - height, - width, - channels, - pooled_height, - pooled_width, - kernel_h(), - kernel_w(), - stride_h(), - stride_w(), - pad_t(), - pad_l(), - reinterpret_cast(Ydata)); - } - } else { -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int n = 0; n < X.dim32(0); ++n) { - const T* Xdata_temp = Xdata + n * height * width * channels; - T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; - for (int ph = 0; ph < pooled_height; ++ph) { - int hstart = ph * stride_h() - pad_t(); - int hend = min(hstart + kernel_h(), height); - hstart = max(hstart, 0); - for (int pw = 0; pw < pooled_width; ++pw) { - int wstart = pw * stride_w() - pad_l(); - int wend = min(wstart + kernel_w(), width); - wstart = max(wstart, 0); - int size = (hend - hstart) * (wend - wstart); - for (int c = 0; c < channels; ++c) { - T Yh = MaxPool::initialize(); - const int pool_idx = (ph * pooled_width + pw) * channels + c; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int input_idx = (h * width + w) * channels + c; - MaxPool::process(Xdata_temp[input_idx], Yh); - } + for (int n = 0; n < X.dim32(0); ++n) { + const T* Xdata_temp = Xdata + n * height * width * channels; + T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; + for (int ph = 0; ph < pooled_height; ++ph) { + int hstart = ph * stride_h() - pad_t(); + int hend = min(hstart + kernel_h(), height); + hstart = max(hstart, 0); + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w() - pad_l(); + int wend = min(wstart + kernel_w(), width); + wstart = max(wstart, 0); + int size = (hend - hstart) * (wend - wstart); + for (int c = 0; c < channels; ++c) { + T Yh = MaxPool::initialize(); + const int pool_idx = (ph * pooled_width + pw) * channels + c; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int input_idx = (h * width + w) * channels + c; + MaxPool::process(Xdata_temp[input_idx], Yh); } - MaxPool::finalize(size, Yh); - Ydata_temp[pool_idx] = Yh; } + MaxPool::finalize(size, Yh); + Ydata_temp[pool_idx] = Yh; } } } diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc deleted file mode 100644 index 92d0816f57..0000000000 --- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h" - -#include -#include - -namespace caffe2 { - -using namespace std; - -void max_pool_avx2( - const uint8_t* Xdata, - int n, - int height, - int width, - int channels, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_t, - int pad_l, - uint8_t* Ydata) { - const uint8_t* Xdata_temp = Xdata + n * height * width * channels; - uint8_t* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; - for (int ph = 0; ph < pooled_height; ++ph) { - int hstart = ph * stride_h - pad_t; - int hend = hstart + kernel_h < height ? hstart + kernel_h : height; - hstart = hstart > 0 ? hstart : 0; - for (int pw = 0; pw < pooled_width; ++pw) { - int wstart = pw * stride_w - pad_l; - int wend = wstart + kernel_w < width ? wstart + kernel_w : width; - wstart = wstart > 0 ? wstart : 0; - - uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels; - constexpr int VLEN = 8; - // vectorized loop - for (int c = 0; c < channels / VLEN * VLEN; c += VLEN) { - __m256i Y_v = _mm256_setzero_si256(); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int input_idx = (h * width + w) * channels + c; - Y_v = _mm256_max_epu8( - _mm256_loadu_si256( - reinterpret_cast(Xdata_temp + input_idx)), - Y_v); - } - } - _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v); - } - - // remainder - for (int c = channels / VLEN * VLEN; c < channels; ++c) { - Yh[c] = 0; - } - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - for (int c = channels / VLEN * VLEN; c < channels; ++c) { - const int input_idx = (h * width + w) * channels + c; - Yh[c] = - Xdata_temp[input_idx] > Yh[c] ? Xdata_temp[input_idx] : Yh[c]; - } - } - } - } // pw loop - } // ph loop -} - -} // namespace caffe2 diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h deleted file mode 100644 index abb057319b..0000000000 --- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { - -/** - * Optimized using AVX2 intrinsics for max pool 2D in NHWC layout - */ -void max_pool_avx2( - const std::uint8_t* Xdata, - int n, - int height, - int width, - int channels, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_t, - int pad_l, - std::uint8_t* Ydata); - -} // namespace caffe2 -- cgit v1.2.3