diff options
author | Lu Fang <lufang@fb.com> | 2019-02-22 11:15:11 -0800 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-02-22 11:23:59 -0800 |
commit | 0c24f3754ba1108e51fb0366ccb70f8c18da6397 (patch) | |
tree | e41ded829a9d62e3279b01eb785336f4d77828f1 /caffe2 | |
parent | 60de0b885f031b4e30f9c068932137148e29744e (diff) | |
download | pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.gz pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.bz2 pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.zip |
Revert D14181620: [caffe2/int8] optimize max pool 2d
Differential Revision:
D14181620
Original commit changeset: ffc6c4412bd1
fbshipit-source-id: 4391703164a672c9a8daecb24a46578765df67c6
Diffstat (limited to 'caffe2')
-rw-r--r-- | caffe2/quantization/server/pool_dnnlowp_op.cc | 66 | ||||
-rw-r--r-- | caffe2/quantization/server/pool_dnnlowp_op_avx2.cc | 70 | ||||
-rw-r--r-- | caffe2/quantization/server/pool_dnnlowp_op_avx2.h | 26 |
3 files changed, 21 insertions, 141 deletions
diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc index 0dda848bed..bbf6026113 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op.cc +++ b/caffe2/quantization/server/pool_dnnlowp_op.cc @@ -3,7 +3,6 @@ #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" #include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h" #include "caffe2/quantization/server/op_wrapper.h" -#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h" #include "caffe2/utils/eigen_utils.h" namespace caffe2 { @@ -583,55 +582,32 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> { } break; case 2: - if (is_same<T, uint8_t>::value) { #ifdef _OPENMP #pragma omp parallel for #endif - for (int n = 0; n < X.dim32(0); ++n) { - max_pool_avx2( - reinterpret_cast<const uint8_t*>(Xdata), - n, - height, - width, - channels, - pooled_height, - pooled_width, - kernel_h(), - kernel_w(), - stride_h(), - stride_w(), - pad_t(), - pad_l(), - reinterpret_cast<uint8_t*>(Ydata)); - } - } else { -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int n = 0; n < X.dim32(0); ++n) { - const T* Xdata_temp = Xdata + n * height * width * channels; - T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; - for (int ph = 0; ph < pooled_height; ++ph) { - int hstart = ph * stride_h() - pad_t(); - int hend = min(hstart + kernel_h(), height); - hstart = max(hstart, 0); - for (int pw = 0; pw < pooled_width; ++pw) { - int wstart = pw * stride_w() - pad_l(); - int wend = min(wstart + kernel_w(), width); - wstart = max(wstart, 0); - int size = (hend - hstart) * (wend - wstart); - for (int c = 0; c < channels; ++c) { - T Yh = MaxPool<T>::initialize(); - const int pool_idx = (ph * pooled_width + pw) * channels + c; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int input_idx = (h * width + w) * channels + c; - MaxPool<T>::process(Xdata_temp[input_idx], Yh); - } + for (int n = 0; n < X.dim32(0); ++n) { + const T* Xdata_temp = Xdata + n * height * width * channels; + T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; + for (int ph = 0; ph < pooled_height; ++ph) { + int hstart = ph * stride_h() - pad_t(); + int hend = min(hstart + kernel_h(), height); + hstart = max(hstart, 0); + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w() - pad_l(); + int wend = min(wstart + kernel_w(), width); + wstart = max(wstart, 0); + int size = (hend - hstart) * (wend - wstart); + for (int c = 0; c < channels; ++c) { + T Yh = MaxPool<T>::initialize(); + const int pool_idx = (ph * pooled_width + pw) * channels + c; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int input_idx = (h * width + w) * channels + c; + MaxPool<T>::process(Xdata_temp[input_idx], Yh); } - MaxPool<T>::finalize(size, Yh); - Ydata_temp[pool_idx] = Yh; } + MaxPool<T>::finalize(size, Yh); + Ydata_temp[pool_idx] = Yh; } } } diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc deleted file mode 100644 index 92d0816f57..0000000000 --- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h" - -#include <immintrin.h> -#include <cmath> - -namespace caffe2 { - -using namespace std; - -void max_pool_avx2( - const uint8_t* Xdata, - int n, - int height, - int width, - int channels, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_t, - int pad_l, - uint8_t* Ydata) { - const uint8_t* Xdata_temp = Xdata + n * height * width * channels; - uint8_t* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels; - for (int ph = 0; ph < pooled_height; ++ph) { - int hstart = ph * stride_h - pad_t; - int hend = hstart + kernel_h < height ? hstart + kernel_h : height; - hstart = hstart > 0 ? hstart : 0; - for (int pw = 0; pw < pooled_width; ++pw) { - int wstart = pw * stride_w - pad_l; - int wend = wstart + kernel_w < width ? wstart + kernel_w : width; - wstart = wstart > 0 ? wstart : 0; - - uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels; - constexpr int VLEN = 8; - // vectorized loop - for (int c = 0; c < channels / VLEN * VLEN; c += VLEN) { - __m256i Y_v = _mm256_setzero_si256(); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int input_idx = (h * width + w) * channels + c; - Y_v = _mm256_max_epu8( - _mm256_loadu_si256( - reinterpret_cast<const __m256i*>(Xdata_temp + input_idx)), - Y_v); - } - } - _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v); - } - - // remainder - for (int c = channels / VLEN * VLEN; c < channels; ++c) { - Yh[c] = 0; - } - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - for (int c = channels / VLEN * VLEN; c < channels; ++c) { - const int input_idx = (h * width + w) * channels + c; - Yh[c] = - Xdata_temp[input_idx] > Yh[c] ? Xdata_temp[input_idx] : Yh[c]; - } - } - } - } // pw loop - } // ph loop -} - -} // namespace caffe2 diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h deleted file mode 100644 index abb057319b..0000000000 --- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include <cstdint> - -namespace caffe2 { - -/** - * Optimized using AVX2 intrinsics for max pool 2D in NHWC layout - */ -void max_pool_avx2( - const std::uint8_t* Xdata, - int n, - int height, - int width, - int channels, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_t, - int pad_l, - std::uint8_t* Ydata); - -} // namespace caffe2 |