summary | refs | log | tree | commit | diff
path: root/caffe2
diff options
context:
space:
mode:
author Lu Fang <lufang@fb.com> 2019-02-22 11:15:11 -0800
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-22 11:23:59 -0800
commit 0c24f3754ba1108e51fb0366ccb70f8c18da6397 (patch)
tree e41ded829a9d62e3279b01eb785336f4d77828f1 /caffe2
parent 60de0b885f031b4e30f9c068932137148e29744e (diff)
download pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.gz
pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.bz2
pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.zip
Revert D14181620: [caffe2/int8] optimize max pool 2d
Differential Revision: D14181620
Original commit changeset: ffc6c4412bd1
fbshipit-source-id: 4391703164a672c9a8daecb24a46578765df67c6
Diffstat (limited to 'caffe2')
-rw-r--r-- caffe2/quantization/server/pool_dnnlowp_op.cc 66
-rw-r--r-- caffe2/quantization/server/pool_dnnlowp_op_avx2.cc 70
-rw-r--r-- caffe2/quantization/server/pool_dnnlowp_op_avx2.h 26
3 files changed, 21 insertions, 141 deletions
diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc
index 0dda848bed..bbf6026113 100644
--- a/caffe2/quantization/server/pool_dnnlowp_op.cc
+++ b/caffe2/quantization/server/pool_dnnlowp_op.cc
@@ -3,7 +3,6 @@
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
#include "caffe2/quantization/server/op_wrapper.h"
-#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
@@ -583,55 +582,32 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> {
}
break;
case 2:
- if (is_same<T, uint8_t>::value) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
- for (int n = 0; n < X.dim32(0); ++n) {
- max_pool_avx2(
- reinterpret_cast<const uint8_t*>(Xdata),
- n,
- height,
- width,
- channels,
- pooled_height,
- pooled_width,
- kernel_h(),
- kernel_w(),
- stride_h(),
- stride_w(),
- pad_t(),
- pad_l(),
- reinterpret_cast<uint8_t*>(Ydata));
- }
- } else {
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
- for (int n = 0; n < X.dim32(0); ++n) {
- const T* Xdata_temp = Xdata + n * height * width * channels;
- T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
- for (int ph = 0; ph < pooled_height; ++ph) {
- int hstart = ph * stride_h() - pad_t();
- int hend = min(hstart + kernel_h(), height);
- hstart = max(hstart, 0);
- for (int pw = 0; pw < pooled_width; ++pw) {
- int wstart = pw * stride_w() - pad_l();
- int wend = min(wstart + kernel_w(), width);
- wstart = max(wstart, 0);
- int size = (hend - hstart) * (wend - wstart);
- for (int c = 0; c < channels; ++c) {
- T Yh = MaxPool<T>::initialize();
- const int pool_idx = (ph * pooled_width + pw) * channels + c;
- for (int h = hstart; h < hend; ++h) {
- for (int w = wstart; w < wend; ++w) {
- const int input_idx = (h * width + w) * channels + c;
- MaxPool<T>::process(Xdata_temp[input_idx], Yh);
- }
+ for (int n = 0; n < X.dim32(0); ++n) {
+ const T* Xdata_temp = Xdata + n * height * width * channels;
+ T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
+ for (int ph = 0; ph < pooled_height; ++ph) {
+ int hstart = ph * stride_h() - pad_t();
+ int hend = min(hstart + kernel_h(), height);
+ hstart = max(hstart, 0);
+ for (int pw = 0; pw < pooled_width; ++pw) {
+ int wstart = pw * stride_w() - pad_l();
+ int wend = min(wstart + kernel_w(), width);
+ wstart = max(wstart, 0);
+ int size = (hend - hstart) * (wend - wstart);
+ for (int c = 0; c < channels; ++c) {
+ T Yh = MaxPool<T>::initialize();
+ const int pool_idx = (ph * pooled_width + pw) * channels + c;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ const int input_idx = (h * width + w) * channels + c;
+ MaxPool<T>::process(Xdata_temp[input_idx], Yh);
}
- MaxPool<T>::finalize(size, Yh);
- Ydata_temp[pool_idx] = Yh;
}
+ MaxPool<T>::finalize(size, Yh);
+ Ydata_temp[pool_idx] = Yh;
}
}
}
diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
deleted file mode 100644
index 92d0816f57..0000000000
--- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
-
-#include <immintrin.h>
-#include <cmath>
-
-namespace caffe2 {
-
-using namespace std;
-
-void max_pool_avx2(
- const uint8_t* Xdata,
- int n,
- int height,
- int width,
- int channels,
- int pooled_height,
- int pooled_width,
- int kernel_h,
- int kernel_w,
- int stride_h,
- int stride_w,
- int pad_t,
- int pad_l,
- uint8_t* Ydata) {
- const uint8_t* Xdata_temp = Xdata + n * height * width * channels;
- uint8_t* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
- for (int ph = 0; ph < pooled_height; ++ph) {
- int hstart = ph * stride_h - pad_t;
- int hend = hstart + kernel_h < height ? hstart + kernel_h : height;
- hstart = hstart > 0 ? hstart : 0;
- for (int pw = 0; pw < pooled_width; ++pw) {
- int wstart = pw * stride_w - pad_l;
- int wend = wstart + kernel_w < width ? wstart + kernel_w : width;
- wstart = wstart > 0 ? wstart : 0;
-
- uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels;
- constexpr int VLEN = 8;
- // vectorized loop
- for (int c = 0; c < channels / VLEN * VLEN; c += VLEN) {
- __m256i Y_v = _mm256_setzero_si256();
- for (int h = hstart; h < hend; ++h) {
- for (int w = wstart; w < wend; ++w) {
- const int input_idx = (h * width + w) * channels + c;
- Y_v = _mm256_max_epu8(
- _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(Xdata_temp + input_idx)),
- Y_v);
- }
- }
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v);
- }
-
- // remainder
- for (int c = channels / VLEN * VLEN; c < channels; ++c) {
- Yh[c] = 0;
- }
- for (int h = hstart; h < hend; ++h) {
- for (int w = wstart; w < wend; ++w) {
- for (int c = channels / VLEN * VLEN; c < channels; ++c) {
- const int input_idx = (h * width + w) * channels + c;
- Yh[c] =
- Xdata_temp[input_idx] > Yh[c] ? Xdata_temp[input_idx] : Yh[c];
- }
- }
- }
- } // pw loop
- } // ph loop
-}
-
-} // namespace caffe2
diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h
deleted file mode 100644
index abb057319b..0000000000
--- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-#include <cstdint>
-
-namespace caffe2 {
-
-/**
- * Optimized using AVX2 intrinsics for max pool 2D in NHWC layout
- */
-void max_pool_avx2(
- const std::uint8_t* Xdata,
- int n,
- int height,
- int width,
- int channels,
- int pooled_height,
- int pooled_width,
- int kernel_h,
- int kernel_w,
- int stride_h,
- int stride_w,
- int pad_t,
- int pad_l,
- std::uint8_t* Ydata);
-
-} // namespace caffe2