Revert D14181620: [caffe2/int8] optimize max pool 2d

Differential Revision: D14181620
Original commit changeset: ffc6c4412bd1
fbshipit-source-id: 4391703164a672c9a8daecb24a46578765df67c6
author: Lu Fang <lufang@fb.com> 2019-02-22 11:15:11 -0800
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-22 11:23:59 -0800
commit: 0c24f3754ba1108e51fb0366ccb70f8c18da6397 (patch)
tree: e41ded829a9d62e3279b01eb785336f4d77828f1 /caffe2
parent: 60de0b885f031b4e30f9c068932137148e29744e (diff)
download: pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.gz
pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.bz2
pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.zip
3 files changed, 21 insertions, 141 deletions
diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc
index 0dda848bed..bbf6026113 100644
--- a/caffe2/quantization/server/pool_dnnlowp_op.cc
+++ b/caffe2/quantization/server/pool_dnnlowp_op.cc
@@ -3,7 +3,6 @@
 #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
 #include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
 #include "caffe2/quantization/server/op_wrapper.h"
-#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
 #include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {
@@ -583,55 +582,32 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> {
         }
         break;
       case 2:
-        if (is_same<T, uint8_t>::value) {
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-          for (int n = 0; n < X.dim32(0); ++n) {
-            max_pool_avx2(
-                reinterpret_cast<const uint8_t*>(Xdata),
-                n,
-                height,
-                width,
-                channels,
-                pooled_height,
-                pooled_width,
-                kernel_h(),
-                kernel_w(),
-                stride_h(),
-                stride_w(),
-                pad_t(),
-                pad_l(),
-                reinterpret_cast<uint8_t*>(Ydata));
-          }
-        } else {
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-          for (int n = 0; n < X.dim32(0); ++n) {
-            const T* Xdata_temp = Xdata + n * height * width * channels;
-            T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
-            for (int ph = 0; ph < pooled_height; ++ph) {
-              int hstart = ph * stride_h() - pad_t();
-              int hend = min(hstart + kernel_h(), height);
-              hstart = max(hstart, 0);
-              for (int pw = 0; pw < pooled_width; ++pw) {
-                int wstart = pw * stride_w() - pad_l();
-                int wend = min(wstart + kernel_w(), width);
-                wstart = max(wstart, 0);
-                int size = (hend - hstart) * (wend - wstart);
-                for (int c = 0; c < channels; ++c) {
-                  T Yh = MaxPool<T>::initialize();
-                  const int pool_idx = (ph * pooled_width + pw) * channels + c;
-                  for (int h = hstart; h < hend; ++h) {
-                    for (int w = wstart; w < wend; ++w) {
-                      const int input_idx = (h * width + w) * channels + c;
-                      MaxPool<T>::process(Xdata_temp[input_idx], Yh);
-                    }
+        for (int n = 0; n < X.dim32(0); ++n) {
+          const T* Xdata_temp = Xdata + n * height * width * channels;
+          T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
+          for (int ph = 0; ph < pooled_height; ++ph) {
+            int hstart = ph * stride_h() - pad_t();
+            int hend = min(hstart + kernel_h(), height);
+            hstart = max(hstart, 0);
+            for (int pw = 0; pw < pooled_width; ++pw) {
+              int wstart = pw * stride_w() - pad_l();
+              int wend = min(wstart + kernel_w(), width);
+              wstart = max(wstart, 0);
+              int size = (hend - hstart) * (wend - wstart);
+              for (int c = 0; c < channels; ++c) {
+                T Yh = MaxPool<T>::initialize();
+                const int pool_idx = (ph * pooled_width + pw) * channels + c;
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    const int input_idx = (h * width + w) * channels + c;
+                    MaxPool<T>::process(Xdata_temp[input_idx], Yh);
                   }
-                  MaxPool<T>::finalize(size, Yh);
-                  Ydata_temp[pool_idx] = Yh;
                 }
+                MaxPool<T>::finalize(size, Yh);
+                Ydata_temp[pool_idx] = Yh;
               }
             }
           }
diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
deleted file mode 100644
index 92d0816f57..0000000000
--- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
-
-#include <immintrin.h>
-#include <cmath>
-
-namespace caffe2 {
-
-using namespace std;
-
-void max_pool_avx2(
-    const uint8_t* Xdata,
-    int n,
-    int height,
-    int width,
-    int channels,
-    int pooled_height,
-    int pooled_width,
-    int kernel_h,
-    int kernel_w,
-    int stride_h,
-    int stride_w,
-    int pad_t,
-    int pad_l,
-    uint8_t* Ydata) {
-  const uint8_t* Xdata_temp = Xdata + n * height * width * channels;
-  uint8_t* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
-  for (int ph = 0; ph < pooled_height; ++ph) {
-    int hstart = ph * stride_h - pad_t;
-    int hend = hstart + kernel_h < height ? hstart + kernel_h : height;
-    hstart = hstart > 0 ? hstart : 0;
-    for (int pw = 0; pw < pooled_width; ++pw) {
-      int wstart = pw * stride_w - pad_l;
-      int wend = wstart + kernel_w < width ? wstart + kernel_w : width;
-      wstart = wstart > 0 ? wstart : 0;
-
-      uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels;
-      constexpr int VLEN = 8;
-      // vectorized loop
-      for (int c = 0; c < channels / VLEN * VLEN; c += VLEN) {
-        __m256i Y_v = _mm256_setzero_si256();
-        for (int h = hstart; h < hend; ++h) {
-          for (int w = wstart; w < wend; ++w) {
-            const int input_idx = (h * width + w) * channels + c;
-            Y_v = _mm256_max_epu8(
-                _mm256_loadu_si256(
-                    reinterpret_cast<const __m256i*>(Xdata_temp + input_idx)),
-                Y_v);
-          }
-        }
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v);
-      }
-
-      // remainder
-      for (int c = channels / VLEN * VLEN; c < channels; ++c) {
-        Yh[c] = 0;
-      }
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          for (int c = channels / VLEN * VLEN; c < channels; ++c) {
-            const int input_idx = (h * width + w) * channels + c;
-            Yh[c] =
-                Xdata_temp[input_idx] > Yh[c] ? Xdata_temp[input_idx] : Yh[c];
-          }
-        }
-      }
-    } // pw loop
-  } // ph loop
-}
-
-} // namespace caffe2
diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h b/caffe2/quantization/server/pool_dnnlowp_op_avx2.h
deleted file mode 100644
index abb057319b..0000000000
--- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-#include <cstdint>
-
-namespace caffe2 {
-
-/**
- * Optimized using AVX2 intrinsics for max pool 2D in NHWC layout
- */
-void max_pool_avx2(
-    const std::uint8_t* Xdata,
-    int n,
-    int height,
-    int width,
-    int channels,
-    int pooled_height,
-    int pooled_width,
-    int kernel_h,
-    int kernel_w,
-    int stride_h,
-    int stride_w,
-    int pad_t,
-    int pad_l,
-    std::uint8_t* Ydata);
-
-} // namespace caffe2
author	Lu Fang <lufang@fb.com>	2019-02-22 11:15:11 -0800
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-02-22 11:23:59 -0800
commit	0c24f3754ba1108e51fb0366ccb70f8c18da6397 (patch)
tree	e41ded829a9d62e3279b01eb785336f4d77828f1 /caffe2
parent	60de0b885f031b4e30f9c068932137148e29744e (diff)
download	pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.gz pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.tar.bz2 pytorch-0c24f3754ba1108e51fb0366ccb70f8c18da6397.zip