From 9e1805d38ef9eaae2c56e85c67704105b33f97e5 Mon Sep 17 00:00:00 2001
From: Marat Dukhan
Date: Mon, 26 Nov 2018 17:41:13 -0800
Subject: Switch Int8ChannelShuffle operator to QNNPACK (#14362)

Summary:
1.8-2.2X better performance on ARM devices

Pull Request resolved: https://github.com/pytorch/pytorch/pull/14362

Reviewed By: jerryzh168

Differential Revision: D13192312

Pulled By: Maratyszcza

fbshipit-source-id: 0d3dff067e300c7d741c42615b61246cbf09a829
---
 .../operators/quantized/int8_channel_shuffle_op.h | 170 +++++++--------------
 third_party/QNNPACK                               |   2 +-
 2 files changed, 54 insertions(+), 118 deletions(-)

diff --git a/caffe2/operators/quantized/int8_channel_shuffle_op.h b/caffe2/operators/quantized/int8_channel_shuffle_op.h
index cdd2a404a9..beef35eece 100644
--- a/caffe2/operators/quantized/int8_channel_shuffle_op.h
+++ b/caffe2/operators/quantized/int8_channel_shuffle_op.h
@@ -1,121 +1,18 @@
 #ifndef CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
 #define CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
 
+#include <qnnpack.h>
+
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor_int8.h"
 #include "caffe2/operators/conv_pool_op_base.h"
-#include "caffe2/operators/quantized/int8_simd.h"
 #include "caffe2/operators/quantized/int8_utils.h"
 
 namespace caffe2 {
 
 namespace int8 {
 
-namespace {
-
-template <size_t TileSizeK, size_t TileSizeG>
-inline void
-TransposeTile(const uint8_t* X_tile, size_t K, size_t G, uint8_t* Y_tile) {
-#ifdef INT8_NEON_SIMD
-  static_assert(TileSizeK == 8, "");
-  static_assert(TileSizeG == 4, "");
-  auto Transpose8x4_NEON =
-      [](uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3) {
-        const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
-        const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
-        const uint16x4x2_t c0 = vtrn_u16(
-            vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
-        const uint16x4x2_t c1 = vtrn_u16(
-            vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
-        *a0 = vreinterpret_u8_u16(c0.val[0]);
-        *a1 = vreinterpret_u8_u16(c1.val[0]);
-        *a2 = vreinterpret_u8_u16(c0.val[1]);
-        *a3 = vreinterpret_u8_u16(c1.val[1]);
-      };
-
-  uint8x8_t g0 = vld1_u8(X_tile + 0 * K);
-  uint8x8_t g1 = vld1_u8(X_tile + 1 * K);
-  uint8x8_t g2 = vld1_u8(X_tile + 2 * K);
-  uint8x8_t g3 = vld1_u8(X_tile + 3 * K);
-  Transpose8x4_NEON(&g0, &g1, &g2, &g3);
-  uint8_t tile[TileSizeK / 2][2][TileSizeG];
-  vst1_u8(&tile[0][0][0], g0);
-  vst1_u8(&tile[1][0][0], g1);
-  vst1_u8(&tile[2][0][0], g2);
-  vst1_u8(&tile[3][0][0], g3);
-  for (auto kkk = 0; kkk < 2; ++kkk) {
-    for (auto kk = 0; kk < TileSizeK / 2; ++kk) {
-      const auto k = TileSizeK / 2 * kkk + kk;
-      for (auto g = 0; g < TileSizeG; ++g) {
-        Y_tile[k * G + g] = tile[kk][kkk][g];
-      }
-    }
-  }
-#else
-  uint8_t tile[TileSizeG][TileSizeK];
-  for (auto g = 0; g < TileSizeG; ++g) {
-    for (auto k = 0; k < TileSizeK; ++k) {
-      tile[g][k] = X_tile[g * K + k];
-    }
-  }
-  for (auto k = 0; k < TileSizeK; ++k) {
-    for (auto g = 0; g < TileSizeG; ++g) {
-      Y_tile[k * G + g] = tile[g][k];
-    }
-  }
-#endif
-}
-
-void Int8ChannelShuffle(
-    const uint8_t* X_data,
-    size_t B,
-    size_t K,
-    size_t G,
-    uint8_t* Y_data,
-    ThreadPool* threadPool) {
-  auto divRoundUp = [](size_t n, size_t d) { return (n + d - 1) / d; };
-  constexpr size_t kTileSizeG = 4;
-  constexpr size_t kTileSizeK = 8;
-  auto f = [&](int, size_t b) {
-    for (auto kk = 0; kk < divRoundUp(K, kTileSizeK); ++kk) {
-      for (auto gg = 0; gg < divRoundUp(G, kTileSizeG); ++gg) {
-        const auto g = gg * kTileSizeG;
-        const auto k = kk * kTileSizeK;
-        const auto X_tile = X_data + b * G * K + g * K + k;
-        auto* Y_tile = Y_data + b * G * K + k * G + g;
-        if (kk * kTileSizeK + kTileSizeK <= K &&
-            gg * kTileSizeG + kTileSizeG <= G) {
-          // Complete tile.
-          TransposeTile<kTileSizeK, kTileSizeG>(X_tile, K, G, Y_tile);
-        } else {
-          uint8_t Xp[kTileSizeG][kTileSizeK];
-          uint8_t Yp[kTileSizeK][kTileSizeG];
-          for (auto kt = 0; kt < kTileSizeK; ++kt) {
-            for (auto gt = 0; gt < kTileSizeG; ++gt) {
-              if (k + kt < K && g + gt < G) {
-                Xp[gt][kt] = X_tile[gt * K + kt];
-              }
-            }
-          }
-          TransposeTile<kTileSizeK, kTileSizeG>(
-              &Xp[0][0], kTileSizeK, kTileSizeG, &Yp[0][0]);
-          for (auto kt = 0; kt < kTileSizeK; ++kt) {
-            for (auto gt = 0; gt < kTileSizeG; ++gt) {
-              if (k + kt < K && g + gt < G) {
-                Y_tile[kt * G + gt] = Yp[kt][gt];
-              }
-            }
-          }
-        }
-      }
-    }
-  };
-  threadPool->run(f, B);
-}
-
-} // namespace
-
 class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
  public:
   Int8ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws)
@@ -126,36 +23,75 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
         "Int8ChannelShuffleOp only supports NHWC order");
   }
 
+  ~Int8ChannelShuffleOp() {
+    if (this->qnnpackOperator_ != nullptr) {
+      qnnp_delete_operator(this->qnnpackOperator_);
+      this->qnnpackOperator_ = nullptr;
+    }
+  }
+
   bool RunOnDeviceWithOrderNHWC() override {
     const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
     auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
     Y->t.ResizeLike(X.t);
     Y->scale = X.scale;
    Y->zero_point = X.zero_point;
-    int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
-    auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
+    const int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
+    const float Y_scale = this->template GetSingleArgument<float>("Y_scale", 1.0f);
     CHECK_EQ(Y_offset, X.zero_point);
     CHECK_EQ(Y_scale, X.scale);
     CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min());
     CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max());
 
     const auto C = X.t.dim32(3);
-    CAFFE_ENFORCE(C % this->group_ == 0, "");
     const auto G = this->group_;
-    const auto K = C / G;
-    const auto B = X.t.dim32(0) * X.t.dim32(1) * X.t.dim32(2);
-    Int8ChannelShuffle(
-        X.t.data<uint8_t>(),
-        B,
-        K,
-        G,
-        Y->t.mutable_data<uint8_t>(),
-        ws_->GetThreadPool());
+    CAFFE_ENFORCE(C % G == 0, "");
+    const auto B = X.t.numel() / C;
+
+    initQNNPACK();
+
+    if (this->qnnpackOperator_ == nullptr) {
+      const qnnp_status createStatus = qnnp_create_channel_shuffle_nc_x8(
+          G /* groups */,
+          C / G /* group channels */,
+          &this->qnnpackOperator_);
+      CAFFE_ENFORCE(
+          createStatus == qnnp_status_success,
+          "failed to create QNNPACK channel shuffle operator");
+      CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
+    }
+
+    const qnnp_status setupStatus = qnnp_setup_channel_shuffle_nc_x8(
+        this->qnnpackOperator_,
+        X.t.numel() / C /* batch size */,
+        X.t.template data<uint8_t>(),
+        C /* X stride */,
+        Y->t.template mutable_data<uint8_t>(),
+        C /* Y stride */);
+    CAFFE_ENFORCE(
+        setupStatus == qnnp_status_success,
+        "failed to setup QNNPACK channel shuffle operator");
+
+#ifdef FBCODE_CAFFE2
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+    pthreadpool_t threadpool =
+        reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+    CAFFE_ENFORCE(
+        runStatus == qnnp_status_success,
+        "failed to run QNNPACK channel shuffle operator");
+
     return true;
   }
 
  private:
   Workspace* ws_;
+  // QNNPACK channel shuffle operator
+  qnnp_operator_t qnnpackOperator_{nullptr};
 };
 
 } // namespace int8
diff --git a/third_party/QNNPACK b/third_party/QNNPACK
index 8bb459126a..cf768177ba 160000
--- a/third_party/QNNPACK
+++ b/third_party/QNNPACK
@@ -1 +1 @@
-Subproject commit 8bb459126ad44eef11c9b0fbc5d5260ebfe177e4
+Subproject commit cf768177baa93c8c984817cb27080dac42661fea
--
cgit v1.2.3
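
Note (addendum, not part of the patch): below is a minimal standalone sketch of the
QNNPACK call sequence the patched operator now drives (initialize -> create -> setup
-> run -> delete). The qnnp_* functions and their argument lists are used exactly as
they appear in the diff above, against the QNNPACK revision this commit pins; the
tensor dimensions, buffer names, and the main() wrapper are illustrative assumptions.

#include <qnnpack.h>

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <vector>

int main() {
  // Illustrative sizes only (assumptions, not taken from the commit):
  // each NHWC pixel is a row of C = groups * group_channels uint8 values,
  // and "batch" counts N * H * W such rows.
  const std::size_t groups = 4;
  const std::size_t group_channels = 8;
  const std::size_t channels = groups * group_channels;
  const std::size_t batch = 2 * 56 * 56;

  std::vector<std::uint8_t> X(batch * channels, 0);  // quantized input rows
  std::vector<std::uint8_t> Y(batch * channels, 0);  // shuffled output rows

  // Same one-time library init the operator performs via initQNNPACK().
  if (qnnp_initialize() != qnnp_status_success) {
    return EXIT_FAILURE;  // CPU not supported by QNNPACK
  }

  // Create once and reuse, mirroring the cached qnnpackOperator_ member.
  qnnp_operator_t op = nullptr;
  if (qnnp_create_channel_shuffle_nc_x8(groups, group_channels, &op) !=
      qnnp_status_success) {
    return EXIT_FAILURE;
  }

  // Strides equal the channel count because rows are densely packed,
  // just like the C /* X stride */ and C /* Y stride */ arguments in the diff.
  if (qnnp_setup_channel_shuffle_nc_x8(
          op, batch, X.data(), channels, Y.data(), channels) !=
      qnnp_status_success) {
    return EXIT_FAILURE;
  }

  // A null thread pool runs single-threaded, matching the FBCODE_CAFFE2 branch.
  if (qnnp_run_operator(op, nullptr) != qnnp_status_success) {
    return EXIT_FAILURE;
  }

  qnnp_delete_operator(op);  // what the operator's destructor does
  return EXIT_SUCCESS;
}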