author     Marat Dukhan <marat@fb.com>    2018-11-26 17:41:13 -0800
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2018-11-26 17:43:32 -0800
commit     9e1805d38ef9eaae2c56e85c67704105b33f97e5 (patch)
tree       2ac363f3edf2bae93d04fa220e144d4a40571e11
parent     2d6f039766193623dbf17c46bfcd17bb8c8bdb32 (diff)
Switch Int8ChannelShuffle operator to QNNPACK (#14362)
Summary: 1.8-2.2X better performance on ARM devices

Pull Request resolved: https://github.com/pytorch/pytorch/pull/14362
Reviewed By: jerryzh168
Differential Revision: D13192312
Pulled By: Maratyszcza
fbshipit-source-id: 0d3dff067e300c7d741c42615b61246cbf09a829
-rw-r--r--  caffe2/operators/quantized/int8_channel_shuffle_op.h  170
m---------  third_party/QNNPACK  0
2 files changed, 53 insertions, 117 deletions
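For context: channel shuffle on an NHWC tensor treats the C = G*K channels of each pixel as a G-by-K matrix and transposes it to K-by-G, i.e. Y[k*G + g] = X[g*K + k]. The removed code below implements exactly this with 8x4 tiles and a NEON fast path; the new code delegates it to QNNPACK. A minimal scalar reference of the semantics (a sketch for illustration only; the function name is hypothetical and not part of this commit):

#include <cstddef>
#include <cstdint>

// Reference (unoptimized) NHWC channel shuffle: for each of the B = N*H*W
// pixels, transpose the G x K channel matrix to K x G, so the value at
// channel (g, k) moves to position (k, g).
void ChannelShuffleNHWCReference(
    const std::uint8_t* X, // B x (G*K) input
    std::size_t B,
    std::size_t G,
    std::size_t K,
    std::uint8_t* Y) { // B x (K*G) output
  for (std::size_t b = 0; b < B; ++b) {
    const std::uint8_t* x = X + b * G * K;
    std::uint8_t* y = Y + b * G * K;
    for (std::size_t g = 0; g < G; ++g) {
      for (std::size_t k = 0; k < K; ++k) {
        y[k * G + g] = x[g * K + k];
      }
    }
  }
}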
diff --git a/caffe2/operators/quantized/int8_channel_shuffle_op.h b/caffe2/operators/quantized/int8_channel_shuffle_op.h
index cdd2a404a9..beef35eece 100644
--- a/caffe2/operators/quantized/int8_channel_shuffle_op.h
+++ b/caffe2/operators/quantized/int8_channel_shuffle_op.h
@@ -1,121 +1,18 @@
#ifndef CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
#define CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
+#include <qnnpack.h>
+
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/conv_pool_op_base.h"
-#include "caffe2/operators/quantized/int8_simd.h"
#include "caffe2/operators/quantized/int8_utils.h"
namespace caffe2 {
namespace int8 {
-namespace {
-
-template <size_t TileSizeK, size_t TileSizeG>
-inline void
-TransposeTile(const uint8_t* X_tile, size_t K, size_t G, uint8_t* Y_tile) {
-#ifdef INT8_NEON_SIMD
- static_assert(TileSizeK == 8, "");
- static_assert(TileSizeG == 4, "");
- auto Transpose8x4_NEON =
- [](uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3) {
- const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
- const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
- const uint16x4x2_t c0 = vtrn_u16(
- vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
- const uint16x4x2_t c1 = vtrn_u16(
- vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
- *a0 = vreinterpret_u8_u16(c0.val[0]);
- *a1 = vreinterpret_u8_u16(c1.val[0]);
- *a2 = vreinterpret_u8_u16(c0.val[1]);
- *a3 = vreinterpret_u8_u16(c1.val[1]);
- };
-
- uint8x8_t g0 = vld1_u8(X_tile + 0 * K);
- uint8x8_t g1 = vld1_u8(X_tile + 1 * K);
- uint8x8_t g2 = vld1_u8(X_tile + 2 * K);
- uint8x8_t g3 = vld1_u8(X_tile + 3 * K);
- Transpose8x4_NEON(&g0, &g1, &g2, &g3);
- uint8_t tile[TileSizeK / 2][2][TileSizeG];
- vst1_u8(&tile[0][0][0], g0);
- vst1_u8(&tile[1][0][0], g1);
- vst1_u8(&tile[2][0][0], g2);
- vst1_u8(&tile[3][0][0], g3);
- for (auto kkk = 0; kkk < 2; ++kkk) {
- for (auto kk = 0; kk < TileSizeK / 2; ++kk) {
- const auto k = TileSizeK / 2 * kkk + kk;
- for (auto g = 0; g < TileSizeG; ++g) {
- Y_tile[k * G + g] = tile[kk][kkk][g];
- }
- }
- }
-#else
- uint8_t tile[TileSizeG][TileSizeK];
- for (auto g = 0; g < TileSizeG; ++g) {
- for (auto k = 0; k < TileSizeK; ++k) {
- tile[g][k] = X_tile[g * K + k];
- }
- }
- for (auto k = 0; k < TileSizeK; ++k) {
- for (auto g = 0; g < TileSizeG; ++g) {
- Y_tile[k * G + g] = tile[g][k];
- }
- }
-#endif
-}
-
-void Int8ChannelShuffle(
- const uint8_t* X_data,
- size_t B,
- size_t K,
- size_t G,
- uint8_t* Y_data,
- ThreadPool* threadPool) {
- auto divRoundUp = [](size_t n, size_t d) { return (n + d - 1) / d; };
- constexpr size_t kTileSizeG = 4;
- constexpr size_t kTileSizeK = 8;
- auto f = [&](int, size_t b) {
- for (auto kk = 0; kk < divRoundUp(K, kTileSizeK); ++kk) {
- for (auto gg = 0; gg < divRoundUp(G, kTileSizeG); ++gg) {
- const auto g = gg * kTileSizeG;
- const auto k = kk * kTileSizeK;
- const auto X_tile = X_data + b * G * K + g * K + k;
- auto* Y_tile = Y_data + b * G * K + k * G + g;
- if (kk * kTileSizeK + kTileSizeK <= K &&
- gg * kTileSizeG + kTileSizeG <= G) {
- // Complete tile.
- TransposeTile<kTileSizeK, kTileSizeG>(X_tile, K, G, Y_tile);
- } else {
- uint8_t Xp[kTileSizeG][kTileSizeK];
- uint8_t Yp[kTileSizeK][kTileSizeG];
- for (auto kt = 0; kt < kTileSizeK; ++kt) {
- for (auto gt = 0; gt < kTileSizeG; ++gt) {
- if (k + kt < K && g + gt < G) {
- Xp[gt][kt] = X_tile[gt * K + kt];
- }
- }
- }
- TransposeTile<kTileSizeK, kTileSizeG>(
- &Xp[0][0], kTileSizeK, kTileSizeG, &Yp[0][0]);
- for (auto kt = 0; kt < kTileSizeK; ++kt) {
- for (auto gt = 0; gt < kTileSizeG; ++gt) {
- if (k + kt < K && g + gt < G) {
- Y_tile[kt * G + gt] = Yp[kt][gt];
- }
- }
- }
- }
- }
- }
- };
- threadPool->run(f, B);
-}
-
-} // namespace
-
class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
public:
Int8ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws)
@@ -126,36 +23,75 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
"Int8ChannelShuffleOp only supports NHWC order");
}
+ ~Int8ChannelShuffleOp() {
+ if (this->qnnpackOperator_ != nullptr) {
+ qnnp_delete_operator(this->qnnpackOperator_);
+ this->qnnpackOperator_ = nullptr;
+ }
+ }
+
bool RunOnDeviceWithOrderNHWC() override {
const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
Y->t.ResizeLike(X.t);
Y->scale = X.scale;
Y->zero_point = X.zero_point;
- int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
- auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
+ const int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
+ const float Y_scale = this->template GetSingleArgument<float>("Y_scale", 1.0f);
CHECK_EQ(Y_offset, X.zero_point);
CHECK_EQ(Y_scale, X.scale);
CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min());
CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max());
const auto C = X.t.dim32(3);
- CAFFE_ENFORCE(C % this->group_ == 0, "");
const auto G = this->group_;
- const auto K = C / G;
- const auto B = X.t.dim32(0) * X.t.dim32(1) * X.t.dim32(2);
- Int8ChannelShuffle(
- X.t.data<uint8_t>(),
- B,
- K,
- G,
- Y->t.mutable_data<uint8_t>(),
- ws_->GetThreadPool());
+ CAFFE_ENFORCE(C % G == 0, "");
+ const auto B = X.t.numel() / C;
+
+ initQNNPACK();
+
+ if (this->qnnpackOperator_ == nullptr) {
+ const qnnp_status createStatus = qnnp_create_channel_shuffle_nc_x8(
+ G /* groups */,
+ C / G /* group channels */,
+ &this->qnnpackOperator_);
+ CAFFE_ENFORCE(
+ createStatus == qnnp_status_success,
+ "failed to create QNNPACK channel shuffle operator");
+ CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
+ }
+
+ const qnnp_status setupStatus = qnnp_setup_channel_shuffle_nc_x8(
+ this->qnnpackOperator_,
+ X.t.numel() / C /* batch size */,
+ X.t.template data<uint8_t>(),
+ C /* X stride */,
+ Y->t.template mutable_data<uint8_t>(),
+ C /* Y stride */);
+ CAFFE_ENFORCE(
+ setupStatus == qnnp_status_success,
+ "failed to setup QNNPACK channel shuffle operator");
+
+#ifdef FBCODE_CAFFE2
+ const qnnp_status runStatus =
+ qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+ pthreadpool_t threadpool =
+ reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+ const qnnp_status runStatus =
+ qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+ CAFFE_ENFORCE(
+ runStatus == qnnp_status_success,
+ "failed to run QNNPACK channel shuffle operator");
+
return true;
}
private:
Workspace* ws_;
+ // QNNPACK channel shuffle operator
+ qnnp_operator_t qnnpackOperator_{nullptr};
};
} // namespace int8
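Taken together, the added code follows QNNPACK's usual create/setup/run operator lifecycle: the operator is created once and cached in qnnpackOperator_, re-setup with the current tensor pointers on every run, executed on an optional pthreadpool, and deleted in the destructor. A standalone sketch of that flow, assuming the same qnnp_* signatures used in the diff above (error handling reduced to early returns; shapes are arbitrary example values):

#include <qnnpack.h>

#include <cstdint>
#include <vector>

int main() {
  const size_t G = 4;   // groups
  const size_t K = 16;  // channels per group
  const size_t C = G * K;
  const size_t B = 128; // batch size: N*H*W pixels

  std::vector<uint8_t> X(B * C, 0), Y(B * C);

  // One-time library initialization (initQNNPACK() wraps this in Caffe2).
  if (qnnp_initialize() != qnnp_status_success) return 1;

  // Create the operator once; it can be reused across runs.
  qnnp_operator_t op = nullptr;
  if (qnnp_create_channel_shuffle_nc_x8(
          G /* groups */, K /* group channels */, &op) != qnnp_status_success)
    return 1;

  // Bind the batch size and input/output buffers for this run.
  if (qnnp_setup_channel_shuffle_nc_x8(
          op, B, X.data(), C /* X stride */, Y.data(), C /* Y stride */) !=
      qnnp_status_success)
    return 1;

  // Execute; passing a null thread pool runs single-threaded.
  if (qnnp_run_operator(op, nullptr /* thread pool */) != qnnp_status_success)
    return 1;

  qnnp_delete_operator(op);
  return 0;
}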
diff --git a/third_party/QNNPACK b/third_party/QNNPACK
--- a/third_party/QNNPACK
+++ b/third_party/QNNPACK
@@ -1 +1 @@
-Subproject commit 8bb459126ad44eef11c9b0fbc5d5260ebfe177e
+Subproject commit cf768177baa93c8c984817cb27080dac42661fe