From 9e1805d38ef9eaae2c56e85c67704105b33f97e5 Mon Sep 17 00:00:00 2001
From: Marat Dukhan
Date: Mon, 26 Nov 2018 17:41:13 -0800
Subject: Switch Int8ChannelShuffle operator to QNNPACK (#14362)

Summary:
1.8-2.2X better performance on ARM devices

Pull Request resolved: https://github.com/pytorch/pytorch/pull/14362

Reviewed By: jerryzh168

Differential Revision: D13192312

Pulled By: Maratyszcza

fbshipit-source-id: 0d3dff067e300c7d741c42615b61246cbf09a829
---
 .../operators/quantized/int8_channel_shuffle_op.h | 170 +++++++--------------
 third_party/QNNPACK                               |   2 +-
 2 files changed, 54 insertions(+), 118 deletions(-)

diff --git a/caffe2/operators/quantized/int8_channel_shuffle_op.h b/caffe2/operators/quantized/int8_channel_shuffle_op.h
index cdd2a404a9..beef35eece 100644
--- a/caffe2/operators/quantized/int8_channel_shuffle_op.h
+++ b/caffe2/operators/quantized/int8_channel_shuffle_op.h
@@ -1,121 +1,18 @@
 #ifndef CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
 #define CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
 
+#include <qnnpack.h>
+
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor_int8.h"
 #include "caffe2/operators/conv_pool_op_base.h"
-#include "caffe2/operators/quantized/int8_simd.h"
 #include "caffe2/operators/quantized/int8_utils.h"
 
 namespace caffe2 {
 
 namespace int8 {
 
-namespace {
-
-template <size_t TileSizeK, size_t TileSizeG>
-inline void
-TransposeTile(const uint8_t* X_tile, size_t K, size_t G, uint8_t* Y_tile) {
-#ifdef INT8_NEON_SIMD
-  static_assert(TileSizeK == 8, "");
-  static_assert(TileSizeG == 4, "");
-  auto Transpose8x4_NEON =
-      [](uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3) {
-        const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
-        const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
-        const uint16x4x2_t c0 = vtrn_u16(
-            vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
-        const uint16x4x2_t c1 = vtrn_u16(
-            vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
-        *a0 = vreinterpret_u8_u16(c0.val[0]);
-        *a1 = vreinterpret_u8_u16(c1.val[0]);
-        *a2 = vreinterpret_u8_u16(c0.val[1]);
-        *a3 = vreinterpret_u8_u16(c1.val[1]);
-      };
-
-  uint8x8_t g0 = vld1_u8(X_tile + 0 * K);
-  uint8x8_t g1 = vld1_u8(X_tile + 1 * K);
-  uint8x8_t g2 = vld1_u8(X_tile + 2 * K);
-  uint8x8_t g3 = vld1_u8(X_tile + 3 * K);
-  Transpose8x4_NEON(&g0, &g1, &g2, &g3);
-  uint8_t tile[TileSizeK / 2][2][TileSizeG];
-  vst1_u8(&tile[0][0][0], g0);
-  vst1_u8(&tile[1][0][0], g1);
-  vst1_u8(&tile[2][0][0], g2);
-  vst1_u8(&tile[3][0][0], g3);
-  for (auto kkk = 0; kkk < 2; ++kkk) {
-    for (auto kk = 0; kk < TileSizeK / 2; ++kk) {
-      const auto k = TileSizeK / 2 * kkk + kk;
-      for (auto g = 0; g < TileSizeG; ++g) {
-        Y_tile[k * G + g] = tile[kk][kkk][g];
-      }
-    }
-  }
-#else
-  uint8_t tile[TileSizeG][TileSizeK];
-  for (auto g = 0; g < TileSizeG; ++g) {
-    for (auto k = 0; k < TileSizeK; ++k) {
-      tile[g][k] = X_tile[g * K + k];
-    }
-  }
-  for (auto k = 0; k < TileSizeK; ++k) {
-    for (auto g = 0; g < TileSizeG; ++g) {
-      Y_tile[k * G + g] = tile[g][k];
-    }
-  }
-#endif
-}
-
-void Int8ChannelShuffle(
-    const uint8_t* X_data,
-    size_t B,
-    size_t K,
-    size_t G,
-    uint8_t* Y_data,
-    ThreadPool* threadPool) {
-  auto divRoundUp = [](size_t n, size_t d) { return (n + d - 1) / d; };
-  constexpr size_t kTileSizeG = 4;
-  constexpr size_t kTileSizeK = 8;
-  auto f = [&](int, size_t b) {
-    for (auto kk = 0; kk < divRoundUp(K, kTileSizeK); ++kk) {
-      for (auto gg = 0; gg < divRoundUp(G, kTileSizeG); ++gg) {
-        const auto g = gg * kTileSizeG;
-        const auto k = kk * kTileSizeK;
-        const auto X_tile = X_data + b * G * K + g * K + k;
-        auto* Y_tile = Y_data + b * G * K + k * G + g;
-        if (kk * kTileSizeK + kTileSizeK <= K &&
-            gg * kTileSizeG + kTileSizeG <= G) {
-          // Complete tile.
-          TransposeTile<kTileSizeK, kTileSizeG>(X_tile, K, G, Y_tile);
-        } else {
-          uint8_t Xp[kTileSizeG][kTileSizeK];
-          uint8_t Yp[kTileSizeK][kTileSizeG];
-          for (auto kt = 0; kt < kTileSizeK; ++kt) {
-            for (auto gt = 0; gt < kTileSizeG; ++gt) {
-              if (k + kt < K && g + gt < G) {
-                Xp[gt][kt] = X_tile[gt * K + kt];
-              }
-            }
-          }
-          TransposeTile<kTileSizeK, kTileSizeG>(
-              &Xp[0][0], kTileSizeK, kTileSizeG, &Yp[0][0]);
-          for (auto kt = 0; kt < kTileSizeK; ++kt) {
-            for (auto gt = 0; gt < kTileSizeG; ++gt) {
-              if (k + kt < K && g + gt < G) {
-                Y_tile[kt * G + gt] = Yp[kt][gt];
-              }
-            }
-          }
-        }
-      }
-    }
-  };
-  threadPool->run(f, B);
-}
-
-} // namespace
-
 class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
  public:
   Int8ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws)
@@ -126,36 +23,75 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
         "Int8ChannelShuffleOp only supports NHWC order");
   }
 
+  ~Int8ChannelShuffleOp() {
+    if (this->qnnpackOperator_ != nullptr) {
+      qnnp_delete_operator(this->qnnpackOperator_);
+      this->qnnpackOperator_ = nullptr;
+    }
+  }
+
   bool RunOnDeviceWithOrderNHWC() override {
     const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
     auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
     Y->t.ResizeLike(X.t);
     Y->scale = X.scale;
    Y->zero_point = X.zero_point;
-    int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
-    auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
+    const int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
+    const float Y_scale = this->template GetSingleArgument<float>("Y_scale", 1.0f);
     CHECK_EQ(Y_offset, X.zero_point);
     CHECK_EQ(Y_scale, X.scale);
     CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min());
     CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max());
 
     const auto C = X.t.dim32(3);
-    CAFFE_ENFORCE(C % this->group_ == 0, "");
     const auto G = this->group_;
-    const auto K = C / G;
-    const auto B = X.t.dim32(0) * X.t.dim32(1) * X.t.dim32(2);
-    Int8ChannelShuffle(
-        X.t.data<uint8_t>(),
-        B,
-        K,
-        G,
-        Y->t.mutable_data<uint8_t>(),
-        ws_->GetThreadPool());
+    CAFFE_ENFORCE(C % G == 0, "");
+    const auto B = X.t.numel() / C;
+
+    initQNNPACK();
+
+    if (this->qnnpackOperator_ == nullptr) {
+      const qnnp_status createStatus = qnnp_create_channel_shuffle_nc_x8(
+          G /* groups */,
+          C / G /* group channels */,
+          &this->qnnpackOperator_);
+      CAFFE_ENFORCE(
+          createStatus == qnnp_status_success,
+          "failed to create QNNPACK channel shuffle operator");
+      CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
+    }
+
+    const qnnp_status setupStatus = qnnp_setup_channel_shuffle_nc_x8(
+        this->qnnpackOperator_,
+        X.t.numel() / C /* batch size */,
+        X.t.template data<uint8_t>(),
+        C /* X stride */,
+        Y->t.template mutable_data<uint8_t>(),
+        C /* Y stride */);
+    CAFFE_ENFORCE(
+        setupStatus == qnnp_status_success,
+        "failed to setup QNNPACK channel shuffle operator");
+
+#ifdef FBCODE_CAFFE2
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+    pthreadpool_t threadpool =
+        reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+    CAFFE_ENFORCE(
+        runStatus == qnnp_status_success,
+        "failed to run QNNPACK channel shuffle operator");
+
     return true;
   }
 
  private:
   Workspace* ws_;
+  // QNNPACK channel shuffle operator
+  qnnp_operator_t qnnpackOperator_{nullptr};
 };
 
 } // namespace int8
diff --git a/third_party/QNNPACK b/third_party/QNNPACK
index 8bb459126a..cf768177ba 160000
--- a/third_party/QNNPACK
+++ b/third_party/QNNPACK
@@ -1 +1 @@
-Subproject commit 8bb459126ad44eef11c9b0fbc5d5260ebfe177e4
+Subproject commit cf768177baa93c8c984817cb27080dac42661fea
--
cgit v1.2.3
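
Note (addendum, not part of the patch): below is a minimal standalone sketch of the
QNNPACK call sequence the patched operator now drives (initialize -> create -> setup
-> run -> delete). The qnnp_* functions and their argument lists are used exactly as
they appear in the diff above, against the QNNPACK revision this commit pins; the
tensor dimensions, buffer names, and the main() wrapper are illustrative assumptions.

#include <qnnpack.h>

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <vector>

int main() {
  // Illustrative sizes only (assumptions, not taken from the commit):
  // each NHWC pixel is a row of C = groups * group_channels uint8 values,
  // and "batch" counts N * H * W such rows.
  const std::size_t groups = 4;
  const std::size_t group_channels = 8;
  const std::size_t channels = groups * group_channels;
  const std::size_t batch = 2 * 56 * 56;

  std::vector<std::uint8_t> X(batch * channels, 0);  // quantized input rows
  std::vector<std::uint8_t> Y(batch * channels, 0);  // shuffled output rows

  // Same one-time library init the operator performs via initQNNPACK().
  if (qnnp_initialize() != qnnp_status_success) {
    return EXIT_FAILURE;  // CPU not supported by QNNPACK
  }

  // Create once and reuse, mirroring the cached qnnpackOperator_ member.
  qnnp_operator_t op = nullptr;
  if (qnnp_create_channel_shuffle_nc_x8(groups, group_channels, &op) !=
      qnnp_status_success) {
    return EXIT_FAILURE;
  }

  // Strides equal the channel count because rows are densely packed,
  // just like the C /* X stride */ and C /* Y stride */ arguments in the diff.
  if (qnnp_setup_channel_shuffle_nc_x8(
          op, batch, X.data(), channels, Y.data(), channels) !=
      qnnp_status_success) {
    return EXIT_FAILURE;
  }

  // A null thread pool runs single-threaded, matching the FBCODE_CAFFE2 branch.
  if (qnnp_run_operator(op, nullptr) != qnnp_status_success) {
    return EXIT_FAILURE;
  }

  qnnp_delete_operator(op);  // what the operator's destructor does
  return EXIT_SUCCESS;
}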