| field | value | date |
|---|---|---|
| author | Marat Dukhan <marat@fb.com> | 2018-11-26 17:41:13 -0800 |
| committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-11-26 17:43:32 -0800 |
| commit | 9e1805d38ef9eaae2c56e85c67704105b33f97e5 (patch) | |
| tree | 2ac363f3edf2bae93d04fa220e144d4a40571e11 | |
| parent | 2d6f039766193623dbf17c46bfcd17bb8c8bdb32 (diff) | |
Switch Int8ChannelShuffle operator to QNNPACK (#14362)
Summary:
1.8-2.2X better performance on ARM devices
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14362
Reviewed By: jerryzh168
Differential Revision: D13192312
Pulled By: Maratyszcza
fbshipit-source-id: 0d3dff067e300c7d741c42615b61246cbf09a829
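
For context on what this operator computes: channel shuffle over an NHWC tensor is a pure per-pixel permutation. With C = G * K channels, output channel k*G + g reads input channel g*K + k (the C channels are viewed as a G x K matrix and transposed), and the quantization scale and zero point pass through unchanged, which is why no requantization is involved. A minimal scalar sketch of these semantics, with illustrative names not taken from the patch:

```cpp
#include <cstddef>
#include <cstdint>

// Reference semantics of Int8ChannelShuffle over an NHWC uint8 tensor.
// rows = N * H * W (each pixel owns a contiguous run of C channels),
// and C = G * K. The C channels are viewed as a G x K matrix and
// transposed to K x G, so output channel k*G + g = input channel g*K + k.
void ChannelShuffleNHWC(
    const std::uint8_t* X,
    std::size_t rows,
    std::size_t G,
    std::size_t K,
    std::uint8_t* Y) {
  const std::size_t C = G * K;
  for (std::size_t r = 0; r < rows; ++r) {
    for (std::size_t g = 0; g < G; ++g) {
      for (std::size_t k = 0; k < K; ++k) {
        Y[r * C + k * G + g] = X[r * C + g * K + k];
      }
    }
  }
}
```

The deleted hand-written NEON path in the diff below implemented exactly this transform with 8x4 tiles; QNNPACK supplies its own vectorized kernels instead.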
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | caffe2/operators/quantized/int8_channel_shuffle_op.h | 170 |
| m--------- | third_party/QNNPACK | 0 |
2 files changed, 53 insertions, 117 deletions
diff --git a/caffe2/operators/quantized/int8_channel_shuffle_op.h b/caffe2/operators/quantized/int8_channel_shuffle_op.h
index cdd2a404a9..beef35eece 100644
--- a/caffe2/operators/quantized/int8_channel_shuffle_op.h
+++ b/caffe2/operators/quantized/int8_channel_shuffle_op.h
@@ -1,121 +1,18 @@
 #ifndef CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
 #define CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_
 
+#include <qnnpack.h>
+
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor_int8.h"
 #include "caffe2/operators/conv_pool_op_base.h"
-#include "caffe2/operators/quantized/int8_simd.h"
 #include "caffe2/operators/quantized/int8_utils.h"
 
 namespace caffe2 {
 namespace int8 {
 
-namespace {
-
-template <size_t TileSizeK, size_t TileSizeG>
-inline void
-TransposeTile(const uint8_t* X_tile, size_t K, size_t G, uint8_t* Y_tile) {
-#ifdef INT8_NEON_SIMD
-  static_assert(TileSizeK == 8, "");
-  static_assert(TileSizeG == 4, "");
-  auto Transpose8x4_NEON =
-      [](uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3) {
-        const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
-        const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
-        const uint16x4x2_t c0 = vtrn_u16(
-            vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
-        const uint16x4x2_t c1 = vtrn_u16(
-            vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
-        *a0 = vreinterpret_u8_u16(c0.val[0]);
-        *a1 = vreinterpret_u8_u16(c1.val[0]);
-        *a2 = vreinterpret_u8_u16(c0.val[1]);
-        *a3 = vreinterpret_u8_u16(c1.val[1]);
-      };
-
-  uint8x8_t g0 = vld1_u8(X_tile + 0 * K);
-  uint8x8_t g1 = vld1_u8(X_tile + 1 * K);
-  uint8x8_t g2 = vld1_u8(X_tile + 2 * K);
-  uint8x8_t g3 = vld1_u8(X_tile + 3 * K);
-  Transpose8x4_NEON(&g0, &g1, &g2, &g3);
-  uint8_t tile[TileSizeK / 2][2][TileSizeG];
-  vst1_u8(&tile[0][0][0], g0);
-  vst1_u8(&tile[1][0][0], g1);
-  vst1_u8(&tile[2][0][0], g2);
-  vst1_u8(&tile[3][0][0], g3);
-  for (auto kkk = 0; kkk < 2; ++kkk) {
-    for (auto kk = 0; kk < TileSizeK / 2; ++kk) {
-      const auto k = TileSizeK / 2 * kkk + kk;
-      for (auto g = 0; g < TileSizeG; ++g) {
-        Y_tile[k * G + g] = tile[kk][kkk][g];
-      }
-    }
-  }
-#else
-  uint8_t tile[TileSizeG][TileSizeK];
-  for (auto g = 0; g < TileSizeG; ++g) {
-    for (auto k = 0; k < TileSizeK; ++k) {
-      tile[g][k] = X_tile[g * K + k];
-    }
-  }
-  for (auto k = 0; k < TileSizeK; ++k) {
-    for (auto g = 0; g < TileSizeG; ++g) {
-      Y_tile[k * G + g] = tile[g][k];
-    }
-  }
-#endif
-}
-
-void Int8ChannelShuffle(
-    const uint8_t* X_data,
-    size_t B,
-    size_t K,
-    size_t G,
-    uint8_t* Y_data,
-    ThreadPool* threadPool) {
-  auto divRoundUp = [](size_t n, size_t d) { return (n + d - 1) / d; };
-  constexpr size_t kTileSizeG = 4;
-  constexpr size_t kTileSizeK = 8;
-  auto f = [&](int, size_t b) {
-    for (auto kk = 0; kk < divRoundUp(K, kTileSizeK); ++kk) {
-      for (auto gg = 0; gg < divRoundUp(G, kTileSizeG); ++gg) {
-        const auto g = gg * kTileSizeG;
-        const auto k = kk * kTileSizeK;
-        const auto X_tile = X_data + b * G * K + g * K + k;
-        auto* Y_tile = Y_data + b * G * K + k * G + g;
-        if (kk * kTileSizeK + kTileSizeK <= K &&
-            gg * kTileSizeG + kTileSizeG <= G) {
-          // Complete tile.
-          TransposeTile<kTileSizeK, kTileSizeG>(X_tile, K, G, Y_tile);
-        } else {
-          uint8_t Xp[kTileSizeG][kTileSizeK];
-          uint8_t Yp[kTileSizeK][kTileSizeG];
-          for (auto kt = 0; kt < kTileSizeK; ++kt) {
-            for (auto gt = 0; gt < kTileSizeG; ++gt) {
-              if (k + kt < K && g + gt < G) {
-                Xp[gt][kt] = X_tile[gt * K + kt];
-              }
-            }
-          }
-          TransposeTile<kTileSizeK, kTileSizeG>(
-              &Xp[0][0], kTileSizeK, kTileSizeG, &Yp[0][0]);
-          for (auto kt = 0; kt < kTileSizeK; ++kt) {
-            for (auto gt = 0; gt < kTileSizeG; ++gt) {
-              if (k + kt < K && g + gt < G) {
-                Y_tile[kt * G + gt] = Yp[kt][gt];
-              }
-            }
-          }
-        }
-      }
-    }
-  };
-  threadPool->run(f, B);
-}
-
-} // namespace
-
 class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
  public:
   Int8ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws)
@@ -126,36 +23,75 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
         "Int8ChannelShuffleOp only supports NHWC order");
   }
 
+  ~Int8ChannelShuffleOp() {
+    if (this->qnnpackOperator_ != nullptr) {
+      qnnp_delete_operator(this->qnnpackOperator_);
+      this->qnnpackOperator_ = nullptr;
+    }
+  }
+
   bool RunOnDeviceWithOrderNHWC() override {
     const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
     auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
     Y->t.ResizeLike(X.t);
     Y->scale = X.scale;
     Y->zero_point = X.zero_point;
-    int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
-    auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
+    const int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
+    const float Y_scale = this->template GetSingleArgument<float>("Y_scale", 1.0f);
     CHECK_EQ(Y_offset, X.zero_point);
     CHECK_EQ(Y_scale, X.scale);
     CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min());
     CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max());
 
     const auto C = X.t.dim32(3);
-    CAFFE_ENFORCE(C % this->group_ == 0, "");
     const auto G = this->group_;
-    const auto K = C / G;
-    const auto B = X.t.dim32(0) * X.t.dim32(1) * X.t.dim32(2);
-    Int8ChannelShuffle(
-        X.t.data<uint8_t>(),
-        B,
-        K,
-        G,
-        Y->t.mutable_data<uint8_t>(),
-        ws_->GetThreadPool());
+    CAFFE_ENFORCE(C % G == 0, "");
+    const auto B = X.t.numel() / C;
+
+    initQNNPACK();
+
+    if (this->qnnpackOperator_ == nullptr) {
+      const qnnp_status createStatus = qnnp_create_channel_shuffle_nc_x8(
+          G /* groups */,
+          C / G /* group channels */,
+          &this->qnnpackOperator_);
+      CAFFE_ENFORCE(
+          createStatus == qnnp_status_success,
+          "failed to create QNNPACK channel shuffle operator");
+      CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
+    }
+
+    const qnnp_status setupStatus = qnnp_setup_channel_shuffle_nc_x8(
+        this->qnnpackOperator_,
+        X.t.numel() / C /* batch size */,
+        X.t.template data<uint8_t>(),
+        C /* X stride */,
+        Y->t.template mutable_data<uint8_t>(),
+        C /* Y stride */);
+    CAFFE_ENFORCE(
+        setupStatus == qnnp_status_success,
+        "failed to setup QNNPACK channel shuffle operator");
+
+#ifdef FBCODE_CAFFE2
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+    pthreadpool_t threadpool =
+        reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+    CAFFE_ENFORCE(
+        runStatus == qnnp_status_success,
+        "failed to run QNNPACK channel shuffle operator");
+
     return true;
   }
 
  private:
   Workspace* ws_;
+  // QNNPACK channel shuffle operator
+  qnnp_operator_t qnnpackOperator_{nullptr};
 };
 
 } // namespace int8
diff --git a/third_party/QNNPACK b/third_party/QNNPACK
-Subproject commit 8bb459126ad44eef11c9b0fbc5d5260ebfe177e
+Subproject commit cf768177baa93c8c984817cb27080dac42661fe
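
The replacement code follows QNNPACK's two-phase operator model: create the operator once with shape-independent parameters (G and C/G), set it up per invocation with the current batch size and data pointers, run it, and delete it in the destructor. Below is a standalone sketch of that lifecycle, assuming the vendored QNNPACK revision exposes the three-argument `qnnp_create_channel_shuffle_nc_x8` signature used in this patch; `qnnp_initialize()` stands in for caffe2's `initQNNPACK()` wrapper, and a null thread pool runs single-threaded:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <qnnpack.h>

namespace {
// Abort with a message if a QNNPACK call does not succeed.
void Check(qnnp_status status, const char* what) {
  if (status != qnnp_status_success) {
    std::fprintf(stderr, "%s failed (status %d)\n", what, static_cast<int>(status));
    std::exit(EXIT_FAILURE);
  }
}
} // namespace

int main() {
  const std::size_t groups = 4;         // G
  const std::size_t groupChannels = 8;  // K = C / G
  const std::size_t channels = groups * groupChannels;  // C
  const std::size_t batch = 2 * 3 * 3;  // N * H * W pixel rows

  std::vector<std::uint8_t> x(batch * channels, 0);
  std::vector<std::uint8_t> y(batch * channels, 0);

  Check(qnnp_initialize(), "qnnp_initialize");

  // Phase 1: create once; depends only on the group structure,
  // not on the batch size.
  qnnp_operator_t op = nullptr;
  Check(
      qnnp_create_channel_shuffle_nc_x8(groups, groupChannels, &op),
      "qnnp_create_channel_shuffle_nc_x8");

  // Phase 2: bind this call's batch size, pointers, and strides.
  Check(
      qnnp_setup_channel_shuffle_nc_x8(
          op,
          batch,
          x.data(),
          channels /* x stride */,
          y.data(),
          channels /* y stride */),
      "qnnp_setup_channel_shuffle_nc_x8");

  // A null thread pool means single-threaded execution.
  Check(qnnp_run_operator(op, nullptr /* thread pool */), "qnnp_run_operator");

  qnnp_delete_operator(op);
  return 0;
}
```

Caching the created operator (as the op does in `qnnpackOperator_`) and redoing only the cheap setup step per batch amortizes the creation cost across invocations.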