author     Marat Dukhan <marat@fb.com>                                           2018-10-25 12:38:35 -0700
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2018-10-25 12:43:00 -0700
commit     9cb4bce847af688ca344caaa876d3c47a36f28bf (patch)
tree       bb08da3db0087d543417e81b9b2d42f87c325672 /caffe2/operators/quantized
parent     faa354e10243e9e6d6428e1bd0f317bb226c42bd (diff)
Open-source Caffe2 Int8 ops (#13065)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13065
- Open-source Caffe2 Int8 (quantized) operators
Reviewed By: Yangqing
Differential Revision: D10524381
fbshipit-source-id: 6daa153dc247572900c91e37262d033c368b382d
Diffstat (limited to 'caffe2/operators/quantized')
44 files changed, 4540 insertions, 0 deletions
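
Most of the operators added here take `Y_scale` and `Y_zero_point` arguments describing the output quantization, and the `Int8TensorCPU` type carries a matching `scale`/`zero_point` pair; `Int8Dequantize` below computes `(q - zero_point) * scale` per element. The following minimal sketch (illustrative helper names, not part of this change) spells out that affine mapping, plus a clamped inverse that assumes round-to-nearest; the rounding used by Caffe2's own quantization utilities may differ.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Affine dequantization, matching the element-wise formula in Int8Dequantize:
//   real = scale * (q - zero_point)
float dequantize(uint8_t q, float scale, int32_t zero_point) {
  return scale * (static_cast<int32_t>(q) - zero_point);
}

// Hypothetical inverse for illustration: round to nearest, then clamp to the
// representable uint8 range.
uint8_t quantize_clamped(float x, float scale, int32_t zero_point) {
  const int32_t q =
      zero_point + static_cast<int32_t>(std::nearbyint(x / scale));
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}

int main() {
  const float scale = 0.5f;        // example parameters only
  const int32_t zero_point = 128;
  const uint8_t q = 200;
  const float real = dequantize(q, scale, zero_point);             // 36.0
  const uint8_t back = quantize_clamped(real, scale, zero_point);  // 200
  std::printf("q=%u -> real=%.1f -> q=%u\n",
              static_cast<unsigned>(q), real, static_cast<unsigned>(back));
  return 0;
}
```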
diff --git a/caffe2/operators/quantized/init_qnnpack.cc b/caffe2/operators/quantized/init_qnnpack.cc new file mode 100644 index 0000000000..8b1356b366 --- /dev/null +++ b/caffe2/operators/quantized/init_qnnpack.cc @@ -0,0 +1,17 @@ +#include <mutex> + +#include <qnnpack.h> + +#include "caffe2/core/logging.h" + +namespace caffe2 { + +void initQNNPACK() { + static std::once_flag once; + static enum qnnp_status qnnpackStatus = qnnp_status_uninitialized; + std::call_once(once, []() { qnnpackStatus = qnnp_initialize(); }); + CAFFE_ENFORCE( + qnnpackStatus == qnnp_status_success, "failed to initialize QNNPACK"); +} + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_add_op.cc b/caffe2/operators/quantized/int8_add_op.cc new file mode 100644 index 0000000000..225a7a1f6b --- /dev/null +++ b/caffe2/operators/quantized/int8_add_op.cc @@ -0,0 +1,68 @@ +#include <climits> + +#include "caffe2/operators/quantized/int8_add_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Add, int8::Int8AddOp<int8::Activation::NONE>); +REGISTER_CPU_OPERATOR(Int8AddRelu, int8::Int8AddOp<int8::Activation::RELU>); + +REGISTER_CPU_OPERATOR(Int8Sum, int8::Int8AddOp<int8::Activation::NONE>); +REGISTER_CPU_OPERATOR(Int8SumRelu, int8::Int8AddOp<int8::Activation::RELU>); + +OPERATOR_SCHEMA(Int8Add) + .NumInputs(2) + .NumOutputs(1) + .AllowInplace({{0, 0}, {1, 0}}) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .SetDoc(R"DOC( + Performs element-wise binary Add (with no broadcast support). +)DOC") + .Input( + 0, + "A", + "First operand, should share the type with the second operand.") + .Input(1, "B", "Second operand. It should be of the same size as A.") + .Output(0, "C", "Result, has same dimensions and type as A"); + +OPERATOR_SCHEMA(Int8AddRelu) + .NumInputs(2) + .NumOutputs(1) + .AllowInplace({{0, 0}, {1, 0}}) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .SetDoc(R"DOC( + Performs element-wise binary Add (with no broadcast support). " + "Output will go through rectified linear " + "function, where y = max(0, x). +)DOC") + .Input( + 0, + "A", + "First operand, should share the type with the second operand.") + .Input(1, "B", "Second operand. It should be of the same size as A.") + .Output(0, "C", "Result, has same dimensions and type as A"); + +/* + * These ops are defined as alias of Int8Add/Int8AddRelu for compatibility + * with current production models. In the future these ops will be changed + * to an equivalent of Sum op, which does reduction of a single argument. + * We deliberately omit schema for Int8Sum/Int8SumRelu so they can + * temporary use either legacy or the new semantics depending on the engine. 
+ */ +OPERATOR_SCHEMA(Int8Sum) + .NumInputs(1, std::numeric_limits<int>::max()) + .NumOutputs(1) + .AllowInplace({{0, 0}, {1, 0}}) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset"); + +OPERATOR_SCHEMA(Int8SumRelu) + .NumInputs(1, std::numeric_limits<int>::max()) + .NumOutputs(1) + .AllowInplace({{0, 0}, {1, 0}}) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_add_op.h b/caffe2/operators/quantized/int8_add_op.h new file mode 100644 index 0000000000..b847b012c0 --- /dev/null +++ b/caffe2/operators/quantized/int8_add_op.h @@ -0,0 +1,222 @@ +#ifndef CAFFE2_OPERATORS_INT8_ADD_OP_H_ +#define CAFFE2_OPERATORS_INT8_ADD_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +/* + * Implementation based on TensorFlow Lite kernels: + * - Repo: https://github.com/tensorflow/tensorflow + * - Path: tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h + * - Hash: d4ad9c73969c45d1a224ebfc43eb645b9860216b + */ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +constexpr size_t kAddLeftShift = 20; + +void Int8Add( + const uint8_t* X0_data, + size_t N, + size_t D, + int32_t X0_offset, + int32_t X0_multiplier, + int X0_shift, + const uint8_t* X1_data, + int32_t X1_offset, + int32_t X1_multiplier, + int X1_shift, + int32_t Y_offset, + int32_t Y_multiplier, + int Y_shift, + uint8_t* Y_data, + uint8_t Y_activation_min, + uint8_t Y_activation_max, + C2GEMMContext* gemm_context) { + CHECK_GT(X0_offset, -256); + CHECK_GT(X1_offset, -256); + CHECK_LT(X0_offset, 256); + CHECK_LT(X1_offset, 256); + static_assert(kAddLeftShift > 5, ""); + static_assert(kAddLeftShift <= 20, ""); + + auto f = [&](int, size_t n) { + size_t d = 0; + +#ifdef INT8_NEON_SIMD + constexpr size_t kIntermediateAddLeftShift = 4; + const auto X0_offset_val = + vshlq_n_s16(vdupq_n_s16(X0_offset), kIntermediateAddLeftShift); + const auto X1_offset_val = + vshlq_n_s16(vdupq_n_s16(X1_offset), kIntermediateAddLeftShift); + const auto X0_shift_dup = vdupq_n_s32(-X0_shift); + const auto X1_shift_dup = vdupq_n_s32(-X1_shift); + const auto DUnroll = (D / 8) * 8; + + for (; d < DUnroll; d += 8) { + const auto X0_val_original = vld1_u8(X0_data + n * D + d); + const auto X1_val_original = vld1_u8(X1_data + n * D + d); + + // Load input + // Widen to int16 + // Add int16 offset. + // Widen to int32 + // Shift right by 20. + // Alternatively, we can widening shift X by 4, shifty X0_offset by 4, + // add, then shift by 16. 
Safe as X << 5 + X_offset << 5 can't overflow + // uint16, as X ~ 8 bit, X_offset ~ 10 bit, so 15 bits total from X + + // X_offset + const auto X0_val_s16 = vreinterpretq_s16_u16( + vshll_n_u8(X0_val_original, kIntermediateAddLeftShift)); + const auto X1_val_s16 = vreinterpretq_s16_u16( + vshll_n_u8(X1_val_original, kIntermediateAddLeftShift)); + const auto X0_val = vaddq_s16(X0_val_s16, X0_offset_val); + const auto X1_val = vaddq_s16(X1_val_s16, X1_offset_val); + const auto X0_val_high = vget_high_s16(X0_val); + const auto X0_val_low = vget_low_s16(X0_val); + const auto X1_val_high = vget_high_s16(X1_val); + const auto X1_val_low = vget_low_s16(X1_val); + auto x11 = + vshll_n_s16(X0_val_low, kAddLeftShift - kIntermediateAddLeftShift); + auto x12 = + vshll_n_s16(X0_val_high, kAddLeftShift - kIntermediateAddLeftShift); + auto x21 = + vshll_n_s16(X1_val_low, kAddLeftShift - kIntermediateAddLeftShift); + auto x22 = + vshll_n_s16(X1_val_high, kAddLeftShift - kIntermediateAddLeftShift); + x11 = vqrdmulhq_n_s32(x11, X0_multiplier); + x12 = vqrdmulhq_n_s32(x12, X0_multiplier); + x21 = vqrdmulhq_n_s32(x21, X1_multiplier); + x22 = vqrdmulhq_n_s32(x22, X1_multiplier); + x11 = vshlq_s32(x11, X0_shift_dup); + x12 = vshlq_s32(x12, X0_shift_dup); + x21 = vshlq_s32(x21, X1_shift_dup); + x22 = vshlq_s32(x22, X1_shift_dup); + auto s1 = vaddq_s32(x11, x21); + auto s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, Y_multiplier); + s2 = vqrdmulhq_n_s32(s2, Y_multiplier); + using gemmlowp::RoundingDivideByPOT; + s1 = RoundingDivideByPOT(s1, Y_shift); + s2 = RoundingDivideByPOT(s2, Y_shift); + const auto s1_narrowed = vmovn_s32(s1); + const auto s2_narrowed = vmovn_s32(s2); + const auto s = vaddq_s16( + vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(Y_offset)); + auto ss = vqmovun_s16(s); + ss = vmin_u8(ss, vdup_n_u8(Y_activation_max)); + ss = vmax_u8(ss, vdup_n_u8(Y_activation_min)); + vst1_u8(Y_data + n * D + d, ss); + } +#endif // NEON + + for (; d < D; d++) { + const int32_t X0_val = X0_offset + X0_data[n * D + d]; + const int32_t X1_val = X1_offset + X1_data[n * D + d]; + const int32_t shifted_X0_val = X0_val * (1 << kAddLeftShift); + const int32_t shifted_X1_val = X1_val * (1 << kAddLeftShift); + const int32_t scaled_X0_val = MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_X0_val, X0_multiplier, X0_shift); + const int32_t scaled_X1_val = MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_X1_val, X1_multiplier, X1_shift); + const int32_t raw_sum = scaled_X0_val + scaled_X1_val; + const int32_t raw_Y = MultiplyByQuantizedMultiplierSmallerThanOne( + raw_sum, Y_multiplier, Y_shift) + + Y_offset; + const int32_t clamped_Y = std::min<int32_t>( + Y_activation_max, std::max<int32_t>(Y_activation_min, raw_Y)); + Y_data[n * D + d] = static_cast<uint8_t>(clamped_Y); + } + }; + gemm_context->threadPool()->run(f, N); +} + +} // namespace + +template <Activation Ac> +class Int8AddOp final : public Operator<CPUContext> { + public: + Int8AddOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws), + gemm_context_(ws->GetThreadPool()) {} + + bool RunOnDevice() override { + CAFFE_ENFORCE_EQ(Inputs().size(), 2); + const auto& X0 = Inputs()[0]->template Get<Int8TensorCPU>(); + const auto& X1 = Inputs()[1]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + auto X0_offset = -X0.zero_point; + auto X1_offset = -X1.zero_point; + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = 
this->template GetSingleArgument<float>("Y_scale", 1); + const double twice_max_input_scale = 2 * std::max(X0.scale, X1.scale); + const double real_X0_multiplier = X0.scale / twice_max_input_scale; + const double real_X1_multiplier = X1.scale / twice_max_input_scale; + const double real_Y_multiplier = + twice_max_input_scale / ((1 << kAddLeftShift) * Y_scale); + + Y->t.ResizeLike(X0.t); + Y->zero_point = Y_offset; + Y->scale = Y_scale; + + int32_t X0_multiplier; + int X0_shift; + QuantizeMultiplierSmallerThanOne( + real_X0_multiplier, &X0_multiplier, &X0_shift); + int32_t X1_multiplier; + int X1_shift; + QuantizeMultiplierSmallerThanOne( + real_X1_multiplier, &X1_multiplier, &X1_shift); + int32_t Y_multiplier; + int Y_shift; + QuantizeMultiplierSmallerThanOne( + real_Y_multiplier, &Y_multiplier, &Y_shift); + + Int8Add( + X0.t.template data<uint8_t>(), + X0.t.size() / X0.t.dim(X0.t.ndim() - 1), + X0.t.dim(X0.t.ndim() - 1), + X0_offset, + X0_multiplier, + X0_shift, + X1.t.template data<uint8_t>(), + X1_offset, + X1_multiplier, + X1_shift, + Y_offset, + Y_multiplier, + Y_shift, + Y->t.template mutable_data<uint8_t>(), + activationLimits(Y->scale, Y->zero_point, Ac).first, + activationLimits(Y->scale, Y->zero_point, Ac).second, + &gemm_context_); + return true; + } + + private: + C2GEMMContext gemm_context_; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_ADD_OP_H_ diff --git a/caffe2/operators/quantized/int8_average_pool_op.cc b/caffe2/operators/quantized/int8_average_pool_op.cc new file mode 100644 index 0000000000..70df83f6fc --- /dev/null +++ b/caffe2/operators/quantized/int8_average_pool_op.cc @@ -0,0 +1,65 @@ +#include "caffe2/operators/quantized/int8_average_pool_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR( + Int8AveragePool, + int8::Int8AveragePoolOp<int8::Activation::NONE>); +REGISTER_CPU_OPERATOR( + Int8AveragePoolRelu, + int8::Int8AveragePoolOp<int8::Activation::RELU>); + +const char kAveragePoolDoc_int8[] = R"DOC( +consumes an input blob X and applies average pooling across the +the blob according to kernel sizes, stride sizes, and pad lengths defined by the +ConvPoolOpBase operator. Average pooling consisting of averaging all values of a +subset of the input tensor according to the kernel size and downsampling the +data into the output blob Y for further processing. +)DOC"; + +std::function<void(OpSchema&)> AveragePoolDocGenerator( + const char* dim, + bool relu_fused = false) { + auto suffix = relu_fused ? " Output will go through rectified linear " + "function, where y = max(0, x)." + : ""; + return [=](OpSchema& schema) { + string doc = "AveragePool{dim} {pool_doc}"; + c10::ReplaceAll(doc, "{dim}", dim); + c10::ReplaceAll(doc, "{pool_doc}", kAveragePoolDoc_int8); + schema.SetDoc(doc); + string output_doc = + "Output data tensor from average pooling across the input " + "tensor. Dimensions will vary based on various kernel, stride, and pad " + "sizes.{suffix}"; + c10::ReplaceAll(output_doc, "{suffix}", suffix); + schema.Input( + 0, + "X", + "Input data tensor from the previous operator; dimensions depend on " + "whether the NCHW or NHWC operators are being used. For example, in " + "the former, the input has size (N x C x H x W), where N is the batch " + "size, C is the number of channels, and H and W are the height and the " + "width of the data. 
The corresponding permutation of dimensions is " + "used in the latter case."); + schema.Output(0, "Y", output_doc.c_str()); + }; +} + +OPERATOR_SCHEMA(Int8AveragePool) + .NumInputs(1) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) + .FillUsing(AveragePoolDocGenerator("")); + +OPERATOR_SCHEMA(Int8AveragePoolRelu) + .NumInputs(1) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) + .FillUsing(AveragePoolDocGenerator("", true)); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_average_pool_op.h b/caffe2/operators/quantized/int8_average_pool_op.h new file mode 100644 index 0000000000..c8bcb66d93 --- /dev/null +++ b/caffe2/operators/quantized/int8_average_pool_op.h @@ -0,0 +1,197 @@ +#ifndef CAFFE2_OPERATORS_INT8_AVERAGE_POOL_OP_H_ +#define CAFFE2_OPERATORS_INT8_AVERAGE_POOL_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/conv_pool_op_base.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +/* + * Implementation based on TensorFlow Lite kernels: + * - Repo: https://github.com/tensorflow/tensorflow + * - Path: tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h + * - Hash: d4ad9c73969c45d1a224ebfc43eb645b9860216b + */ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +void Int8AveragePool( + const uint8_t* input_data, + at::IntList input_dims, + int stride_width, + int stride_height, + int pad_width, + int pad_height, + int filter_width, + int filter_height, + uint8_t* output_data, + at::IntList output_dims, + uint8_t output_activation_min, + uint8_t output_activation_max) { + DCHECK_LE(output_activation_min, output_activation_max); + const int batches = input_dims[0]; + const int depth = input_dims[3]; + const int input_height = input_dims[1]; + const int input_width = input_dims[2]; + const int output_height = output_dims[1]; + const int output_width = output_dims[2]; + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = + std::min(filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + // 1280 required by Inception v3 + static constexpr int kAccBufferMaxSize = 2048; + DCHECK_LE(depth, kAccBufferMaxSize); + uint16_t acc[kAccBufferMaxSize]; + memset(acc, 0, depth * sizeof(acc[0])); + const uint8_t* input_ptr = input_data + depth * in_x_origin + + depth * input_width * in_y_origin + + depth * input_width * input_height * batch; + for (int fy = filter_y_start; fy < filter_y_end; fy++) { + const uint8_t* input_row_ptr = + input_ptr + fy * input_width * depth + filter_x_start * depth; + for (int fx = filter_x_start; fx < filter_x_end; fx++) { + int channel = 0; +#ifdef INT8_NEON_SIMD + for (; channel <= depth - 16; channel += 16) { + uint16x8_t acc_reg[2]; + for (int i = 0; i < 2; i++) { + acc_reg[i] = vld1q_u16(acc + channel + 8 * i); + } + uint8x16_t input_reg = vld1q_u8(input_row_ptr); + input_row_ptr += 16; + acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg)); + acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg)); + for (int i = 0; i < 2; i++) { + vst1q_u16(acc + channel + 8 * i, acc_reg[i]); + } + } + for (; channel <= depth - 8; channel += 8) { + uint16x8_t acc_reg = vld1q_u16(acc + channel); + uint8x8_t input_reg = vld1_u8(input_row_ptr); + input_row_ptr += 8; + acc_reg = vaddw_u8(acc_reg, input_reg); + vst1q_u16(acc + channel, acc_reg); + } +#endif + for (; channel < depth; ++channel) { + acc[channel] += *input_row_ptr++; + } + } + } + uint8_t* output_ptr = output_data + out_x * depth + + out_y * depth * output_width + + batch * depth * output_width * output_height; + int channel = 0; +#ifdef INT8_NEON_SIMD +#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ + if (filter_count == FILTER_COUNT) { \ + for (; channel <= depth - 8; channel += 8) { \ + uint16_t buf[8]; \ + for (int i = 0; i < 8; i++) { \ + buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ + } \ + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ + buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); \ + buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); \ + vst1_u8(output_ptr + channel, buf8); \ + } \ + } + AVGPOOL_DIVIDING_BY(9) + AVGPOOL_DIVIDING_BY(15) +#undef AVGPOOL_DIVIDING_BY + for (; channel <= depth - 8; channel += 8) { + uint16_t buf[8]; + 
for (int i = 0; i < 8; i++) { + buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; + } + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); + buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); + buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); + vst1_u8(output_ptr + channel, buf8); + } +#endif + for (; channel < depth; ++channel) { + uint16_t a = (acc[channel] + filter_count / 2) / filter_count; + a = std::max<uint16_t>(a, output_activation_min); + a = std::min<uint16_t>(a, output_activation_max); + output_ptr[channel] = static_cast<uint8_t>(a); + } + } + } + } +} + +} // namespace + +template <Activation Ac> +class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> { + public: + Int8AveragePoolOp(const OperatorDef& operator_def, Workspace* ws) + : ConvPoolOpBase<CPUContext>(operator_def, ws) { + OPERATOR_NEEDS_FEATURE( + this->order_ == StorageOrder::NHWC, "Int8 only supports NCHW order."); + } + + bool RunOnDeviceWithOrderNHWC() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + Y->scale = X.scale; + Y->zero_point = X.zero_point; + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + + CHECK_EQ(X.t.ndim(), 4); + const int channels = X.t.dim32(3); + ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), channels); + + Int8AveragePool( + X.t.template data<uint8_t>(), + X.t.sizes(), + stride_w(), + stride_h(), + pad_l(), + pad_t(), + kernel_w(), + kernel_h(), + Y->t.template mutable_data<uint8_t>(), + Y->t.sizes(), + activationLimits(Y->scale, Y->zero_point, Ac).first, + activationLimits(Y->scale, Y->zero_point, Ac).second); + return true; + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_AVERAGE_POOL_OP_H_ diff --git a/caffe2/operators/quantized/int8_channel_shuffle_op.cc b/caffe2/operators/quantized/int8_channel_shuffle_op.cc new file mode 100644 index 0000000000..a133e2bf74 --- /dev/null +++ b/caffe2/operators/quantized/int8_channel_shuffle_op.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/quantized/int8_channel_shuffle_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8ChannelShuffle, int8::Int8ChannelShuffleOp); + +OPERATOR_SCHEMA(Int8ChannelShuffle) + .IdenticalTypeAndShape() + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .NumInputs(1) + .NumOutputs(1); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_channel_shuffle_op.h b/caffe2/operators/quantized/int8_channel_shuffle_op.h new file mode 100644 index 0000000000..4589c7d9e8 --- /dev/null +++ b/caffe2/operators/quantized/int8_channel_shuffle_op.h @@ -0,0 +1,164 @@ +#ifndef CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_ +#define CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/conv_pool_op_base.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +template <size_t TileSizeK, size_t TileSizeG> +inline void +TransposeTile(const uint8_t* X_tile, size_t K, size_t G, uint8_t* Y_tile) { +#ifdef INT8_NEON_SIMD + static_assert(TileSizeK == 8, ""); + static_assert(TileSizeG == 4, ""); + auto Transpose8x4_NEON = + [](uint8x8_t* a0, uint8x8_t* a1, 
uint8x8_t* a2, uint8x8_t* a3) { + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + const uint16x4x2_t c0 = vtrn_u16( + vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = vtrn_u16( + vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); + }; + + uint8x8_t g0 = vld1_u8(X_tile + 0 * K); + uint8x8_t g1 = vld1_u8(X_tile + 1 * K); + uint8x8_t g2 = vld1_u8(X_tile + 2 * K); + uint8x8_t g3 = vld1_u8(X_tile + 3 * K); + Transpose8x4_NEON(&g0, &g1, &g2, &g3); + uint8_t tile[TileSizeK / 2][2][TileSizeG]; + vst1_u8(&tile[0][0][0], g0); + vst1_u8(&tile[1][0][0], g1); + vst1_u8(&tile[2][0][0], g2); + vst1_u8(&tile[3][0][0], g3); + for (auto kkk = 0; kkk < 2; ++kkk) { + for (auto kk = 0; kk < TileSizeK / 2; ++kk) { + const auto k = TileSizeK / 2 * kkk + kk; + for (auto g = 0; g < TileSizeG; ++g) { + Y_tile[k * G + g] = tile[kk][kkk][g]; + } + } + } +#else + uint8_t tile[TileSizeG][TileSizeK]; + for (auto g = 0; g < TileSizeG; ++g) { + for (auto k = 0; k < TileSizeK; ++k) { + tile[g][k] = X_tile[g * K + k]; + } + } + for (auto k = 0; k < TileSizeK; ++k) { + for (auto g = 0; g < TileSizeG; ++g) { + Y_tile[k * G + g] = tile[g][k]; + } + } +#endif +} + +void Int8ChannelShuffle( + const uint8_t* X_data, + size_t B, + size_t K, + size_t G, + uint8_t* Y_data, + C2GEMMContext* gemm_context) { + auto divRoundUp = [](size_t n, size_t d) { return (n + d - 1) / d; }; + constexpr size_t kTileSizeG = 4; + constexpr size_t kTileSizeK = 8; + auto f = [&](int, size_t b) { + for (auto kk = 0; kk < divRoundUp(K, kTileSizeK); ++kk) { + for (auto gg = 0; gg < divRoundUp(G, kTileSizeG); ++gg) { + const auto g = gg * kTileSizeG; + const auto k = kk * kTileSizeK; + const auto X_tile = X_data + b * G * K + g * K + k; + auto* Y_tile = Y_data + b * G * K + k * G + g; + if (kk * kTileSizeK + kTileSizeK <= K && + gg * kTileSizeG + kTileSizeG <= G) { + // Complete tile. 
+ TransposeTile<kTileSizeK, kTileSizeG>(X_tile, K, G, Y_tile); + } else { + uint8_t Xp[kTileSizeG][kTileSizeK]; + uint8_t Yp[kTileSizeK][kTileSizeG]; + for (auto kt = 0; kt < kTileSizeK; ++kt) { + for (auto gt = 0; gt < kTileSizeG; ++gt) { + if (k + kt < K && g + gt < G) { + Xp[gt][kt] = X_tile[gt * K + kt]; + } + } + } + TransposeTile<kTileSizeK, kTileSizeG>( + &Xp[0][0], kTileSizeK, kTileSizeG, &Yp[0][0]); + for (auto kt = 0; kt < kTileSizeK; ++kt) { + for (auto gt = 0; gt < kTileSizeG; ++gt) { + if (k + kt < K && g + gt < G) { + Y_tile[kt * G + gt] = Yp[kt][gt]; + } + } + } + } + } + } + }; + gemm_context->threadPool()->run(f, B); +} + +} // namespace + +class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> { + public: + Int8ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws) + : ConvPoolOpBase<CPUContext>(operator_def, ws), + gemm_context_(ws->GetThreadPool()) { + OPERATOR_NEEDS_FEATURE( + this->order_ == StorageOrder::NHWC, + "Int8ChannelShuffleOp only supports NHWC order"); + } + + bool RunOnDeviceWithOrderNHWC() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + Y->t.ResizeLike(X.t); + Y->scale = X.scale; + Y->zero_point = X.zero_point; + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min()); + CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max()); + + const auto C = X.t.dim32(3); + CAFFE_ENFORCE(C % this->group_ == 0, ""); + const auto G = this->group_; + const auto K = C / G; + const auto B = X.t.dim32(0) * X.t.dim32(1) * X.t.dim32(2); + Int8ChannelShuffle( + X.t.data<uint8_t>(), + B, + K, + G, + Y->t.mutable_data<uint8_t>(), + &gemm_context_); + return true; + } + + private: + C2GEMMContext gemm_context_; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_CHANNEL_SHUFFLE_OP_H_ diff --git a/caffe2/operators/quantized/int8_concat_op.cc b/caffe2/operators/quantized/int8_concat_op.cc new file mode 100644 index 0000000000..8950d41427 --- /dev/null +++ b/caffe2/operators/quantized/int8_concat_op.cc @@ -0,0 +1,22 @@ +#include "caffe2/operators/quantized/int8_concat_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Concat, int8::Int8ConcatOp); + +OPERATOR_SCHEMA(Int8Concat) + .NumInputs(1, INT_MAX) + .NumOutputs(1, 2) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .Arg("axis", "Which axis to concat on") + .Arg( + "add_axis", + "Pass 1 to add the axis specified in arg 'axis' to all " + "input tensors") + .SetDoc("Concatenate a list of tensors into a single tensor") + .Output(0, "concat_result", "Concatenated tensor") + .Output(1, "split_info", "The dimensions of the inputs.") + .InheritOnnxSchema("Concat"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_concat_op.h b/caffe2/operators/quantized/int8_concat_op.h new file mode 100644 index 0000000000..939376f795 --- /dev/null +++ b/caffe2/operators/quantized/int8_concat_op.h @@ -0,0 +1,90 @@ +#ifndef CAFFE2_OPERATORS_INT8_CONCAT_OP_H_ +#define CAFFE2_OPERATORS_INT8_CONCAT_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace 
int8 { + +class Int8ConcatOp final : public Operator<CPUContext> { + public: + Int8ConcatOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws) { + // concat supports more than NHWC format + if (this->template GetSingleArgument<string>("order", "") == "NHWC") { + // Default to C axis + axis_ = this->template GetSingleArgument<int>("axis", 3); + CHECK_GE(axis_, 0); + CHECK_LT(axis_, 4); + } else if ( + this->template GetSingleArgument<string>("order", "") == "NCHW") { + axis_ = this->template GetSingleArgument<int>("axis", 1); + CHECK_GE(axis_, 0); + CHECK_LT(axis_, 4); + } else { + axis_ = this->template GetSingleArgument<int>("axis", 0); + } + } + + bool RunOnDevice() override { + const auto& X0 = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + Y->scale = X0.scale; + Y->zero_point = X0.zero_point; + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X0.zero_point); + CHECK_EQ(Y_scale, X0.scale); + CHECK_GE(X0.zero_point, std::numeric_limits<uint8_t>::min()); + CHECK_LE(X0.zero_point, std::numeric_limits<uint8_t>::max()); + auto Y_dims = X0.t.sizes().vec(); + if (this->template GetSingleArgument<string>("order", "") == "NHWC") { + CHECK_EQ(Y_dims.size(), 4); + } + for (auto i = 1; i < InputSize(); ++i) { + const auto& Xi = Inputs()[i]->template Get<Int8TensorCPU>(); + CHECK_EQ(Xi.t.ndim(), Y_dims.size()); + for (auto j = 0; j < Y_dims.size(); ++j) { + if (j != axis_) { + CHECK_EQ(Xi.t.dim(j), Y_dims[j]); + } + } + Y_dims[axis_] += Xi.t.dim(axis_); + } + Y->t.Resize(Y_dims); + int before = X0.t.size_to_dim(axis_); + int after = X0.t.size_from_dim(axis_ + 1); + const auto C_total = Y_dims[axis_]; + size_t C_offset = 0; + for (auto i = 0; i < InputSize(); ++i) { + const auto& Xi = Inputs()[i]->template Get<Int8TensorCPU>(); + // Copy the NxHxWxC input slice to NxHxWx[C_offset:C_offset + C]. + const auto Ci = Xi.t.dim(axis_); + math::CopyMatrix<CPUContext>( + Xi.t.itemsize(), + before, + Ci * after, + Xi.t.template data<uint8_t>(), + Ci * after, + Y->t.template mutable_data<uint8_t>() + C_offset, + C_total * after, + &context_, + Xi.t.meta().copy()); + C_offset += Ci * after * Xi.t.itemsize(); + } + return true; + } + + private: + int axis_; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_CONCAT_OP_H_ diff --git a/caffe2/operators/quantized/int8_conv_op.cc b/caffe2/operators/quantized/int8_conv_op.cc new file mode 100644 index 0000000000..4bbd83f5ab --- /dev/null +++ b/caffe2/operators/quantized/int8_conv_op.cc @@ -0,0 +1,81 @@ +#include "caffe2/operators/quantized/int8_conv_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Conv, int8::Int8ConvOp<int8::Activation::NONE>); +REGISTER_CPU_OPERATOR(Int8ConvRelu, int8::Int8ConvOp<int8::Activation::RELU>); + +const char kConvDoc_int8[] = R"DOC( +[Only NHWC order is supported now]Note that other parameters, such as the stride and +kernel size, or the pads' sizes in each direction are not necessary for input +because they are provided by the ConvPoolOpBase operator. Various dimension +checks are done implicitly, and the sizes are specified in the Input docs for +this operator. As is expected, the filter is convolved with a subset of the +image and the bias is added; this is done throughout the image data and the +output is computed. 
As a side note on the implementation layout: +conv_op_impl.h is the templated implementation of the conv_op.h file, which is +why they are separate files. +)DOC"; + +std::function<void(OpSchema&)> ConvDocGenerator( + const char* dim, + bool relu_fused = false) { + auto suffix = relu_fused ? " Output will go through rectified linear " + "function, where y = max(0, x)." + : ""; + return [=](OpSchema& schema) { + string doc = R"DOC( +The convolution operator consumes an input vector, a {dim}filter blob +and a bias blob and computes the output. {conv_doc})DOC"; + c10::ReplaceAll(doc, "{dim}", dim); + c10::ReplaceAll(doc, "{conv_doc}", kConvDoc_int8); + schema.SetDoc(doc); + string output_doc = + "Output data blob that contains the result of the " + "convolution. The output dimensions are functions of the kernel size, " + "stride size, and pad lengths.{suffix}"; + c10::ReplaceAll(output_doc, "{suffix}", suffix); + schema.Input( + 0, + "X", + "Input data blob from previous layer; has size (N x C x H x W), " + "where N is the batch size, C is the number of channels, " + "and H and W are the height and width. Note that this is for the NCHW " + "usage. On the other hand, the NHWC Op has a different set of " + "dimension constraints. "); + schema.Input( + 1, + "filter", + "The filter blob that will be used in the " + "convolutions; has size (M x C x kH x kW), where C is the number of " + "channels, and kH and kW are the height and width of the kernel."); + schema.Input( + 2, + "bias", + "The 1D bias blob that is added through the " + "convolution; has size (M)."); + schema.Output(0, "Y", output_doc.c_str()); + }; +} + +OPERATOR_SCHEMA(Int8Conv) + .NumInputs(2, 3) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv) + .CostInferenceFunction(OpSchema::CostInferenceFunctionType( + ConvPoolOpBase<CPUContext>::CostInferenceForConv)) + .FillUsing(ConvDocGenerator("")); + +OPERATOR_SCHEMA(Int8ConvRelu) + .NumInputs(2, 3) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv) + .CostInferenceFunction(OpSchema::CostInferenceFunctionType( + ConvPoolOpBase<CPUContext>::CostInferenceForConv)) + .FillUsing(ConvDocGenerator("", true)); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_conv_op.h b/caffe2/operators/quantized/int8_conv_op.h new file mode 100644 index 0000000000..befa85bc25 --- /dev/null +++ b/caffe2/operators/quantized/int8_conv_op.h @@ -0,0 +1,171 @@ +#ifndef CAFFE2_OPERATORS_INT8_CONV_OP_H_ +#define CAFFE2_OPERATORS_INT8_CONV_OP_H_ + +#include <qnnpack.h> + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/conv_op_shared.h" +#include "caffe2/operators/conv_pool_op_base.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +template <Activation Ac> +class Int8ConvOp final : public ConvPoolOpBase<CPUContext> { + public: + USE_CONV_POOL_BASE_FUNCTIONS(CPUContext); + Int8ConvOp(const OperatorDef& def, Workspace* ws) + : ConvPoolOpBase(def, ws), gemm_context_(ws->GetThreadPool()) { + OPERATOR_NEEDS_FEATURE( + this->order_ == StorageOrder::NHWC, + "Int8Conv only supports NHWC order"); + createSharedBuffer<CPUContext>(ws_); + } + + ~Int8ConvOp() { + 
if (this->qnnpackObject_ != nullptr) { + qnnp_delete_operator(this->qnnpackObject_); + this->qnnpackObject_ = nullptr; + } + } + + bool RunOnDeviceWithOrderNHWC() override { + CAFFE_ENFORCE_EQ(Inputs().size(), 3); + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + const auto& W = Inputs()[1]->template Get<Int8TensorCPU>(); + const auto& B = Inputs()[2]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + const int32_t Y_offset = + this->template GetSingleArgument<int>("Y_zero_point", 0); + double Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + + ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), W.t.dim32(0)); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + + const auto M = W.t.dim(0); + const auto KH = W.t.dim(1); + const auto KW = W.t.dim(2); + const auto KC = W.t.dim(3); + const auto C = X.t.dim32(3); + const bool isDepthwise = this->group_ > 1 && this->group_ == M && + this->group_ == C && KC == 1 && KH * KW == 9 && dilation_w() == 1; + + CHECK_EQ(Y->t.dim32(3), M); + runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) { + initQNNPACK(); + + pthreadpool_t threadpool = + reinterpret_cast<pthreadpool_t>(gemm_context_.threadPool()); + + if (this->qnnpackObject_ == nullptr) { + CAFFE_ENFORCE( + C % this->group_ == 0, + "number of input channels must be divisible by groups count"); + CAFFE_ENFORCE( + M % this->group_ == 0, + "number of output channels must be divisible by groups count"); + const qnnp_status createStatus = qnnp_create_convolution2d_nhwc_q8( + pad_t(), + pad_r(), + pad_b(), + pad_l(), + KH, + KW, + stride_h(), + stride_w(), + dilation_h(), + dilation_w(), + this->group_, + C / this->group_, + M / this->group_, + X.zero_point, + X.scale, + W.zero_point, + W.scale, + W.t.template data<uint8_t>(), + B.t.template data<int32_t>(), + Y->zero_point, + Y->scale, + activationLimits(Y->scale, Y->zero_point, Ac).first, + activationLimits(Y->scale, Y->zero_point, Ac).second, + &this->qnnpackObject_); + CAFFE_ENFORCE( + createStatus == qnnp_status_success, + "failed to create QNNPACK convolution object"); + CAFFE_ENFORCE(this->qnnpackObject_ != nullptr); + } + + uint8_t* inputPtr = X.t.template mutable_data<uint8_t>(); + if ((isDepthwise && this->group_ < 8) || + (!isDepthwise && C / this->group_ < 8)) { + buffer->Resize(std::vector<int64_t>{X.t.size() + 8}); + inputPtr = buffer->template mutable_data<uint8_t>() + 8; + memcpy(inputPtr, X.t.template data<uint8_t>(), X.t.size()); + } + + if (lastBatchSize_ != static_cast<size_t>(X.t.dim(0)) || + lastInputHeight_ != static_cast<size_t>(X.t.dim(1)) || + lastInputWidth_ != static_cast<size_t>(X.t.dim(2)) || + lastInputPointer_ != inputPtr || + lastOutputPointer_ != Y->t.template mutable_data<uint8_t>()) { + const qnnp_status setupStatus = qnnp_setup_convolution2d_nhwc_q8( + this->qnnpackObject_, + X.t.dim(0), + X.t.dim(1), + X.t.dim(2), + inputPtr, + X.t.dim(3) /* input pixel stride */, + Y->t.template mutable_data<uint8_t>(), + Y->t.dim(3) /* output pixel stride */, + nullptr /* threadpool */); + CAFFE_ENFORCE( + setupStatus == qnnp_status_success, + "failed to setup QNNPACK convolution object"); + + lastBatchSize_ = static_cast<size_t>(X.t.dim(0)); + lastInputHeight_ = static_cast<size_t>(X.t.dim(1)); + lastInputWidth_ = static_cast<size_t>(X.t.dim(2)); + lastInputPointer_ = inputPtr; + lastOutputPointer_ = Y->t.template mutable_data<uint8_t>(); + } + +#ifdef FBCODE_CAFFE2 + const qnnp_status runStatus = + qnnp_run_operator(this->qnnpackObject_, 
nullptr /* thread pool */); +#else + const qnnp_status runStatus = + qnnp_run_operator(this->qnnpackObject_, threadpool); +#endif + CAFFE_ENFORCE( + runStatus == qnnp_status_success, + "failed to run QNNPACK convolution"); + }); + return true; + } + + private: + C2GEMMContext gemm_context_; + // QNNPACK convolution object + qnnp_operator_t qnnpackObject_{nullptr}; + // batch size in the previous call to RunOnDeviceWithOrderNHWC + size_t lastBatchSize_{0}; + // input height in the previous call to RunOnDeviceWithOrderNHWC + size_t lastInputHeight_{0}; + // input width in the previous call to RunOnDeviceWithOrderNHWC + size_t lastInputWidth_{0}; + // input pointer in the previous call to RunOnDeviceWithOrderNHWC + const void* lastInputPointer_{nullptr}; + // output pointer in the previous call to RunOnDeviceWithOrderNHWC + void* lastOutputPointer_{nullptr}; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_CONV_OP_H_ diff --git a/caffe2/operators/quantized/int8_conv_transpose_op.cc b/caffe2/operators/quantized/int8_conv_transpose_op.cc new file mode 100644 index 0000000000..6431852567 --- /dev/null +++ b/caffe2/operators/quantized/int8_conv_transpose_op.cc @@ -0,0 +1,49 @@ +#include "caffe2/operators/quantized/int8_conv_transpose_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8ConvTranspose, int8::Int8ConvTransposeOp); + +OPERATOR_SCHEMA(Int8ConvTranspose) + .NumInputs(2, 3) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .SetDoc(R"DOC( +The transposed convolution consumes an input vector, the filter blob, and +the bias blob, and computes the output. Note that other parameters, such as +the stride and kernel size, or the pads' sizes in each direction are not +necessary for input because they are provided by the +ConvTransposeUnpoolOpBase operator. Various dimension checks are done +implicitly, and the sizes are specified in the Input docs for this operator. +As is expected, the filter is deconvolved with a subset of the +image and the bias is added; this is done throughout the image data and the +output is computed. As a side note on the implementation layout: +conv_transpose_op_impl.h is the templated implementation of the +conv_transpose_op.h file, which is why they are separate files. + )DOC") + .Input( + 0, + "X", + "Input data blob from previous layer; has size " + "(N x H x W x C), where N is the batch size, C is the number of channels, and" + " H and W are the height and width. Note that NHWC is supported now") + .Input( + 1, + "filter", + "The filter blob that will be used in the transposed " + "convolution; has size (M x kH x kW x C), where C is the number of channels," + " and kH and kW are the height and width of the kernel.") + .Input( + 2, + "bias", + "The 1D bias blob that is added through the convolution;" + "has size (C). Optional, if not passed, will treat it as all 0.") + .Output( + 0, + "Y", + "Output data blob that contains the result of the " + "transposed convolution. 
The output dimensions are functions of the kernel" + " size, stride size, and pad lengths."); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_conv_transpose_op.h b/caffe2/operators/quantized/int8_conv_transpose_op.h new file mode 100644 index 0000000000..e3c78df9a8 --- /dev/null +++ b/caffe2/operators/quantized/int8_conv_transpose_op.h @@ -0,0 +1,169 @@ +#ifndef CAFFE2_OPERATORS_INT8_CONV_TRANSPOSE_OP_H_ +#define CAFFE2_OPERATORS_INT8_CONV_TRANSPOSE_OP_H_ + +#include <qnnpack.h> + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/conv_op_shared.h" +#include "caffe2/operators/conv_transpose_unpool_op_base.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +class Int8ConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> { + public: + USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(CPUContext); + Int8ConvTransposeOp(const OperatorDef& def, Workspace* ws) + : ConvTransposeUnpoolBase(def, ws), gemm_context_(ws->GetThreadPool()) { + OPERATOR_NEEDS_FEATURE( + this->order_ == StorageOrder::NHWC, + "Int8ConvTransposeOp only supports NHWC order"); + createSharedBuffer<CPUContext>(ws_); + } + + ~Int8ConvTransposeOp() { + if (this->qnnpackObject_ != nullptr) { + qnnp_delete_operator(this->qnnpackObject_); + this->qnnpackObject_ = nullptr; + } + } + + bool RunOnDeviceWithOrderNHWC() override { + CAFFE_ENFORCE_EQ(Inputs().size(), 3); + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + const auto& W = Inputs()[1]->template Get<Int8TensorCPU>(); + const auto& B = Inputs()[2]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + const auto X_offset = -X.zero_point; + const auto W_offset = -W.zero_point; + const int32_t Y_offset = + this->template GetSingleArgument<int>("Y_zero_point", 0); + double Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + + const auto N = X.t.dim(0); + const auto IH = X.t.dim(1); + const auto IW = X.t.dim(2); + const auto IC = X.t.dim(3); + + CHECK_EQ(IC, W.t.dim(0)); + const auto KH = W.t.dim(1); + const auto KW = W.t.dim(2); + const auto OC = W.t.dim(3); + + ConvTransposeUnpoolBase<CPUContext>::SetOutputSize(X.t, &(Y->t), OC); + CHECK_EQ(OC, Y->t.dim(3)); + + runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) { + initQNNPACK(); + + pthreadpool_t threadpool = + reinterpret_cast<pthreadpool_t>(gemm_context_.threadPool()); + + if (this->qnnpackObject_ == nullptr) { + const qnnp_status createStatus = qnnp_create_deconvolution2d_nhwc_q8( + pad_t(), + pad_r(), + pad_b(), + pad_l(), + adj_h(), + adj_w(), + KH, + KW, + stride_h(), + stride_w(), + 1 /* dilation height */, + 1 /* dilation width */, + 1 /* groups */, + IC, + OC, + X.zero_point, + X.scale, + W.zero_point, + W.scale, + W.t.template data<uint8_t>(), + B.t.template data<int32_t>(), + Y->zero_point, + Y->scale, + std::numeric_limits<uint8_t>::min(), + std::numeric_limits<uint8_t>::max(), + &this->qnnpackObject_); + CAFFE_ENFORCE( + createStatus == qnnp_status_success, + "failed to create QNNPACK convolution object"); + CAFFE_ENFORCE(this->qnnpackObject_ != nullptr); + } + + uint8_t* inputPtr = X.t.template mutable_data<uint8_t>(); + if (IC < 8) { + buffer->Resize(std::vector<int64_t>{X.t.size() + 8}); + inputPtr = buffer->template mutable_data<uint8_t>() + 8; + memcpy(inputPtr, X.t.template data<uint8_t>(), X.t.size()); + } + + if 
(lastBatchSize_ != static_cast<size_t>(X.t.dim(0)) || + lastInputHeight_ != static_cast<size_t>(X.t.dim(1)) || + lastInputWidth_ != static_cast<size_t>(X.t.dim(2)) || + lastInputPointer_ != inputPtr || + lastOutputPointer_ != Y->t.template mutable_data<uint8_t>()) { + const qnnp_status setupStatus = qnnp_setup_deconvolution2d_nhwc_q8( + this->qnnpackObject_, + X.t.dim(0), + X.t.dim(1), + X.t.dim(2), + inputPtr, + X.t.dim(3) /* input pixel stride */, + Y->t.template mutable_data<uint8_t>(), + Y->t.dim(3) /* output pixel stride */, + nullptr /* threadpool */); + CAFFE_ENFORCE( + setupStatus == qnnp_status_success, + "failed to setup QNNPACK convolution object"); + + lastBatchSize_ = static_cast<size_t>(X.t.dim(0)); + lastInputHeight_ = static_cast<size_t>(X.t.dim(1)); + lastInputWidth_ = static_cast<size_t>(X.t.dim(2)); + lastInputPointer_ = inputPtr; + lastOutputPointer_ = Y->t.template mutable_data<uint8_t>(); + } + +#ifdef FBCODE_CAFFE2 + const qnnp_status runStatus = + qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); +#else + const qnnp_status runStatus = + qnnp_run_operator(this->qnnpackObject_, threadpool); +#endif + CAFFE_ENFORCE( + runStatus == qnnp_status_success, + "failed to run QNNPACK convolution"); + }); + return true; + } + + private: + C2GEMMContext gemm_context_; + // QNNPACK convolution object + qnnp_operator_t qnnpackObject_{nullptr}; + // batch size in the previous call to RunOnDeviceWithOrderNHWC + size_t lastBatchSize_{0}; + // input height in the previous call to RunOnDeviceWithOrderNHWC + size_t lastInputHeight_{0}; + // input width in the previous call to RunOnDeviceWithOrderNHWC + size_t lastInputWidth_{0}; + // input pointer in the previous call to RunOnDeviceWithOrderNHWC + const void* lastInputPointer_{nullptr}; + // output pointer in the previous call to RunOnDeviceWithOrderNHWC + void* lastOutputPointer_{nullptr}; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_CONV_TRANSPOSE_OP_H_ diff --git a/caffe2/operators/quantized/int8_dequantize_op.cc b/caffe2/operators/quantized/int8_dequantize_op.cc new file mode 100644 index 0000000000..f000b8f06c --- /dev/null +++ b/caffe2/operators/quantized/int8_dequantize_op.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/quantized/int8_dequantize_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Dequantize, int8::Int8DequantizeOp); + +OPERATOR_SCHEMA(Int8Dequantize) + .IdenticalTypeAndShape() + .NumInputs(1) + .NumOutputs(1) + .Input(0, "qX", "Int8 Tensor qX.") + .Output(0, "Y", "FP32 Tensor that represents mapped real value of qX."); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_dequantize_op.h b/caffe2/operators/quantized/int8_dequantize_op.h new file mode 100644 index 0000000000..bb6c3421e5 --- /dev/null +++ b/caffe2/operators/quantized/int8_dequantize_op.h @@ -0,0 +1,52 @@ +#ifndef CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_ +#define CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +void Int8Dequantize( + const uint8_t* in, + float* out, + const int64_t N, + const float X_scale, + const int32_t X_offset) { + for (auto i = 0; i < N; ++i) { + out[i] = (static_cast<int32_t>(in[i]) - X_offset) * X_scale; + } +} + +} // namespace + +class Int8DequantizeOp final : public Operator<CPUContext> { + public: + using Operator<CPUContext>::Operator; 
+ + bool RunOnDevice() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Output(0); + Y->ResizeLike(X.t); + int32_t X_offset = X.zero_point; + auto X_scale = X.scale; + Int8Dequantize( + X.t.data<uint8_t>(), + Y->mutable_data<float>(), + X.t.size(), + X_scale, + X_offset); + return true; + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_ diff --git a/caffe2/operators/quantized/int8_fc_op.cc b/caffe2/operators/quantized/int8_fc_op.cc new file mode 100644 index 0000000000..ee7d6059ee --- /dev/null +++ b/caffe2/operators/quantized/int8_fc_op.cc @@ -0,0 +1,41 @@ +#include "caffe2/operators/quantized/int8_fc_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8FC, int8::Int8FCOp); + +OPERATOR_SCHEMA(Int8FC) + .NumInputs(3) + .NumOutputs(1) + .SetDoc(R"DOC( +Computes the result of passing an input vector X into a fully +connected layer with 2D weight matrix W and 1D bias vector b. That is, +the layer computes Y = X * W^T + b, where X has size (M x K), +W has size (N x K), b has size (N), and Y has size (M x N), +where M is often the batch size. + + +NOTE: X does not need to explicitly be a 2D vector; rather, it will be +coerced into one. For an arbitrary n-dimensional tensor +X \in [a_0, a_1 * ... * a_{n-1}]. Only this case is supported! +Lastly, even though b is a 1D vector of size N, it is copied/resized to +be size (M x N) implicitly and added to each vector in the batch. +Each of these dimensions must be matched correctly, or else the operator +will throw errors. +)DOC") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .Input( + 0, + "X", + "input tensor that's coerced into a 2D matrix of size (MxK) " + "as described above") + .Input( + 1, + "W", + "A tensor that is coerced into a 2D blob of size (KxN) " + "containing fully connected weight matrix") + .Input(2, "b", "1D blob containing bias vector") + .Output(0, "Y", "2D output tensor"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_fc_op.h b/caffe2/operators/quantized/int8_fc_op.h new file mode 100644 index 0000000000..963dd9bd56 --- /dev/null +++ b/caffe2/operators/quantized/int8_fc_op.h @@ -0,0 +1,133 @@ +#ifndef CAFFE2_OPERATORS_INT8_FC_OP_H_ +#define CAFFE2_OPERATORS_INT8_FC_OP_H_ + +#include <qnnpack.h> + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/conv_op_shared.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +class Int8FCOp final : public Operator<CPUContext> { + public: + Int8FCOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws), + ws_(ws), + gemm_context_(ws->GetThreadPool()) { + createSharedBuffer<CPUContext>(ws_); + } + + ~Int8FCOp() { + if (this->qnnpackObject_ != nullptr) { + qnnp_delete_operator(this->qnnpackObject_); + this->qnnpackObject_ = nullptr; + } + } + + bool RunOnDevice() override { + const auto& X = Inputs()[0]->Get<Int8TensorCPU>(); + const auto& W = Inputs()[1]->Get<Int8TensorCPU>(); + const auto& B = Inputs()[2]->Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->GetMutable<Int8TensorCPU>(); + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + // (NxHxW)xC == MxK x (NxK) -> MxN + const auto K = 
X.t.size_from_dim(1); + const auto N = W.t.dim(0); + CHECK_EQ(K, W.t.dim(1)); + CHECK_EQ(N, B.t.size()); + const auto M = X.t.size() / K; + Y->t.Resize(M, N); + + runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) { + initQNNPACK(); + + pthreadpool_t threadpool = + reinterpret_cast<pthreadpool_t>(gemm_context_.threadPool()); + + if (this->qnnpackObject_ == nullptr) { + const qnnp_status createStatus = qnnp_create_fully_connected_nc_q8( + K, + N, + X.zero_point, + X.scale, + W.zero_point, + W.scale, + W.t.template data<uint8_t>(), + B.t.template data<int32_t>(), + Y->zero_point, + Y->scale, + std::numeric_limits<uint8_t>::min(), + std::numeric_limits<uint8_t>::max(), + &this->qnnpackObject_); + CAFFE_ENFORCE( + createStatus == qnnp_status_success, + "failed to create QNNPACK fully connected operator"); + CAFFE_ENFORCE(this->qnnpackObject_ != nullptr); + } + + uint8_t* inputPtr = X.t.template mutable_data<uint8_t>(); + if (K < 8) { + buffer->Resize(std::vector<int64_t>{X.t.size() + 8}); + inputPtr = buffer->template mutable_data<uint8_t>() + 8; + memcpy(inputPtr, X.t.template data<uint8_t>(), X.t.size()); + } + + if (lastBatchSize_ != static_cast<size_t>(M) || + lastInputPointer_ != inputPtr || + lastOutputPointer_ != Y->t.template mutable_data<uint8_t>()) { + const qnnp_status setupStatus = qnnp_setup_fully_connected_nc_q8( + this->qnnpackObject_, + M, + inputPtr, + K /* input stride */, + Y->t.template mutable_data<uint8_t>(), + N /* output stride */, + nullptr /* threadpool */); + CAFFE_ENFORCE( + setupStatus == qnnp_status_success, + "failed to setup QNNPACK fully connected operator"); + + lastBatchSize_ = static_cast<size_t>(M); + lastInputPointer_ = inputPtr; + lastOutputPointer_ = Y->t.template mutable_data<uint8_t>(); + } + +#ifdef FBCODE_CAFFE2 + const qnnp_status runStatus = + qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); +#else + const qnnp_status runStatus = + qnnp_run_operator(this->qnnpackObject_, threadpool); +#endif + CAFFE_ENFORCE( + runStatus == qnnp_status_success, "failed to run QNNPACK operator"); + }); + return true; + } + + private: + Workspace* ws_; + C2GEMMContext gemm_context_; + // QNNPACK convolution object + qnnp_operator_t qnnpackObject_{nullptr}; + // batch size in the previous call to RunOnDeviceWithOrderNHWC + size_t lastBatchSize_{0}; + // input pointer in the previous call to RunOnDeviceWithOrderNHWC + const void* lastInputPointer_{nullptr}; + // output pointer in the previous call to RunOnDeviceWithOrderNHWC + void* lastOutputPointer_{nullptr}; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_FC_OP_H_ diff --git a/caffe2/operators/quantized/int8_flatten_op.cc b/caffe2/operators/quantized/int8_flatten_op.cc new file mode 100644 index 0000000000..a9d5a5ede2 --- /dev/null +++ b/caffe2/operators/quantized/int8_flatten_op.cc @@ -0,0 +1,30 @@ +#include "caffe2/operators/quantized/int8_flatten_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Flatten, int8::Int8FlattenOp); + +OPERATOR_SCHEMA(Int8Flatten) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Flattens the input tensor into a 2D matrix. If input tensor has shape +(d_0, d_1, ... d_n) then the output will have shape +(d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... 
X dn) +)DOC") + .Input(0, "input", "A Int8 tensor of rank >= axis.") + .Output( + 0, + "output", + "A 2D Int8 tensor with the contents of the input tensor, " + "with input dimensions up to axis flattened to the outer dimension " + "of the output and remaining input dimensions flattened into the inner " + "dimension of the output.") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .Arg( + "axis", + "(Default to 1) Indicate up to which input dimensions " + "(exclusive) should be flattened to the outer dimension of the output"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_flatten_op.h b/caffe2/operators/quantized/int8_flatten_op.h new file mode 100644 index 0000000000..6aacb2ac7a --- /dev/null +++ b/caffe2/operators/quantized/int8_flatten_op.h @@ -0,0 +1,47 @@ +#ifndef CAFFE2_OPERATORS_INT8_FLATTEN_OP_H_ +#define CAFFE2_OPERATORS_INT8_FLATTEN_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +class Int8FlattenOp : public Operator<CPUContext> { + public: + Int8FlattenOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws), + axis_(this->template GetSingleArgument<int>("axis", 1)) {} + + bool RunOnDevice() override { + const auto& X = Inputs()[0]->Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->GetMutable<Int8TensorCPU>(); + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + CAFFE_ENFORCE_GE( + X.t.sizes().size(), axis_, "The rank of the tensor must be >= axis."); + Y->t.Resize(X.t.size_to_dim(axis_), X.t.size_from_dim(axis_)); + context_.CopyItemsToCPU( + X.t.meta(), + X.t.size(), + X.t.raw_data(), + Y->t.raw_mutable_data(X.t.meta())); + return true; + } + + private: + int axis_; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_FLATTEN_OP_H_ diff --git a/caffe2/operators/quantized/int8_given_tensor_fill_op.cc b/caffe2/operators/quantized/int8_given_tensor_fill_op.cc new file mode 100644 index 0000000000..4840b4880b --- /dev/null +++ b/caffe2/operators/quantized/int8_given_tensor_fill_op.cc @@ -0,0 +1,32 @@ +#include "int8_given_tensor_fill_op.h" + +namespace caffe2 { + +OPERATOR_SCHEMA(Int8GivenTensorFill) + .NumInputs(0) + .NumOutputs(1) + .Arg("value", "Input array of type char(byte)") + .Arg("shape", "Input tensor shape") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .SetDoc(R"DOC( + Creates quantized tensor of type char(byte) with scale and zero point info. +)DOC") + .Output(0, "Tensor", "An Int8TensorCPU with scale and zero point info"); + +OPERATOR_SCHEMA(Int8GivenIntTensorFill) + .NumInputs(0) + .NumOutputs(1) + .Arg("value", "Input array of type int32") + .Arg("shape", "Input tensor shape") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .SetDoc(R"DOC( + Creates quantized tensor of type int32 with scale and zero point info. 
+)DOC") + .Output(0, "Tensor", "An Int8TensorCPU with scale and zero point info"); + +REGISTER_CPU_OPERATOR(Int8GivenTensorFill, int8::Int8GivenTensorFillOp); +REGISTER_CPU_OPERATOR(Int8GivenIntTensorFill, int8::Int8GivenIntTensorFillOp); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_given_tensor_fill_op.h b/caffe2/operators/quantized/int8_given_tensor_fill_op.h new file mode 100644 index 0000000000..5352844db6 --- /dev/null +++ b/caffe2/operators/quantized/int8_given_tensor_fill_op.h @@ -0,0 +1,114 @@ +#ifndef CAFFE2_OPERATORS_INT8_GIVEN_TENSOR_FILL_OP_H_ +#define CAFFE2_OPERATORS_INT8_GIVEN_TENSOR_FILL_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/filler_op.h" +#include "caffe2/utils/cast.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { +namespace int8 { + +class Int8GivenTensorFillOp final : public Operator<CPUContext> { + public: + Int8GivenTensorFillOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws), + scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)), + zero_point_( + this->template GetSingleArgument<int32_t>("Y_zero_point", 0)), + shape_(this->template GetRepeatedArgument<int64_t>("shape")) { + ExtractValues(); + } + + bool RunOnDevice() override { + auto* output = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + output->t.Resize(shape_); + output->scale = scale_; + output->zero_point = zero_point_; + return Fill(output); + } + + private: + void ExtractValues() { + auto source_values = this->template GetSingleArgument<string>("values", ""); + values_.Resize(source_values.size()); + uint8_t* values_data = values_.template mutable_data<uint8_t>(); + for (int i = 0; i < source_values.size(); i++) { + values_data[i] = static_cast<uint8_t>(source_values[i]); + } + } + + bool Fill(Int8TensorCPU* output) { + DCHECK_EQ(output->t.size(), values_.size()) + << "output size: " << output->t.size() + << " given size: " << values_.size(); + auto* data = output->t.template mutable_data<uint8_t>(); + const uint8_t* values_data = values_.template data<uint8_t>(); + if (output->t.size()) { + context_.template CopySameDevice<uint8_t>( + output->t.size(), values_data, data); + } + return true; + } + + float scale_; + int32_t zero_point_; + vector<int64_t> shape_; + Tensor values_{CPU}; +}; + +class Int8GivenIntTensorFillOp final : public Operator<CPUContext> { + public: + Int8GivenIntTensorFillOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws), + scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)), + zero_point_( + this->template GetSingleArgument<int32_t>("Y_zero_point", 0)), + shape_(this->template GetRepeatedArgument<int64_t>("shape")) { + ExtractValues(); + } + + bool RunOnDevice() override { + auto* output = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + output->t.Resize(shape_); + output->scale = scale_; + output->zero_point = zero_point_; + return Fill(output); + } + + private: + void ExtractValues() { + auto source_values = this->template GetRepeatedArgument<int32_t>("values"); + values_.Resize(source_values.size()); + auto* values_data = values_.template mutable_data<int32_t>(); + for (int i = 0; i < source_values.size(); i++) { + values_data[i] = static_cast<int32_t>(source_values[i]); + } + } + + bool Fill(Int8TensorCPU* output) { + DCHECK_EQ(output->t.size(), values_.size()) + << "output size: " << 
output->t.size() + << " given size: " << values_.size(); + auto* data = output->t.template mutable_data<int32_t>(); + const auto* values_data = values_.template data<int32_t>(); + if (output->t.size()) { + context_.template CopySameDevice<int32_t>( + output->t.size(), values_data, data); + } + return true; + } + + float scale_; + int32_t zero_point_; + vector<int64_t> shape_; + Tensor values_{CPU}; +}; + +} // namespace int8 +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_GIVEN_TENSOR_FILL_OP_H_ diff --git a/caffe2/operators/quantized/int8_leaky_relu_op.cc b/caffe2/operators/quantized/int8_leaky_relu_op.cc new file mode 100644 index 0000000000..b6bde8b78e --- /dev/null +++ b/caffe2/operators/quantized/int8_leaky_relu_op.cc @@ -0,0 +1,24 @@ +#include "caffe2/operators/quantized/int8_leaky_relu_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8LeakyRelu, int8::Int8LeakyReluOp); + +OPERATOR_SCHEMA(Int8LeakyRelu) + .NumInputs(1) + .NumOutputs(1) + .Arg("alpha", "Coefficient of leakage, default value is 0.01") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .AllowInplace({{0, 0}}) + .CostInferenceFunction(PointwiseCostInference<2>) + .IdenticalTypeAndShape() + .SetDoc(R"DOC( +LeakyRelu takes input data (Tensor<T>) and an argument alpha, and produces one +output data (Tensor<T>) where the function `f(x) = alpha * x for x < 0`, +`f(x) = x for x >= 0`, is applied to the data tensor elementwise. +)DOC") + .Input(0, "X", "1D input tensor") + .Output(0, "Y", "1D input tensor"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_leaky_relu_op.h b/caffe2/operators/quantized/int8_leaky_relu_op.h new file mode 100644 index 0000000000..68e6a46a12 --- /dev/null +++ b/caffe2/operators/quantized/int8_leaky_relu_op.h @@ -0,0 +1,64 @@ +#ifndef CAFFE2_OPERATORS_INT8_LEAKY_RELU_OP_H_ +#define CAFFE2_OPERATORS_INT8_LEAKY_RELU_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +class Int8LeakyReluOp final : public Operator<CPUContext> { + public: + Int8LeakyReluOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws) { + double alpha = this->template GetSingleArgument<float>("alpha", 0.01); + CAFFE_ENFORCE_GT(alpha, 0.0); + CAFFE_ENFORCE_LT(alpha, 1.0); + QuantizeMultiplierSmallerThanOne(alpha, &multiplier_, &shift_); + } + + bool RunOnDevice() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + Y->t.ResizeLike(X.t); + Y->scale = X.scale; + Y->zero_point = X.zero_point; + CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min()); + CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max()); + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + + const uint8_t* Xdata = X.t.data<uint8_t>(); + uint8_t* Ydata = Y->t.mutable_data<uint8_t>(); + + // For x < zero_point: + // (y - zero_point) * scale = alpha * (x - zero_point) * scale + // y = alpha * (x - zeropoint) + zero_point + for (int i = 0; i < X.t.size(); i++) { + if (Xdata[i] < X.zero_point) { + int32_t out = MultiplyByQuantizedMultiplierSmallerThanOne( + Xdata[i] - X.zero_point, multiplier_, shift_) + + X.zero_point; 
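// A rough worked example of the line above, with hypothetical parameters
// scale = 0.1, zero_point = 128, alpha = 0.01: Xdata[i] = 28 represents
// (28 - 128) * 0.1 = -10.0; the float LeakyRelu would produce -0.1, which
// quantizes back to round(-0.1 / 0.1) + 128 = 127. The fixed-point path
// computes approximately round(0.01 * (28 - 128)) + 128 = -1 + 128 = 127,
// matching the float reference up to rounding.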
+ Ydata[i] = static_cast<uint8_t>(out); + } else { + Ydata[i] = Xdata[i]; + } + } + return true; + } + + private: + int32_t multiplier_; + int shift_; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_LEAKY_RELU_OP_H_ diff --git a/caffe2/operators/quantized/int8_max_pool_op.cc b/caffe2/operators/quantized/int8_max_pool_op.cc new file mode 100644 index 0000000000..64a1507b84 --- /dev/null +++ b/caffe2/operators/quantized/int8_max_pool_op.cc @@ -0,0 +1,63 @@ +#include "caffe2/operators/quantized/int8_max_pool_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8MaxPool, int8::Int8MaxPoolOp<int8::Activation::NONE>); +REGISTER_CPU_OPERATOR( + Int8MaxPoolRelu, + int8::Int8MaxPoolOp<int8::Activation::RELU>); + +const char kMaxPoolDoc_int8[] = R"DOC( +consumes an input blob X and applies max pooling across the +the blob according to kernel sizes, stride sizes, and pad lengths defined by the +ConvPoolOpBase operator. Max pooling consisting of taking the maximum value of a +subset of the input tensor according to the kernel size and downsampling the +data into the output blob Y for further processing. +)DOC"; + +std::function<void(OpSchema&)> MaxPoolDocGenerator( + const char* dim, + bool relu_fused = false) { + auto suffix = relu_fused ? " Output will go through rectified linear " + "function, where y = max(0, x)." + : ""; + return [=](OpSchema& schema) { + string doc = "MaxPool{dim} {pool_doc}"; + c10::ReplaceAll(doc, "{dim}", dim); + c10::ReplaceAll(doc, "{pool_doc}", kMaxPoolDoc_int8); + string output_doc = + "Output data tensor from max pooling across the input " + "tensor. Dimensions will vary based on various kernel, stride, and pad " + "sizes.{suffix}"; + c10::ReplaceAll(output_doc, "{suffix}", suffix); + schema.SetDoc(doc); + schema.Input( + 0, + "X", + "Input data tensor from the previous operator; dimensions depend on " + "whether the NCHW or NHWC operators are being used. For example, in " + "the former, the input has size (N x C x H x W), where N is the batch " + "size, C is the number of channels, and H and W are the height and the " + "width of the data. 
The corresponding permutation of dimensions is " + "used in the latter case."); + schema.Output(0, "Y", output_doc.c_str()); + }; +} + +OPERATOR_SCHEMA(Int8MaxPool) + .NumInputs(1) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) + .FillUsing(MaxPoolDocGenerator("")); + +OPERATOR_SCHEMA(Int8MaxPoolRelu) + .NumInputs(1) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) + .FillUsing(MaxPoolDocGenerator("", true)); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_max_pool_op.h b/caffe2/operators/quantized/int8_max_pool_op.h new file mode 100644 index 0000000000..26f2fa3d45 --- /dev/null +++ b/caffe2/operators/quantized/int8_max_pool_op.h @@ -0,0 +1,183 @@ +#ifndef CAFFE2_OPERATORS_INT8_MAX_POOL_OP_H_ +#define CAFFE2_OPERATORS_INT8_MAX_POOL_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/conv_pool_op_base.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +/* + * Implementation based on TensorFlow Lite kernels: + * - Repo: https://github.com/tensorflow/tensorflow + * - Path: tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h + * - Hash: d4ad9c73969c45d1a224ebfc43eb645b9860216b + */ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +void Int8MaxPool( + const uint8_t* input_data, + at::IntList input_dims, + int stride_width, + int stride_height, + int pad_width, + int pad_height, + int filter_width, + int filter_height, + uint8_t* output_data, + at::IntList output_dims, + uint8_t output_activation_min, + uint8_t output_activation_max) { + const int batches = input_dims[0]; + const int depth = input_dims[3]; + const int input_height = input_dims[1]; + const int input_width = input_dims[2]; + const int output_height = output_dims[1]; + const int output_width = output_dims[2]; + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = + std::min(filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + // 2048 required by Inception v3 + static constexpr int kAccBufferMaxSize = 2048; + CHECK_LE(depth, kAccBufferMaxSize); + uint8_t acc[kAccBufferMaxSize]; + memset(acc, 0, depth * sizeof(acc[0])); + + const uint8_t* input_ptr = + &input_data + [in_x_origin * depth + in_y_origin * input_width * depth + + batch * input_height * input_width * depth]; + + for (int fy = filter_y_start; fy < filter_y_end; fy++) { + const uint8_t* input_row_ptr = + &input_ptr[fy * input_width * depth + filter_x_start * depth]; + + for (int fx = filter_x_start; fx < filter_x_end; fx++) { + int channel = 0; +#ifdef INT8_NEON_SIMD + for (; channel <= depth - 16; channel += 16) { + uint8x16_t acc_reg = vld1q_u8(acc + channel); + uint8x16_t input_reg = vld1q_u8(input_row_ptr); + input_row_ptr += 16; + acc_reg = vmaxq_u8(acc_reg, input_reg); + vst1q_u8(acc + channel, acc_reg); + } + + for (; channel <= depth - 8; channel += 8) { + uint8x8_t acc_reg = vld1_u8(acc + channel); + uint8x8_t input_reg = vld1_u8(input_row_ptr); + input_row_ptr += 8; + acc_reg = vmax_u8(acc_reg, input_reg); + vst1_u8(acc + channel, acc_reg); + } +#endif + for (; channel < depth; ++channel) { + acc[channel] = std::max(acc[channel], *input_row_ptr++); + } + } + } + uint8_t* output_ptr = + &output_data + [out_x * depth + out_y * output_width * depth + + batch * output_height * output_width * depth]; + int channel = 0; +#ifdef INT8_NEON_SIMD + for (; channel <= depth - 16; channel += 16) { + uint8x16_t a = vld1q_u8(acc + channel); + a = vminq_u8(a, vdupq_n_u8(output_activation_max)); + a = vmaxq_u8(a, vdupq_n_u8(output_activation_min)); + vst1q_u8(output_ptr + channel, a); + } + for (; channel <= depth - 8; channel += 8) { + uint8x8_t a = vld1_u8(acc + channel); + a = vmin_u8(a, vdup_n_u8(output_activation_max)); + a = vmax_u8(a, vdup_n_u8(output_activation_min)); + vst1_u8(output_ptr + channel, a); + } +#endif + for (; channel < depth; ++channel) { + uint8_t a = acc[channel]; + a = std::max<uint8_t>(a, output_activation_min); + a = std::min<uint8_t>(a, output_activation_max); + output_ptr[channel] = static_cast<uint8_t>(a); + } + } + } + } +} + +} // namespace + +template <Activation Ac> +class Int8MaxPoolOp final : public ConvPoolOpBase<CPUContext> { + public: + Int8MaxPoolOp(const OperatorDef& operator_def, Workspace* ws) + : ConvPoolOpBase<CPUContext>(operator_def, ws) 
{ + OPERATOR_NEEDS_FEATURE( + this->order_ == StorageOrder::NHWC, "Int8 only supports NCHW order."); + } + + bool RunOnDeviceWithOrderNHWC() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + Y->scale = X.scale; + Y->zero_point = X.zero_point; + const int32_t Y_offset = + this->template GetSingleArgument<int>("Y_zero_point", 0); + const float Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + + CHECK_EQ(X.t.ndim(), 4); + const int height = X.t.dim32(1); + const int width = X.t.dim32(2); + const int channels = X.t.dim32(3); + ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), channels); + + Int8MaxPool( + X.t.template data<uint8_t>(), + X.t.sizes(), + stride_w(), + stride_h(), + pad_l(), + pad_t(), + kernel_w(), + kernel_h(), + Y->t.template mutable_data<uint8_t>(), + Y->t.sizes(), + activationLimits(Y->scale, Y->zero_point, Ac).first, + activationLimits(Y->scale, Y->zero_point, Ac).second); + return true; + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_MAX_POOL_OP_H_ diff --git a/caffe2/operators/quantized/int8_quantize_op.cc b/caffe2/operators/quantized/int8_quantize_op.cc new file mode 100644 index 0000000000..cbabe4613c --- /dev/null +++ b/caffe2/operators/quantized/int8_quantize_op.cc @@ -0,0 +1,16 @@ +#include "caffe2/operators/quantized/int8_quantize_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Quantize, int8::Int8QuantizeOp); + +OPERATOR_SCHEMA(Int8Quantize) + .IdenticalTypeAndShape() + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .NumInputs(1) + .NumOutputs(1) + .Input(0, "X", "FP32 Tensor X.") + .Output(0, "Y", "Int8 Tensor qX representing X with linear quantization."); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_quantize_op.h b/caffe2/operators/quantized/int8_quantize_op.h new file mode 100644 index 0000000000..53a5ce2dfc --- /dev/null +++ b/caffe2/operators/quantized/int8_quantize_op.h @@ -0,0 +1,91 @@ +#ifndef CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_ +#define CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +void Int8Quantize( + const float* in, + uint8_t* out, + const int64_t N, + const float Y_scale, + const int32_t Y_offset) { + const float inv_scale = 1.0f / Y_scale; + uint32_t i = 0; +#ifdef INT8_NEON_SIMD + const float32x4_t vinv_scale = vdupq_n_f32(inv_scale); + // magic float and magic int to take care of rounding + // int magic_round(float f): interpret_int32(f + 12582912.0f) - 0x4B400000 + // Some detail: + // 12582912.0f is 2**23 + 2**22. The trick is based on the fact that when you + // add a small number to a large number, the result rounds to the precision of + // the least significant bit of the large number. For IEEE-754 + // single-precision number mantissa has 23 bits, and adding 2**23 would cause + // rounding to the nearest even integer. The we cast to int and subtract the + // same number (0x4B400000 is the integer representation of 12582912.0f) to + // get only the mantissa. This works if -2**22 < x < 2**22, but preserves the + // sign for negative numbers. 
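// A rough worked example of the trick above, with hypothetical values
// Y_scale = 0.5 (so inv_scale = 2.0f) and Y_offset = 0: an input of 3.4f
// gives 3.4f * 2.0f = 6.8f; 6.8f + 12582912.0f rounds to 12582919.0f, whose
// bit pattern is 0x4B400007; subtracting 0x4B400000 (folded together with
// Y_offset into voffset below) leaves 7 == round(6.8), the desired
// quantized value before saturating to [0, 255].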
+ const int32x4_t voffset = vdupq_n_s32(Y_offset - 0x4B400000); + const float32x4_t vmagic_float = vdupq_n_f32(12582912.0f); + for (i = 0; i + 8 < N; i += 8) { + const float32x4_t vin0123 = vld1q_f32(in); + in += 4; + const float32x4_t vin4567 = vld1q_f32(in); + in += 4; + const int32x4_t vraw0123 = vaddq_s32( + voffset, + vreinterpretq_s32_f32( + vaddq_f32(vmagic_float, vmulq_f32(vin0123, vinv_scale)))); + const int32x4_t vraw4567 = vaddq_s32( + voffset, + vreinterpretq_s32_f32( + vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale)))); + const int16x8_t vraw01234567 = + vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567)); + const uint8x8_t vout01234567 = vqmovun_s16(vraw01234567); + vst1_u8(out, vout01234567); + out += 8; + } +#endif + for (; i < N; ++i) { + (*out++) = QuantizeUint8(Y_scale, Y_offset, (*in++)); + } +} + +} // namespace + +class Int8QuantizeOp final : public Operator<CPUContext> { + public: + using Operator<CPUContext>::Operator; + + bool RunOnDevice() override { + const auto& X = Input(0); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + Y->t.ResizeLike(X); + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + Int8Quantize( + X.data<float>(), + Y->t.mutable_data<uint8_t>(), + X.size(), + Y_scale, + Y_offset); + return true; + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_ diff --git a/caffe2/operators/quantized/int8_relu_op.cc b/caffe2/operators/quantized/int8_relu_op.cc new file mode 100644 index 0000000000..f069d45100 --- /dev/null +++ b/caffe2/operators/quantized/int8_relu_op.cc @@ -0,0 +1,37 @@ +#include "caffe2/operators/quantized/int8_relu_op.h" + +namespace caffe2 { + +namespace { + +OpSchema::Cost CostInferenceForRelu( + const OperatorDef& def, + const vector<TensorShape>& in) { + struct OpSchema::Cost cost = PointwiseCostInference<0>(def, in); + cost.params_bytes = 0; + return cost; +} + +} // namespace + +REGISTER_CPU_OPERATOR(Int8Relu, int8::Int8ReluOp); + +// Input: X, output: Y +OPERATOR_SCHEMA(Int8Relu) + .NumInputs(1) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .AllowInplace({{0, 0}}) + .CostInferenceFunction(CostInferenceForRelu) + .IdenticalTypeAndShape() + .SetDoc(R"DOC( +Relu takes one input data (Tensor<T>) and produces one output data +(Tensor<T>) where the rectified linear function, y = max(0, x), is applied to +the tensor elementwise. 
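For a uint8-quantized input this reduces to clamping in the quantized domain,
y_q = max(x_q, zero_point), since quantized values below the zero point
represent negative reals.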
+)DOC") + .Input(0, "X", "1D input tensor") + .Output(0, "Y", "1D input tensor") + .InheritOnnxSchema("Relu"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_relu_op.h b/caffe2/operators/quantized/int8_relu_op.h new file mode 100644 index 0000000000..9e88bcc75e --- /dev/null +++ b/caffe2/operators/quantized/int8_relu_op.h @@ -0,0 +1,43 @@ +#ifndef CAFFE2_OPERATORS_INT8_RELU_OP_H_ +#define CAFFE2_OPERATORS_INT8_RELU_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" +#include "caffe2/utils/eigen_utils.h" + +namespace caffe2 { + +namespace int8 { + +class Int8ReluOp final : public Operator<CPUContext> { + public: + using Operator<CPUContext>::Operator; + + bool RunOnDevice() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + Y->t.ResizeLike(X.t); + Y->scale = X.scale; + Y->zero_point = X.zero_point; + CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min()); + CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max()); + const int32_t Y_offset = + this->template GetSingleArgument<int>("Y_zero_point", 0); + const float Y_scale = + this->template GetSingleArgument<float>("Y_scale", 1.0f); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + EigenVectorMap<uint8_t>(Y->t.mutable_data<uint8_t>(), X.t.size()) = + ConstEigenVectorMap<uint8_t>(X.t.data<uint8_t>(), X.t.size()) + .cwiseMax(QuantizeUint8(X.scale, X.zero_point, 0)); + return true; + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_RELU_OP_H_ diff --git a/caffe2/operators/quantized/int8_reshape_op.cc b/caffe2/operators/quantized/int8_reshape_op.cc new file mode 100644 index 0000000000..a385313666 --- /dev/null +++ b/caffe2/operators/quantized/int8_reshape_op.cc @@ -0,0 +1,31 @@ +#include "caffe2/operators/quantized/int8_reshape_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Reshape, int8::Int8ReshapeOp); + +OPERATOR_SCHEMA(Int8Reshape) + .NumInputs(1, 2) + .NumOutputs(2) + .AllowInplace({{0, 0}}) + .SetDoc(R"DOC( +Reshape the input tensor similar to numpy.reshape. + +It takes a tensor as input and an optional tensor specifying the new shape. +When the second input is absent, an extra argument `shape` must be specified. +It outputs the reshaped tensor as well as the original shape. + +At most one dimension of the new shape can be -1. In this case, the value is +inferred from the size of the tensor and the remaining dimensions. A dimension +could also be 0, in which case the actual dimension value is going to be copied +from the input tensor. 
+)DOC") + .Arg("shape", "New shape") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .Input(0, "data", "An input tensor.") + .Input(1, "new_shape", "New shape.") + .Output(0, "reshaped", "Reshaped data.") + .Output(1, "old_shape", "Original shape."); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_reshape_op.h b/caffe2/operators/quantized/int8_reshape_op.h new file mode 100644 index 0000000000..0aa2000ea7 --- /dev/null +++ b/caffe2/operators/quantized/int8_reshape_op.h @@ -0,0 +1,47 @@ +#ifndef CAFFE2_OPERATORS_INT8_RESHAPE_OP_H_ +#define CAFFE2_OPERATORS_INT8_RESHAPE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" +#include "caffe2/operators/reshape_op.h" + +namespace caffe2 { + +namespace int8 { + +class Int8ReshapeOp final : public ReshapeOp<uint8_t, CPUContext> { + public: + Int8ReshapeOp(const OperatorDef& operator_def, Workspace* ws) + : ReshapeOp(operator_def, ws) {} + + bool RunOnDevice() override { + if (InputSize() == 2) { + return DispatchHelper<TensorTypes<int, int64_t>>::call(this, Input(1)); + } + CAFFE_ENFORCE( + OperatorBase::HasArgument("shape"), "Argument `shape` is missing."); + return this->template DoRunWithType<int64_t>(); + } + + template <typename T> + bool DoRunWithType() { + auto& X = Inputs()[0]->Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->GetMutable<Int8TensorCPU>(); + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + DoRunWithTypeImpl<T>(X.t, &Y->t); + return true; + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_RESHAPE_OP_H_ diff --git a/caffe2/operators/quantized/int8_resize_nearest_op.cc b/caffe2/operators/quantized/int8_resize_nearest_op.cc new file mode 100644 index 0000000000..fd5e2fd89b --- /dev/null +++ b/caffe2/operators/quantized/int8_resize_nearest_op.cc @@ -0,0 +1,25 @@ +#include "caffe2/operators/quantized/int8_resize_nearest_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8ResizeNearest, int8::Int8ResizeNearestOp); + +// Input: X, output: Y +OPERATOR_SCHEMA(Int8ResizeNearest) + .NumInputs(1) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .Arg("width_scale", "Scale along width dimension") + .Arg("height_scale", "Scale along height dimension") + .SetDoc(R"DOC( +Resizes the spatial dimensions of the input using nearest neighbor +interpolation. 
The `width_scale` and `height_scale` arguments +control the size of the output, which is given by: +output_width = floor(input_width * width_scale) +output_height = floor(output_height * height_scale) +)DOC") + .Input(0, "X", "Input Int8 tensor") + .Output(0, "Y", "Output Int8 tensor"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_resize_nearest_op.h b/caffe2/operators/quantized/int8_resize_nearest_op.h new file mode 100644 index 0000000000..eebcc33064 --- /dev/null +++ b/caffe2/operators/quantized/int8_resize_nearest_op.h @@ -0,0 +1,72 @@ +#ifndef CAFFE2_OPERATORS_INT8_RESIZE_NEAREST_OP_H_ +#define CAFFE2_OPERATORS_INT8_RESIZE_NEAREST_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +namespace int8 { + +class Int8ResizeNearestOp final : public Operator<CPUContext> { + public: + Int8ResizeNearestOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws) { + width_scale_ = this->template GetSingleArgument<float>("width_scale", 1); + height_scale_ = this->template GetSingleArgument<float>("height_scale", 1); + CAFFE_ENFORCE_GT(width_scale_, 0); + CAFFE_ENFORCE_GT(height_scale_, 0); + } + + bool RunOnDevice() override { + // Assume NHWC layout. + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + + CAFFE_ENFORCE_EQ(4, X.t.ndim()); + const int N = X.t.dim32(0); + const int IH = X.t.dim32(1); + const int IW = X.t.dim32(2); + const int C = X.t.dim32(3); + const int OW = IW * width_scale_; + const int OH = IH * height_scale_; + + Y->t.Resize(N, OH, OW, C); + Y->scale = X.scale; + Y->zero_point = X.zero_point; + + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + + const uint8_t* Xdata = X.t.data<uint8_t>(); + uint8_t* Ydata = Y->t.mutable_data<uint8_t>(); + + for (int n = 0; n < N; ++n) { + for (int y = 0; y < OH; ++y) { + const int in_y = std::min((int)(y / height_scale_), (IH - 1)); + for (int x = 0; x < OW; ++x) { + const int in_x = std::min((int)(x / width_scale_), (IW - 1)); + std::memcpy( + &Ydata[C * x + C * OW * y + C * OW * OH * n], + &Xdata[C * in_x + C * IW * in_y + C * IW * IH * n], + C); + } + } + } + return true; + } + + private: + float width_scale_; + float height_scale_; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_RESIZE_NEAREST_OP_H_ diff --git a/caffe2/operators/quantized/int8_roi_align_op.cc b/caffe2/operators/quantized/int8_roi_align_op.cc new file mode 100644 index 0000000000..2caf91706f --- /dev/null +++ b/caffe2/operators/quantized/int8_roi_align_op.cc @@ -0,0 +1,45 @@ +#include "caffe2/operators/quantized/int8_roi_align_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8RoIAlign, int8::Int8RoIAlignOp); + +OPERATOR_SCHEMA(Int8RoIAlign) + .NumInputs(2) + .NumOutputs(1) + .SetDoc(R"DOC( +Region of Interest (RoI) align operation as used in Mask R-CNN. +)DOC") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .Arg( + "spatial_scale", + "(float) default 1.0; Spatial scale of the input feature map X " + "relative to the input image. E.g., 0.0625 if X has a stride of 16 " + "w.r.t. 
the input image.") + .Arg("pooled_h", "(int) default 1; Pooled output Y's height.") + .Arg("pooled_w", "(int) default 1; Pooled output Y's width.") + .Arg( + "sampling_ratio", + "(int) default -1; number of sampling points in the interpolation grid " + "used to compute the output value of each pooled output bin. If > 0, " + "then exactly sampling_ratio x sampling_ratio grid points are used. If " + "<= 0, then an adaptive number of grid points are used (computed as " + "ceil(roi_width / pooled_w), and likewise for height).") + .Input(0, "X", "4D Int8 Tensor feature map input of shape (N, C, H, W).") + .Input( + 1, + "RoIs", + "2D input of shape (R, 4 or 5) specifying R RoIs " + "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI " + "coordinates are in the coordinate system of the input image. For " + "inputs corresponding to a single image, batch index can be excluded " + "to have just 4 columns.") + .Output( + 0, + "Y", + "4D Int8 Tensor output of shape (R, C, pooled_h, pooled_w). " + "The r-th batch element " + "is a pooled feature map cooresponding to the r-th RoI."); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_roi_align_op.h b/caffe2/operators/quantized/int8_roi_align_op.h new file mode 100644 index 0000000000..93d36166d9 --- /dev/null +++ b/caffe2/operators/quantized/int8_roi_align_op.h @@ -0,0 +1,341 @@ +#ifndef CAFFE2_OPERATORS_INT8_ROI_ALIGN_OP_H_ +#define CAFFE2_OPERATORS_INT8_ROI_ALIGN_OP_H_ + +#include "caffe2/core/common.h" +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/operator_schema.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + uint8_t w1; + uint8_t w2; + uint8_t w3; + uint8_t w4; +}; + +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + float roi_start_h, + float roi_start_w, + float bin_size_h, + float bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector<PreCalc>& pre_calc) { + int pre_calc_index = 0; + // boltnn use a smaller multiplier here. Sometimes w will shrink to 0. 
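// A sketch of the weight quantization, assuming nothing beyond the code
// below: the bilinear weights hy*hx, hy*lx, ly*hx, ly*lx sum to 1.0, and
// scaling them by w_multiplier lets the inner loop stay in uint8/int32
// arithmetic. E.g. (hypothetical) ly = 0.3, lx = 0.6 gives
// w4 = Round(0.3 * 0.6 * 255.0) = 46. The extra factor of 255 (and the
// sampling-grid count) is divided back out later through
// real_multiplier = x_scale / (y_scale * 255.0 * count).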
+ const float w_multiplier = 255.0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const float yy = roi_start_h + ph * bin_size_h + + static_cast<float>(iy + .5f) * bin_size_h / + static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const float xx = roi_start_w + pw * bin_size_w + + static_cast<float>(ix + .5f) * bin_size_w / + static_cast<float>(roi_bin_grid_w); + + float x = xx; + float y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (float)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (float)x_low; + } else { + x_high = x_low + 1; + } + + float ly = y - y_low; + float lx = x - x_low; + float hy = 1. - ly, hx = 1. - lx; + // w are not necessary 1 + uint8_t w1 = static_cast<uint8_t>(Round(hy * hx * w_multiplier)); + uint8_t w2 = static_cast<uint8_t>(Round(hy * lx * w_multiplier)); + uint8_t w3 = static_cast<uint8_t>(Round(ly * hx * w_multiplier)); + uint8_t w4 = static_cast<uint8_t>(Round(ly * lx * w_multiplier)); + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +void ROIAlignForward( + const int nthreads, + const uint8_t* bottom_data, + const float& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const float* bottom_rois, + int roi_cols, + uint8_t* top_data, + const float x_scale, + const float y_scale, + const int32_t x_offset, + const int32_t y_offset, + StorageOrder order) { + DCHECK(roi_cols == 4 || roi_cols == 5); + + int n_rois = nthreads / channels / pooled_width / pooled_height; + + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const float* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = offset_bottom_rois[0]; + offset_bottom_rois++; + } + + // Do not using rounding; this implementation detail is critical + float roi_start_w = offset_bottom_rois[0] * spatial_scale; + float roi_start_h = offset_bottom_rois[1] * spatial_scale; + float roi_end_w = offset_bottom_rois[2] * spatial_scale; + float roi_end_h = offset_bottom_rois[3] * spatial_scale; + + // Force malformed ROIs to be 1x1 + float roi_width = std::max(roi_end_w - roi_start_w, (float)1.); + float roi_height = std::max(roi_end_h - roi_start_h, (float)1.); + float bin_size_h = + static_cast<float>(roi_height) / static_cast<float>(pooled_height); + float bin_size_w = + static_cast<float>(roi_width) / static_cast<float>(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int 
roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const float count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + // calculate multiplier + double real_multiplier = x_scale / (y_scale * 255.0 * count); + int32_t Y_multiplier; + int Y_shift; + QuantizeMultiplierSmallerThanOne(real_multiplier, &Y_multiplier, &Y_shift); + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector<PreCalc> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + const uint8_t* offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + vector<int32_t> acc_buffer(channels, 0); + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + + const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1; + const uint8_t* data_2 = offset_bottom_data + channels * pc.pos2; + const uint8_t* data_3 = offset_bottom_data + channels * pc.pos3; + const uint8_t* data_4 = offset_bottom_data + channels * pc.pos4; + for (int c = 0; c < channels; ++c) { + acc_buffer[c] += (uint32_t)(pc.w1) * (uint32_t)(data_1[c]); + acc_buffer[c] += (uint32_t)(pc.w2) * (uint32_t)(data_2[c]); + acc_buffer[c] += (uint32_t)(pc.w3) * (uint32_t)(data_3[c]); + acc_buffer[c] += (uint32_t)(pc.w4) * (uint32_t)(data_4[c]); + + // w_1..4 are all multiplied by 255.0 + acc_buffer[c] -= x_offset * 255.0; + } + + pre_calc_index += 1; + } + } + int index_nhw = index_n + (ph * pooled_width + pw) * channels; + uint8_t* out_ptr = top_data + index_nhw; + for (int c = 0; c < channels; ++c) { + int32_t a_mul = MultiplyByQuantizedMultiplierSmallerThanOne( + acc_buffer[c], Y_multiplier, Y_shift) + + y_offset; + int32_t clamped_a = + std::min<int32_t>(255, std::max<int32_t>(0, a_mul)); + out_ptr[c] = static_cast<uint8_t>(clamped_a); + } + } // for pw + } // for ph + } // for n +} + +} // namespace + +class Int8RoIAlignOp final : public Operator<CPUContext> { + public: + Int8RoIAlignOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws), + order_(StringToStorageOrder( + this->template GetSingleArgument<string>("order", "NHWC"))), + spatial_scale_( + this->template GetSingleArgument<float>("spatial_scale", 1.)), + pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)), + pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)), + sampling_ratio_( + this->template GetSingleArgument<int>("sampling_ratio", -1)) { + DCHECK_GT(spatial_scale_, 0); + DCHECK_GT(pooled_height_, 0); + DCHECK_GT(pooled_width_, 0); + DCHECK_GE(sampling_ratio_, 0); + // only supports NHWC + CAFFE_ENFORCE(order_ == StorageOrder::NHWC); + } + + bool RunOnDevice() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); // Input, NHWC + auto& R = Input(1); // RoIs + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); // RoI pooled + // calculate multiplier + int32_t 
Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + + if (R.size() == 0) { + // Handle empty rois + Y->t.Resize(0, pooled_height_, pooled_width_, X.t.dim32(3)); + // The following mutable_data calls are needed to allocate the tensors + Y->t.mutable_data<uint8_t>(); + return true; + } + + CAFFE_ENFORCE_EQ(R.ndim(), 2); + // if R has 5 columns, the first column is the index, otherwise 0 + CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5); + + assert(sampling_ratio_ >= 0); + + // only supports NHWC now + Y->t.Resize(R.dim32(0), pooled_height_, pooled_width_, X.t.dim32(3)); + int output_size = Y->t.size(); + + ROIAlignForward( + output_size, + X.t.data<uint8_t>(), + spatial_scale_, + X.t.dim32(3), + X.t.dim32(1), + X.t.dim32(2), + pooled_height_, + pooled_width_, + sampling_ratio_, + R.data<float>(), + R.dim32(1), + Y->t.mutable_data<uint8_t>(), + X.scale, + Y_scale, + X.zero_point, + Y_offset, + order_); + + return true; + } + + protected: + StorageOrder order_; + float spatial_scale_; + int pooled_height_; + int pooled_width_; + int sampling_ratio_; +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_ROI_ALIGN_OP_H_ diff --git a/caffe2/operators/quantized/int8_roi_align_op_test.cc b/caffe2/operators/quantized/int8_roi_align_op_test.cc new file mode 100644 index 0000000000..e00c4aeebe --- /dev/null +++ b/caffe2/operators/quantized/int8_roi_align_op_test.cc @@ -0,0 +1,62 @@ +#include "caffe2/operators/quantized/int8_test_utils.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +TEST(Int8RoIAlign, RoIAlign) { + const int N = 2; + const int C = 3; + const int H = 100; + const int W = 110; + auto XQ = q({N, H, W, C}); + XQ->scale = 0.01f; + XQ->zero_point = 127; + auto X = dq(*XQ); + const int n_rois = 10; + Workspace ws; + vector<float> rois_array; + for (int n = 0; n < n_rois; n++) { + rois_array.push_back(randomInt(0, N - 1)); + int w1 = randomInt(0, W); + int w2 = randomInt(0, W); + int h1 = randomInt(0, H); + int h2 = randomInt(0, H); + rois_array.push_back(std::min(w1, w2)); + rois_array.push_back(std::max(h1, h2)); + rois_array.push_back(std::min(w1, w2)); + rois_array.push_back(std::max(h1, h2)); + } + add_input({n_rois, 5}, rois_array, "RoIs", &ws); + auto xop = CreateOperatorDef( + "RoIAlign", + "", + {"X", "RoIs"}, + {"Y"}, + {MakeArgument<float>("spatial_scale", 0.25f), + MakeArgument<int>("pooled_h", 2), + MakeArgument<int>("pooled_w", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("sampling_ratio", 2)}); + auto op = CreateOperatorDef( + "Int8RoIAlign", + "", + {"XQ", "RoIs"}, + {"YQ"}, + {MakeArgument<float>("spatial_scale", 0.25f), + MakeArgument<int>("pooled_h", 2), + MakeArgument<int>("pooled_w", 2), + MakeArgument<int>("sampling_ratio", 2), + MakeArgument<int>("Y_zero_point", 127), + MakeArgument<float>("Y_scale", 0.01f)}); + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + // we cant make sure delta is within XQ->scale since there is interpolation + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 4 * XQ->scale); +} + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_simd.h 
b/caffe2/operators/quantized/int8_simd.h new file mode 100644 index 0000000000..63d6fd0cab --- /dev/null +++ b/caffe2/operators/quantized/int8_simd.h @@ -0,0 +1,63 @@ +#pragma once + +// We want to allow 128-bit wide SIMD if either NEON is available (as +// detected by GEMMLOWP_NEON), or whether SSE4.2 and Clang is +// available (in which case we will use the neon_sse.h library to +// share source between the two implementations). We use SSE4.2 to +// ensure we can use the full neon2sse library, and we use Clang as +// GCC has issues correctly compiling some parts of the neon2sse +// library. + +// Otherwise, the INT8_NEON_SIMD variable will be undefined. + +#include "gemmlowp/fixedpoint/fixedpoint.h" +#include "gemmlowp/public/gemmlowp.h" + +#ifdef GEMMLOWP_NEON +#define INT8_NEON_SIMD +#endif + +#if defined(__SSE4_2__) && defined(__clang__) +#define INT8_NEON_SIMD + +#include "neon2sse.h" +// Add GEMMLOWP SIMD type wrappers for the NEON2SSE SIMD types. + +namespace gemmlowp { +template <> +struct FixedPointRawTypeTraits<int32x4_t> { + typedef std::int32_t ScalarRawType; + static const int kLanes = 4; +}; + +template <> +inline int32x4_t Dup<int32x4_t>(std::int32_t x) { + return vdupq_n_s32(x); +} + +template <> +inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) { + return vandq_s32(a, b); +} + +template <> +inline int32x4_t Add(int32x4_t a, int32x4_t b) { + return vaddq_s32(a, b); +} + +template <> +inline int32x4_t ShiftRight(int32x4_t a, int offset) { + return vshlq_s32(a, vdupq_n_s32(-offset)); +} + +template <> +inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) { + return vreinterpretq_s32_u32(vcltq_s32(a, b)); +} + +template <> +inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) { + return vreinterpretq_s32_u32(vcgtq_s32(a, b)); +} +} // namespace gemmlowp +#endif diff --git a/caffe2/operators/quantized/int8_slice_op.cc b/caffe2/operators/quantized/int8_slice_op.cc new file mode 100644 index 0000000000..38e9714b63 --- /dev/null +++ b/caffe2/operators/quantized/int8_slice_op.cc @@ -0,0 +1,44 @@ +#include "caffe2/operators/quantized/int8_slice_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Slice, int8::Int8SliceOp); + +OPERATOR_SCHEMA(Int8Slice) + .NumInputs(1, 3) + .NumOutputs(1) + .SetDoc(R"DOC( +Produces a slice of the input Int8 tensor. Currently, only slicing in a single +dimension is supported. +Slices are passed as 2 1D vectors or as two keyword argument lists with starting +and end indices for each dimension of the input `data` tensor. If a negative +value is passed for any of the start or end indices, it represents the number of +elements before the end of that dimension. End indices are non-inclusive unless +negative (end index -1 means up to and including the last element). 
+ + +Example: + + data = [ + [1, 2, 3, 4], + [5, 6, 7, 8], + ] + starts = [0, 1] + ends = [-1, 3] + + result = [ + [2, 3], + [6, 7], + ] +)DOC") + .Input(0, "data", "Int8 Tensor of data to extract slices from.") + .Input(1, "starts", "1D tensor: start-indices for each dimension of data.") + .Input(2, "ends", "1D tensor: end-indices for each dimension of data.") + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .Arg("starts", "List of starting indices") + .Arg("ends", "List of ending indices") + .Output(0, "output", "Sliced Int8 data tensor.") + .InheritOnnxSchema("Slice"); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_slice_op.h b/caffe2/operators/quantized/int8_slice_op.h new file mode 100644 index 0000000000..10b5b05141 --- /dev/null +++ b/caffe2/operators/quantized/int8_slice_op.h @@ -0,0 +1,71 @@ +#ifndef CAFFE2_OPERATORS_INT8_SLICE_OP_H_ +#define CAFFE2_OPERATORS_INT8_SLICE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" +#include "caffe2/operators/slice_op.h" + +namespace caffe2 { + +namespace int8 { + +class Int8SliceOp final : public SliceOp<CPUContext> { + public: + Int8SliceOp(const OperatorDef& operator_def, Workspace* ws) + : SliceOp(operator_def, ws) {} + + bool RunOnDevice() override { + if (InputSize() > 1) { + return DispatchHelper<TensorTypes<int, int64_t>>::call(this, Input(1)); + } else { + return DoRunWithType<int64_t>(); + } + } + + template <typename SIndex> + bool DoRunWithType() { + if (InputSize() > 1) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.template mutable_data<SIndex>(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.template mutable_data<SIndex>(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + statically_inited_ = true; + } + } + + auto& X = Inputs()[0]->Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->GetMutable<Int8TensorCPU>(); + int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0); + auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, X.zero_point); + CHECK_EQ(Y_scale, X.scale); + Y->scale = Y_scale; + Y->zero_point = Y_offset; + + return SliceImpl<SIndex, CPUContext>( + &Y->t, X.t, starts_host_, ends_host_, &context_); + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_SLICE_OP_H_ diff --git a/caffe2/operators/quantized/int8_softmax_op.cc b/caffe2/operators/quantized/int8_softmax_op.cc new file mode 100644 index 0000000000..42285a3b63 --- /dev/null +++ b/caffe2/operators/quantized/int8_softmax_op.cc @@ -0,0 +1,46 @@ +#include "caffe2/operators/quantized/int8_softmax_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(Int8Softmax, int8::Int8SoftmaxOp); + +OPERATOR_SCHEMA(Int8Softmax) + .NumInputs(1) + .NumOutputs(1) + .Arg("Y_scale", "Output tensor quantization scale") + .Arg("Y_zero_point", "Output tensor quantization offset") + .IdenticalTypeAndShape() + .SetDoc(R"DOC( +The operator computes the softmax normalized values for each layer in the batch + of the given input. 
The input is a 2-D tensor (Tensor<float>) of size +(batch_size x input_feature_dimensions). The output tensor has the same shape +and contains the softmax normalized values of the corresponding input. + +X does not need to explicitly be a 2D vector; rather, it will be +coerced into one. For an arbitrary n-dimensional tensor +X \in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}] and k is +the axis provided, then X will be coerced into a 2-dimensional tensor with +dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default +case where axis=1, this means the X tensor will be coerced into a 2D tensor +of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size. +In this situation, we must have a_0 = N and a_1 * ... * a_{n-1} = D. +Each of these dimensions must be matched correctly, or else the operator +will throw errors. +)DOC") + .Arg( + "axis", + "(int) default to 1; describes the axis of the inputs when coerced " + "to 2D; defaults to one because the 0th axis most likely describes " + "the batch_size") + .Input( + 0, + "input", + "The input tensor that's coerced into a 2D matrix of size (NxD) " + "as described above.") + .Output( + 0, + "output", + "The softmax normalized output values with the same " + "shape as input tensor."); + +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_softmax_op.h b/caffe2/operators/quantized/int8_softmax_op.h new file mode 100644 index 0000000000..0b6b9d68df --- /dev/null +++ b/caffe2/operators/quantized/int8_softmax_op.h @@ -0,0 +1,227 @@ +#ifndef CAFFE2_OPERATORS_INT8_SOFTMAX_OP_H_ +#define CAFFE2_OPERATORS_INT8_SOFTMAX_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor_int8.h" +#include "caffe2/operators/quantized/int8_utils.h" +#include "caffe2/operators/reshape_op.h" + +namespace caffe2 { + +namespace int8 { + +namespace { + +/* + * Implementation based on TensorFlow Lite kernels: + * - Repo: https://github.com/tensorflow/tensorflow + * - Path: tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h + * - Hash: d4ad9c73969c45d1a224ebfc43eb645b9860216b + */ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +void QuantizeMultiplierGreaterThanOne( + double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift) { + CHECK(double_multiplier > 1.); + const double q = std::frexp(double_multiplier, left_shift); + auto q_fixed = static_cast<int64_t>(Round(q * (1ll << 31))); + CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + ++*left_shift; + } + CHECK_GE(*left_shift, 0); + CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max()); + *quantized_multiplier = static_cast<int32_t>(q_fixed); +} + +inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne( + int32_t x, + int32_t quantized_multiplier, + int left_shift) { + using gemmlowp::SaturatingRoundingDoublingHighMul; + return SaturatingRoundingDoublingHighMul( + x * (1 << left_shift), quantized_multiplier); +} + +void PreprocessSoftmaxScaling( + double beta, + double input_scale, + int input_integer_bits, + int32_t* quantized_multiplier, + int* left_shift) { + // If the overall multiplier (input and beta) is large, then exp() of an + // input difference of 1 scaled by this will be large. In other words, we + // can cap the multiplier and know that, when it is used, the output will be + // (round to) zero wherever the input is not at the maximum value. + + // If the overall scale is less than one, and input_integer_bits=0, then the + // result is double equivalent of Q0.31 (actually with more precision). Thus + // this generates a Q(input_integer_bits).(31-input_integer_bits) + // representation. + const double input_beta_real_multiplier = std::min( + beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0); + + QuantizeMultiplierGreaterThanOne( + input_beta_real_multiplier, quantized_multiplier, left_shift); +} + +int CalculateInputRadius(int input_integer_bits, int input_left_shift) { + const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) * + (1ll << (31 - input_integer_bits)) / (1ll << input_left_shift); + // Tighten bound using floor. Suppose that we could use the exact value. + // After scaling the difference, the result would be at the maximum. Thus we + // must ensure that our value has lower magnitude. + return static_cast<int>(std::floor(max_input_rescaled)); +} + +void Int8Softmax( + const uint8_t* input_data, + const size_t N, + const size_t D, + int32_t input_beta_multiplier, + int32_t input_beta_left_shift, + int diff_min, + uint8_t* output_data) { + // The representation chosen for the input to the exp() function is Q5.26. + // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input_beta_multiplier, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. 
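// Roughly: the Q12.19 accumulator below resolves about 2^-19 ~= 1.9e-6,
// while exp(-8) ~= 3.4e-4 is a few hundred accumulator units and
// exp(-16) ~= 1.1e-7 is less than one unit, so terms skipped via diff_min
// are below what the accumulator could represent anyway.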
+ static const int kScaledDiffIntegerBits = 5; + static const int kAccumulationIntegerBits = 12; + using FixedPointScaledDiff = + gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>; + using FixedPointAccum = + gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>; + using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; + + for (int n = 0; n < N; ++n) { + uint8_t max_in_row = 0; + for (int c = 0; c < D; ++c) { + max_in_row = std::max(max_in_row, input_data[n * D + c]); + } + + FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); + for (int c = 0; c < D; ++c) { + int32_t input_diff = + static_cast<int32_t>(input_data[n * D + c]) - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + sum_of_exps = sum_of_exps + + gemmlowp::Rescale<kAccumulationIntegerBits>( + exp_on_negative_values(scaled_diff_f8)); + } + } + + int32_t fixed_sum_of_exps = sum_of_exps.raw(); + // TODO(starka): Use a NEON intrinsic like vclzq_u32 instead. + int headroom_plus_one = + __builtin_clz(static_cast<uint32_t>(fixed_sum_of_exps)); + // This is the number of bits to the left of the binary point above 1.0. + // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and + // no later adjustment will be needed. + int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; + int32_t shifted_sum_minus_one = static_cast<int32_t>( + (static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32_t>(1) << 31)); + + FixedPoint0 shifted_scale; + // gemmlowp::one_over_one_plus_x_for_x_in_0_1 is defined on (0, + // 1), not [0, 1), so need to handle the case where + // shifted_sum_minus_one is exactly 0. + if (shifted_sum_minus_one == 0) { + shifted_scale = FixedPoint0::One(); + } else { + shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1( + FixedPoint0::FromRaw(shifted_sum_minus_one)); + } + + for (int c = 0; c < D; ++c) { + int32_t input_diff = + static_cast<int32_t>(input_data[n * D + c]) - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + + FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); + int32_t unsat_output = gemmlowp::RoundingDivideByPOT( + (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); + + output_data[n * D + c] = std::max(std::min(unsat_output, 255), 0); + + } else { + output_data[n * D + c] = 0; + } + } + } +} + +} // namespace + +class Int8SoftmaxOp final : public Operator<CPUContext> { + public: + Int8SoftmaxOp(const OperatorDef& operator_def, Workspace* ws) + : Operator<CPUContext>(operator_def, ws) {} + + bool RunOnDevice() override { + const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); + auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); + const int32_t Y_offset = + this->template GetSingleArgument<int>("Y_zero_point", 0); + const float Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); + CHECK_EQ(Y_offset, 0); + CHECK_EQ(Y_scale, 1. 
/ 256); + + static const int kScaledDiffIntegerBits = 5; + Y->scale = Y_scale; + Y->zero_point = Y_offset; + Y->t.ResizeLike(X.t); + int32_t input_multiplier; + int input_left_shift; + PreprocessSoftmaxScaling( + 1.0 /*params->beta*/, + X.scale, + kScaledDiffIntegerBits, + &input_multiplier, + &input_left_shift); + const int diff_min = + -1.0 * CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift); + Int8Softmax( + X.t.data<uint8_t>(), + X.t.dim(0), + X.t.size() / X.t.dim(0), + input_multiplier, + input_left_shift, + diff_min, + Y->t.mutable_data<uint8_t>()); + return true; + } +}; + +} // namespace int8 + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_INT8_SOFTMAX_OP_H_ diff --git a/caffe2/operators/quantized/int8_test.cc b/caffe2/operators/quantized/int8_test.cc new file mode 100644 index 0000000000..85933956e3 --- /dev/null +++ b/caffe2/operators/quantized/int8_test.cc @@ -0,0 +1,858 @@ +#include "caffe2/operators/conv_pool_op_base.h" +#include "caffe2/operators/conv_transpose_unpool_op_base.h" +#include "caffe2/operators/spatial_batch_norm_op.h" + +#include "caffe2/operators/quantized/int8_test_utils.h" +#include "caffe2/operators/quantized/int8_utils.h" + +namespace caffe2 { + +// How to test + +// Generate int8 tensor +// Convert to fp32 +// Run with int8 backend +// Dequantize result to fp32 +// Run with fp32 backend +// Compare results. + +// for quantized Add, the error shouldn't exceed 2 * scale + +TEST(Int8, ReLU) { + auto XQ = q({1, 224, 224, 3}); + auto X = dq(*XQ); + auto xop = CreateOperatorDef("Relu", "", {"X"}, {"Y"}); + auto op = CreateOperatorDef( + "Int8Relu", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<int>("Y_zero_point", XQ->zero_point), + MakeArgument<float>("Y_scale", XQ->scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_EQ(*YA, YE); +} + +// LeakyReLU isn't build in xplat, so this fails buck test +// xplat/caffe2:caffe2_testAndroid +TEST(Int8, DISABLED_LeakyReLU) { + auto XQ = q({1, 224, 224, 3}); + auto X = dq(*XQ); + const float alpha = 0.1; + auto xop = CreateOperatorDef( + "LeakyRelu", "", {"X"}, {"Y"}, {MakeArgument<float>("alpha", alpha)}); + auto op = CreateOperatorDef( + "Int8LeakyRelu", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<float>("alpha", alpha), + MakeArgument<int>("Y_zero_point", XQ->zero_point), + MakeArgument<float>("Y_scale", XQ->scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, addErrorTolerance(YQ.scale)); +} + +TEST(Int8, Softmax) { + auto XQ = q({1, 2, 1, 3}); + auto X = dq(*XQ); + auto xop = CreateOperatorDef("Softmax", "", {"X"}, {"Y"}); + auto op = CreateOperatorDef( + "Int8Softmax", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<int>("Y_zero_point", 0), + MakeArgument<float>("Y_scale", 1.0 / 256)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(op); + 
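// Note on the output quantization chosen here (illustrative): softmax
// outputs lie in [0, 1), so Y_scale = 1/256 with Y_zero_point = 0 spreads
// the full uint8 range over that interval; e.g. a probability of 0.5
// quantizes to Round(0.5 * 256) = 128. The operator enforces exactly this
// pair via its CHECK_EQs on Y_zero_point and Y_scale.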
ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + EXPECT_EQ(YQ.scale, 1.0 / 256); + EXPECT_EQ(YQ.zero_point, 0); + + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, addErrorTolerance(YQ.scale)); +} + +TEST(Int8, MaxPool) { + auto XQ = q({1, 25, 25, 16}); + auto X = dq(*XQ); + auto xop = CreateOperatorDef( + "MaxPool", + "", + {"X"}, + {"Y"}, + {MakeArgument<int>("kernel", 2), MakeArgument<string>("order", "NHWC")}); + auto op = CreateOperatorDef( + "Int8MaxPool", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("Y_zero_point", XQ->zero_point), + MakeArgument<float>("Y_scale", XQ->scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_EQ(*YA, YE); +} + +TEST(Int8, AveragePool) { + auto XQ = q({1, 25, 25, 16}); + auto X = dq(*XQ); + auto xop = CreateOperatorDef( + "AveragePool", + "", + {"X"}, + {"Y"}, + {MakeArgument<int>("kernel", 2), MakeArgument<string>("order", "NHWC")}); + auto op = CreateOperatorDef( + "Int8AveragePool", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("Y_zero_point", XQ->zero_point), + MakeArgument<float>("Y_scale", XQ->scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, addErrorTolerance(XQ->scale)); +} + +TEST(Int8, ResizeNearest) { + auto XQ = q({1, 25, 25, 16}); + auto X = dq(*XQ); + auto xop = CreateOperatorDef( + "ResizeNearest", + "", + {"XT"}, + {"YT"}, + {MakeArgument<float>("width_scale", 2), + MakeArgument<float>("height_scale", 2)}); + auto op = CreateOperatorDef( + "Int8ResizeNearest", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<float>("width_scale", 2), + MakeArgument<float>("height_scale", 2), + MakeArgument<int>("Y_zero_point", XQ->zero_point), + MakeArgument<float>("Y_scale", XQ->scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"X"}, {"XT"})); + ws.RunOperatorOnce(xop); + ws.RunOperatorOnce(CreateOperatorDef("NCHW2NHWC", "", {"YT"}, {"Y"})); + ws.RunOperatorOnce(op); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_EQ(*YA, YE); +} + +TEST(Int8, ChannelShuffle) { + auto XQ = q({2, 25, 25, 32}); + auto X = dq(*XQ); + auto xop = CreateOperatorDef( + "ChannelShuffle", + "", + {"XT"}, + {"YT"}, + { + MakeArgument<int>("kernel", 1), + MakeArgument<int>("group", 4), + MakeArgument<std::string>("order", "NCHW"), + }); + auto op = CreateOperatorDef( + "Int8ChannelShuffle", + "", + {"XQ"}, + {"YQ"}, + { + MakeArgument<int>("kernel", 1), + MakeArgument<int>("group", 4), + MakeArgument<std::string>("order", 
"NHWC"), + MakeArgument<int>("Y_zero_point", XQ->zero_point), + MakeArgument<float>("Y_scale", XQ->scale), + }); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"X"}, {"XT"})); + ws.RunOperatorOnce(xop); + ws.RunOperatorOnce(CreateOperatorDef("NCHW2NHWC", "", {"YT"}, {"Y"})); + ws.RunOperatorOnce(op); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_EQ(*YA, YE); +} + +TEST(Int8, Concat) { + auto XQ0 = q({2, 25, 25, 16}); + auto X0 = dq(*XQ0); + auto XQ1 = q({2, 25, 25, 24}); + auto X1 = dq(*XQ1); + auto xop = CreateOperatorDef( + "Concat", + "", + {"XT0", "XT1"}, + {"YT", "_"}, + { + MakeArgument<std::string>("order", "NCHW"), + }); + auto op = CreateOperatorDef( + "Int8Concat", + "", + {"XQ0", "XQ1"}, + {"YQ", "_"}, + { + MakeArgument<std::string>("order", "NHWC"), + MakeArgument<int>("Y_zero_point", XQ0->zero_point), + MakeArgument<float>("Y_scale", XQ0->scale), + }); + Workspace ws; + int8Copy(ws.CreateBlob("XQ0")->GetMutable<int8::Int8TensorCPU>(), *XQ0); + int8Copy(ws.CreateBlob("XQ1")->GetMutable<int8::Int8TensorCPU>(), *XQ1); + BlobGetMutableTensor(ws.CreateBlob("X0"), CPU)->CopyFrom(*X0); + BlobGetMutableTensor(ws.CreateBlob("X1"), CPU)->CopyFrom(*X1); + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"X0"}, {"XT0"})); + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"X1"}, {"XT1"})); + ws.RunOperatorOnce(xop); + ws.RunOperatorOnce(CreateOperatorDef("NCHW2NHWC", "", {"YT"}, {"Y"})); + ws.RunOperatorOnce(op); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_EQ(*YA, YE); +} + +TEST(Int8, Add) { + auto XQ0 = q({1, 10, 10, 20}); + auto XQ1 = q({1, 10, 10, 20}); + auto X0 = dq(*XQ0); + auto X1 = dq(*XQ1); + auto xop = CreateOperatorDef("Add", "", {"X0", "X1"}, {"Y"}); + const auto Y_scale = 2 * std::max(XQ0->scale, XQ1->scale); + auto op = CreateOperatorDef( + "Int8Add", + "", + {"XQ0", "XQ1"}, + {"YQ"}, + {MakeArgument<int>("Y_zero_point", XQ0->zero_point), + MakeArgument<float>("Y_scale", Y_scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ0")->GetMutable<int8::Int8TensorCPU>(), *XQ0); + int8Copy(ws.CreateBlob("XQ1")->GetMutable<int8::Int8TensorCPU>(), *XQ1); + BlobGetMutableTensor(ws.CreateBlob("X0"), CPU)->CopyFrom(*X0); + BlobGetMutableTensor(ws.CreateBlob("X1"), CPU)->CopyFrom(*X1); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, addErrorTolerance(Y_scale)); +} + +TEST(Int8, SumRelu) { + auto XQ0 = q({1, 10, 10, 20}); + auto XQ1 = q({1, 10, 10, 20}); + auto X0 = dq(*XQ0); + auto X1 = dq(*XQ1); + auto xop = CreateOperatorDef("Sum", "", {"X0", "X1"}, {"Y"}); + auto rlxop = CreateOperatorDef("Relu", "", {"Y"}, {"Y"}); + const auto Y_scale = 2 * std::max(XQ0->scale, XQ1->scale); + auto op = CreateOperatorDef( + "Int8SumRelu", + "", + {"XQ0", "XQ1"}, + {"YQ"}, + {MakeArgument<int>("Y_zero_point", XQ0->zero_point), + MakeArgument<float>("Y_scale", Y_scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ0")->GetMutable<int8::Int8TensorCPU>(), *XQ0); + int8Copy(ws.CreateBlob("XQ1")->GetMutable<int8::Int8TensorCPU>(), *XQ1); + 
BlobGetMutableTensor(ws.CreateBlob("X0"), CPU)->CopyFrom(*X0); + BlobGetMutableTensor(ws.CreateBlob("X1"), CPU)->CopyFrom(*X1); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + ws.RunOperatorOnce(rlxop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, addErrorTolerance(Y_scale)); +} + +void setq(int8::Int8TensorCPU* dst, const std::vector<float>& vs) { + CHECK_EQ(vs.size(), dst->t.size()); + for (auto i = 0; i < vs.size(); ++i) { + uint8_t vq = std::max( + std::numeric_limits<uint8_t>::min(), + std::min( + std::numeric_limits<uint8_t>::max(), + static_cast<uint8_t>(int8::Round( + static_cast<float>(dst->zero_point + (vs[i] / dst->scale)))))); + dst->t.mutable_data<uint8_t>()[i] = vq; + } +} + +void biassetq(int8::Int8TensorCPU* dst, const std::vector<float>& vs) { + CHECK_EQ(vs.size(), dst->t.size()); + for (auto i = 0; i < vs.size(); ++i) { + int32_t vq = std::max( + std::numeric_limits<int32_t>::min(), + std::min( + std::numeric_limits<int32_t>::max(), + static_cast<int32_t>(int8::Round( + static_cast<float>(dst->zero_point + (vs[i] / dst->scale)))))); + dst->t.mutable_data<int32_t>()[i] = vq; + } +} + +// Use TFLite test vectors to ensure compatibility. +TEST(Int8, Conv) { + auto XQ = q({2, 2, 4, 1}); + XQ->scale = 0.5; + XQ->zero_point = 127; + setq( + XQ.get(), + std::vector<float>{1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 4, 1, 2, 3, 4}); + auto WQ = q({3, 2, 2, 1}); + WQ->scale = 0.5; + WQ->zero_point = 127; + setq( + WQ.get(), + { + 1, + 2, + 3, + 4, + -1, + 1, + -1, + 1, + -1, + -1, + 1, + 1, + }); + auto BQ = biasq({3}, XQ->scale * WQ->scale); + biassetq(BQ.get(), {1, 2, 3}); + auto X = dq(*XQ); + auto W = dq(*WQ); + auto B = biasdq(*BQ); + auto xop = CreateOperatorDef( + "Conv", + "", + {"X", "W", "B"}, + {"Y"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("stride", 2)}); + auto op = CreateOperatorDef( + "Int8Conv", + "", + {"XQ", "WQ", "BQ"}, + {"YQ"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("stride", 2), + MakeArgument<int>("Y_zero_point", 127), + MakeArgument<float>("Y_scale", 1.0)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + int8Copy(ws.CreateBlob("WQ")->GetMutable<int8::Int8TensorCPU>(), *WQ); + int8Copy(ws.CreateBlob("BQ")->GetMutable<int8::Int8TensorCPU>(), *BQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU)->CopyFrom(*W); + BlobGetMutableTensor(ws.CreateBlob("B"), CPU)->CopyFrom(*B); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TRUE( + (std::vector<uint8_t>( + YQ.t.data<uint8_t>(), YQ.t.data<uint8_t>() + YQ.t.size()) == + std::vector<uint8_t>{ + 145, 129, 132, 145, 129, 132, 144, 131, 130, 164, 131, 130})); + + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 1.0e-5); +} + +TEST(Int8, Grouped1x1Conv) { + auto XQ = q({1, 3, 2, 4}); + XQ->scale = 0.5; + XQ->zero_point = 127; + setq(XQ.get(), std::vector<float>{1, 4, 3, 2, 9, 3, 8, 2, 6, 7, 8, 2, + 3, 8, 1, 7, 4, 2, 1, 3, 8, 5, 3, 1}); + + // G = 2 + auto WQ = q({4, 1, 1, 2}); + WQ->scale = 0.5; + WQ->zero_point = 127; + setq(WQ.get(), {1, 2, 3, 4, -1, -2, -3, -4}); + auto BQ = biasq({4}, XQ->scale * WQ->scale); + biassetq(BQ.get(), {1, 2, 3, 4}); + 
auto X = dq(*XQ); + auto W = dq(*WQ); + auto B = biasdq(*BQ); + auto xop = CreateOperatorDef( + "Conv", + "", + {"XT", "WT", "B"}, + {"YT"}, + {MakeArgument<int>("kernel", 1), + MakeArgument<string>("order", "NCHW"), + MakeArgument<int>("group", 2)}); + auto op = CreateOperatorDef( + "Int8Conv", + "", + {"XQ", "WQ", "BQ"}, + {"YQ"}, + {MakeArgument<int>("kernel", 1), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("group", 2), + MakeArgument<int>("Y_zero_point", 127), + MakeArgument<float>("Y_scale", 1.0)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + int8Copy(ws.CreateBlob("WQ")->GetMutable<int8::Int8TensorCPU>(), *WQ); + int8Copy(ws.CreateBlob("BQ")->GetMutable<int8::Int8TensorCPU>(), *BQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU)->CopyFrom(*W); + BlobGetMutableTensor(ws.CreateBlob("B"), CPU)->CopyFrom(*B); + ws.RunOperatorOnce(op); + + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"X"}, {"XT"})); + // Need to transpose MxKHxKWx1 to Mx1xKHxKW + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"W"}, {"WT"})); + ws.RunOperatorOnce(xop); + ws.RunOperatorOnce(CreateOperatorDef("NCHW2NHWC", "", {"YT"}, {"Y"})); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 1.0e-5); + + // test repacking between runs + std::unique_ptr<OperatorBase> op_ptr(CreateOperator(op, &ws)); + EXPECT_TRUE(op_ptr != nullptr); + for (auto it = 0; it < 3; ++it) { + EXPECT_TRUE(op_ptr->Run()); + const auto& temp_YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto temp_YA = dq(temp_YQ); + const auto& temp_YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*temp_YA, temp_YE, 1.0e-5); + } +} + +TEST(Int8, Conv2) { + auto XQ = q({1, 3, 6, 1}); + XQ->scale = 0.5; + XQ->zero_point = 127; + setq( + XQ.get(), + std::vector<float>{ + 3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4, 5, 4, 3, -3, -4, -5}); + auto WQ = q({1, 2, 2, 1}); + WQ->scale = 0.5; + WQ->zero_point = 127; + setq(WQ.get(), {1, 2, 3, 4}); + auto BQ = biasq({1}, XQ->scale * WQ->scale); + biassetq(BQ.get(), {-1}); + auto X = dq(*XQ); + auto W = dq(*WQ); + auto B = biasdq(*BQ); + auto xop = CreateOperatorDef( + "Conv", + "", + {"X", "W", "B"}, + {"Y"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("stride_w", 3), + MakeArgument<int>("stride_h", 1)}); + auto op = CreateOperatorDef( + "Int8Conv", + "", + {"XQ", "WQ", "BQ"}, + {"YQ"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("stride_w", 3), + MakeArgument<int>("stride_h", 1), + MakeArgument<int>("Y_zero_point", 127), + MakeArgument<float>("Y_scale", 1.0)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + int8Copy(ws.CreateBlob("WQ")->GetMutable<int8::Int8TensorCPU>(), *WQ); + int8Copy(ws.CreateBlob("BQ")->GetMutable<int8::Int8TensorCPU>(), *BQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU)->CopyFrom(*W); + BlobGetMutableTensor(ws.CreateBlob("B"), CPU)->CopyFrom(*B); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TRUE( + (std::vector<uint8_t>( + YQ.t.data<uint8_t>(), 
YQ.t.data<uint8_t>() + YQ.t.size()) == + std::vector<uint8_t>{157, 103, 167, 93})); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 1.0e-5); +} + +TEST(Int8, DepthwiseConv) { + auto XQ = q({1, 3, 2, 2}); + XQ->scale = 0.5; + XQ->zero_point = 127; + // setq(XQ.get(), std::vector<float>{1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}); + setq(XQ.get(), std::vector<float>{1, 4, 3, 2, 9, 3, 8, 2, 6, 7, 8, 2}); + + auto WQ = q({2, 2, 2, 1}); + WQ->scale = 0.5; + WQ->zero_point = 127; + setq(WQ.get(), {1, 2, 3, 4, -9, 10, -11, 12}); + auto BQ = biasq({2}, XQ->scale * WQ->scale); + biassetq(BQ.get(), {1, 2}); + auto X = dq(*XQ); + auto W = dq(*WQ); + auto B = biasdq(*BQ); + auto xop = CreateOperatorDef( + "Conv", + "", + {"XT", "WT", "B"}, + {"YT"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NCHW"), + MakeArgument<int>("group", 2)}); + auto op = CreateOperatorDef( + "Int8Conv", + "", + {"XQ", "WQ", "BQ"}, + {"YQ"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("group", 2), + MakeArgument<int>("Y_zero_point", 127), + MakeArgument<float>("Y_scale", 1.0)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + int8Copy(ws.CreateBlob("WQ")->GetMutable<int8::Int8TensorCPU>(), *WQ); + int8Copy(ws.CreateBlob("BQ")->GetMutable<int8::Int8TensorCPU>(), *BQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU)->CopyFrom(*W); + BlobGetMutableTensor(ws.CreateBlob("B"), CPU)->CopyFrom(*B); + ws.RunOperatorOnce(op); + + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"X"}, {"XT"})); + // Need to transpose MxKHxKWx1 to Mx1xKHxKW + ws.RunOperatorOnce(CreateOperatorDef("NHWC2NCHW", "", {"W"}, {"WT"})); + ws.RunOperatorOnce(xop); + ws.RunOperatorOnce(CreateOperatorDef("NCHW2NHWC", "", {"YT"}, {"Y"})); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + for (auto i = 0; i < YA->size(); ++i) { + LOG(INFO) << YA->data<float>()[i]; + LOG(INFO) << YE.data<float>()[i]; + } + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 1.0e-5); +} + +TEST(Int8, ConvTranspose) { + auto XQ = q({1, 3, 6, 1}); + XQ->scale = 0.5; + XQ->zero_point = 127; + setq( + XQ.get(), + std::vector<float>{ + 3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4, 5, 4, 3, -3, -4, -5}); + auto WQ = q({1, 2, 2, 1}); + WQ->scale = 0.5; + WQ->zero_point = 127; + setq(WQ.get(), {1, 2, 3, 4}); + auto BQ = biasq({1}, XQ->scale * WQ->scale); + biassetq(BQ.get(), {-1}); + auto X = dq(*XQ); + auto W = dq(*WQ); + auto B = biasdq(*BQ); + auto xop = CreateOperatorDef( + "ConvTranspose", + "", + {"X", "W", "B"}, + {"Y"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("stride_w", 1), + MakeArgument<int>("stride_h", 2)}); + auto op = CreateOperatorDef( + "Int8ConvTranspose", + "", + {"XQ", "WQ", "BQ"}, + {"YQ"}, + {MakeArgument<int>("kernel", 2), + MakeArgument<string>("order", "NHWC"), + MakeArgument<int>("stride_w", 1), + MakeArgument<int>("stride_h", 2), + MakeArgument<int>("Y_zero_point", 127), + MakeArgument<float>("Y_scale", 1.0)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + int8Copy(ws.CreateBlob("WQ")->GetMutable<int8::Int8TensorCPU>(), *WQ); + int8Copy(ws.CreateBlob("BQ")->GetMutable<int8::Int8TensorCPU>(), *BQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU)->CopyFrom(*W); + 
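// Shape bookkeeping (illustrative): with a 1x3x6x1 NHWC input, kernel 2,
// stride_h 2, stride_w 1 and no padding, the transposed convolution yields
// H_out = (3 - 1) * 2 + 2 = 6 and W_out = (6 - 1) * 1 + 2 = 7, so both the
// reference Y and the quantized YQ below have shape 1x6x7x1.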
BlobGetMutableTensor(ws.CreateBlob("B"), CPU)->CopyFrom(*B); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 1.0e-5); +} + +TEST(Int8, FC) { + auto XQ = q({2, 10}); + XQ->scale = 0.5; + XQ->zero_point = 127; + setq(XQ.get(), {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10}); + auto WQ = q({3, 10}); + WQ->scale = 0.5; + WQ->zero_point = 127; + setq( + WQ.get(), + { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + }); + auto BQ = biasq({3}, XQ->scale * WQ->scale); + biassetq(BQ.get(), {1, 2, 3}); + auto X = dq(*XQ); + auto W = dq(*WQ); + auto B = biasdq(*BQ); + auto xop = CreateOperatorDef("FC", "", {"X", "W", "B"}, {"Y"}, {}); + auto op = CreateOperatorDef( + "Int8FC", + "", + {"XQ", "WQ", "BQ"}, + {"YQ"}, + {MakeArgument<int>("Y_zero_point", 127), + MakeArgument<float>("Y_scale", 1.0)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + int8Copy(ws.CreateBlob("WQ")->GetMutable<int8::Int8TensorCPU>(), *WQ); + int8Copy(ws.CreateBlob("BQ")->GetMutable<int8::Int8TensorCPU>(), *BQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU)->CopyFrom(*W); + BlobGetMutableTensor(ws.CreateBlob("B"), CPU)->CopyFrom(*B); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + for (auto i = 0; i < YA->size(); ++i) { + LOG(INFO) << YA->data<float>()[i]; + LOG(INFO) << YE.data<float>()[i]; + } + EXPECT_TRUE( + (std::vector<uint8_t>( + YQ.t.data<uint8_t>(), YQ.t.data<uint8_t>() + YQ.t.size()) == + std::vector<uint8_t>{151, 152, 153, 185, 186, 187})); +} + +TEST(Int8, GivenTensorFill) { + vector<int64_t> shape = {1, 25, 25, 16}; + auto XQ = q(shape); + auto X = dq(*XQ); + vector<float> v( + X->template data<float>(), X->template data<float>() + X->size()); + std::string vq( + XQ->t.template data<uint8_t>(), + XQ->t.template data<uint8_t>() + XQ->t.size()); + auto op = CreateOperatorDef( + "GivenTensorFill", + "", + {}, + {"Y"}, + {MakeArgument<vector<int64_t>>("shape", shape), + MakeArgument<vector<float>>("values", v)}); + auto xop = CreateOperatorDef( + "Int8GivenTensorFill", + "", + {}, + {"YQ"}, + {MakeArgument<vector<int64_t>>("shape", shape), + MakeArgument<string>("values", vq), + MakeArgument<float>("Y_scale", XQ->scale), + MakeArgument<int32_t>("Y_zero_point", XQ->zero_point)}); + Workspace ws; + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 1.0e-5); +} + +TEST(Int8, GivenIntTensorFill) { + vector<int64_t> shape = {32}; + auto XQ = biasq(shape, 1. / 255 * 1. 
/ 255); + auto X = biasdq(*XQ); + vector<float> v( + X->template data<float>(), X->template data<float>() + X->size()); + vector<int32_t> vq( + XQ->t.template data<int32_t>(), + XQ->t.template data<int32_t>() + XQ->t.size()); + auto op = CreateOperatorDef( + "GivenTensorFill", + "", + {}, + {"Y"}, + {MakeArgument<vector<int64_t>>("shape", shape), + MakeArgument<vector<float>>("values", v)}); + auto xop = CreateOperatorDef( + "Int8GivenIntTensorFill", + "", + {}, + {"YQ"}, + {MakeArgument<vector<int64_t>>("shape", shape), + MakeArgument<vector<int32_t>>("values", vq), + MakeArgument<float>("Y_scale", XQ->scale), + MakeArgument<int32_t>("Y_zero_point", XQ->zero_point)}); + Workspace ws; + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = biasdq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*YA, YE, 1.0e-5); +} + +TEST(Int8, QuantDeQuant) { + vector<int64_t> shape = {1, 25, 25, 16}; + auto XQ = q(shape); + auto X = dq(*XQ); + auto xop = CreateOperatorDef( + "Int8Quantize", + "", + {"X"}, + {"XQ"}, + {MakeArgument<float>("Y_scale", XQ->scale), + MakeArgument<int32_t>("Y_zero_point", XQ->zero_point)}); + auto op = CreateOperatorDef("Int8Dequantize", "", {"XQ"}, {"X_x"}); + Workspace ws; + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(xop); + ws.RunOperatorOnce(op); + const auto& X_x = ws.GetBlob("X_x")->Get<TensorCPU>(); + EXPECT_TENSOR_APPROX_EQ(*X, X_x, XQ->scale); +} + +TEST(Int8, Reshape) { + auto XQ = q({1, 25, 25, 16}); + auto xop = CreateOperatorDef( + "Int8Reshape", + "", + {"XQ"}, + {"YQ", "old_shape"}, + {MakeArgument("shape", vector<int64_t>{0, -1, 2000}), + MakeArgument<float>("Y_scale", XQ->scale), + MakeArgument<int32_t>("Y_zero_point", XQ->zero_point)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + EXPECT_EQ(YQ.t.sizes(), (vector<int64_t>{1, 5, 2000})); + EXPECT_EQ(YQ.scale, XQ->scale); + EXPECT_EQ(YQ.zero_point, XQ->zero_point); +} + +TEST(Int8, Flatten) { + auto XQ = q({1, 25, 25, 16}); + auto xop = CreateOperatorDef( + "Int8Flatten", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<int>("axis", 2), + MakeArgument<float>("Y_scale", XQ->scale), + MakeArgument<int32_t>("Y_zero_point", XQ->zero_point)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + ws.RunOperatorOnce(xop); + const auto& YQ = ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + EXPECT_EQ(YQ.t.sizes(), (vector<int64_t>{25, 400})); + EXPECT_EQ(YQ.scale, XQ->scale); + EXPECT_EQ(YQ.zero_point, XQ->zero_point); +} + +TEST(Int8, Slice) { + auto XQ = q({1, 25, 25, 16}); + auto X = dq(*XQ); + vector<int> starts = {0, 3, 0, 0}; + vector<int> ends = {-1, 5, -1, -1}; + auto xop = CreateOperatorDef( + "Slice", + "", + {"X"}, + {"Y"}, + {MakeArgument<vector<int>>("starts", starts), + MakeArgument<vector<int>>("ends", ends)}); + auto op = CreateOperatorDef( + "Int8Slice", + "", + {"XQ"}, + {"YQ"}, + {MakeArgument<vector<int>>("starts", starts), + MakeArgument<vector<int>>("ends", ends), + MakeArgument<int>("Y_zero_point", XQ->zero_point), + MakeArgument<float>("Y_scale", XQ->scale)}); + Workspace ws; + int8Copy(ws.CreateBlob("XQ")->GetMutable<int8::Int8TensorCPU>(), *XQ); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(*X); + ws.RunOperatorOnce(op); + ws.RunOperatorOnce(xop); + const auto& YQ = 
ws.GetBlob("YQ")->Get<int8::Int8TensorCPU>(); + auto YA = dq(YQ); + const auto& YE = ws.GetBlob("Y")->Get<TensorCPU>(); + EXPECT_TENSOR_EQ(*YA, YE); + EXPECT_EQ(YQ.t.sizes(), (vector<int64_t>{1, 2, 25, 16})); + EXPECT_EQ(YQ.scale, XQ->scale); + EXPECT_EQ(YQ.zero_point, XQ->zero_point); +} +} // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_test_utils.h b/caffe2/operators/quantized/int8_test_utils.h new file mode 100644 index 0000000000..7cd5bf2d58 --- /dev/null +++ b/caffe2/operators/quantized/int8_test_utils.h @@ -0,0 +1,118 @@ +#ifndef CAFFE2_INT8_TEST_UTILS_H_ +#define CAFFE2_INT8_TEST_UTILS_H_ + +#include "caffe2/core/common.h" +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/core/tensor_int8.h" + +#include <array> +#include <cmath> +#include <random> + +#include "gtest/gtest.h" + +namespace caffe2 { + +// for quantized Add, the error shouldn't exceed 2 * scale +inline float addErrorTolerance(float scale) { + return 2 * scale; +} + +inline std::unique_ptr<int8::Int8TensorCPU> q( + const std::vector<int64_t>& dims) { + auto r = caffe2::make_unique<int8::Int8TensorCPU>(); + r->scale = 0.01; + r->zero_point = static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) / 2; + r->t.Resize(dims); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<uint8_t> dis; + for (auto i = 0; i < r->t.size(); ++i) { + r->t.mutable_data<uint8_t>()[i] = dis(gen); + } + return r; +} + +inline std::unique_ptr<int8::Int8TensorCPU> biasq( + const std::vector<int64_t>& dims, + double scale) { + auto r = caffe2::make_unique<int8::Int8TensorCPU>(); + r->scale = scale; + r->zero_point = 0; + r->t.Resize(dims); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(-1, 1); + for (auto i = 0; i < r->t.size(); ++i) { + r->t.mutable_data<int32_t>()[i] = + static_cast<int32_t>(dis(gen) / scale + r->zero_point); + } + return r; +} + +inline std::unique_ptr<TensorCPU> dq(const int8::Int8TensorCPU& XQ) { + auto r = caffe2::make_unique<Tensor>(CPU); + r->Resize(XQ.t.sizes()); + for (auto i = 0; i < r->size(); ++i) { + r->mutable_data<float>()[i] = + (static_cast<int32_t>(XQ.t.data<uint8_t>()[i]) - XQ.zero_point) * + XQ.scale; + } + return r; +} + +inline std::unique_ptr<TensorCPU> biasdq(const int8::Int8TensorCPU& XQ) { + auto r = caffe2::make_unique<Tensor>(CPU); + r->Resize(XQ.t.sizes()); + for (auto i = 0; i < r->size(); ++i) { + r->mutable_data<float>()[i] = + (XQ.t.data<int32_t>()[i] - XQ.zero_point) * XQ.scale; + } + return r; +} + +#define EXPECT_TENSOR_EQ(_YA, _YE) \ + do { \ + EXPECT_TRUE((_YA).sizes() == (_YE).sizes()); \ + for (auto i = 0; i < (_YA).size(); ++i) { \ + EXPECT_FLOAT_EQ((_YA).data<float>()[i], (_YE).data<float>()[i]); \ + } \ + } while (0); + +#define EXPECT_TENSOR_APPROX_EQ(_YA, _YE, _tol) \ + do { \ + EXPECT_TRUE((_YA).sizes() == (_YE).sizes()); \ + for (auto i = 0; i < (_YA).size(); ++i) { \ + EXPECT_NEAR((_YA).data<float>()[i], (_YE).data<float>()[i], (_tol)); \ + } \ + } while (0); + +inline void int8Copy(int8::Int8TensorCPU* dst, const int8::Int8TensorCPU& src) { + dst->zero_point = src.zero_point; + dst->scale = src.scale; + dst->t.CopyFrom(src.t); +} + +inline void add_input( + const vector<int64_t>& shape, + const vector<float>& values, + const string& name, + Workspace* ws) { + // auto* t = ws->CreateBlob(name)->GetMutable<TensorCPU>(); + auto t = caffe2::make_unique<Tensor>(CPU); + t->Resize(shape); + std::copy(values.begin(), 
values.end(), t->mutable_data<float>()); + BlobGetMutableTensor(ws->CreateBlob(name), CPU)->CopyFrom(*t); +} + +inline int randomInt(int a, int b) { + static std::random_device rd; + static std::mt19937 gen(rd()); + return std::uniform_int_distribution<int>(a, b)(gen); +} + +} // namespace caffe2 + +#endif // CAFFE2_INT8_TEST_UTILS_H_ diff --git a/caffe2/operators/quantized/int8_utils.h b/caffe2/operators/quantized/int8_utils.h new file mode 100644 index 0000000000..2cacb079d9 --- /dev/null +++ b/caffe2/operators/quantized/int8_utils.h @@ -0,0 +1,177 @@ +#ifndef CAFFE2_INT8_UTILS_H_ +#define CAFFE2_INT8_UTILS_H_ + +#include <gemmlowp/public/gemmlowp.h> + +#include "caffe2/utils/threadpool/ThreadPool.h" +#include "caffe2/utils/threadpool/WorkersPool.h" + +namespace caffe2 { + +/* + * Initialized QNNPACK (only once). + * Throws if initialization failed. + */ +void initQNNPACK(); + +namespace int8 { + +/* + * Code here is partially derived from gemmlowp library + * (https://github.com/google/gemmlowp) + */ + +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +inline int32_t MultiplyByQuantizedMultiplierSmallerThanOne( + int32_t x, + int32_t quantized_multiplier, + int right_shift) { + using gemmlowp::RoundingDivideByPOT; + using gemmlowp::SaturatingRoundingDoublingHighMul; + return RoundingDivideByPOT( + SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift); +} + +#if defined(__ANDROID__) && !defined(__NDK_MAJOR__) +template <class T> +inline float Round(const float x) { + return ::nearbyintf(x); +} +inline double Round(const double x) { + return ::nearbyint(x); +} +#else +template <class T> +inline T Round(const T x) { + return std::nearbyint(x); +} +#endif + +inline uint8_t QuantizeUint8(float scale, int32_t zero_point, float value) { + const int32_t qmin = std::numeric_limits<uint8_t>::min(); + const int32_t qmax = std::numeric_limits<uint8_t>::max(); + + auto r = zero_point + static_cast<int32_t>(Round(value / scale)); + r = std::max(r, qmin); + r = std::min(r, qmax); + return static_cast<uint8_t>(r); +} + +inline void QuantizeMultiplierSmallerThanOne( + double double_multiplier, + int32_t* quantized_multiplier, + int* right_shift) { + CHECK(double_multiplier >= 0.); + CHECK(double_multiplier < 1.); + if (double_multiplier == 0.) { + *quantized_multiplier = 0; + *right_shift = 0; + return; + } + CHECK(double_multiplier > 0.); + const double q = std::frexp(double_multiplier, right_shift); + *right_shift *= -1; + + auto q_fixed = static_cast<int64_t>(Round(q * (1ll << 31))); + CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + --*right_shift; + } + CHECK_GE(*right_shift, 0); + CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max()); + *quantized_multiplier = static_cast<int32_t>(q_fixed); +} + +// An adaptor to use the Caffe2 WorkersPool implementation for gemmlowp +// multithreading functions. 
+class C2GEMMContext : public gemmlowp::SingleThreadGemmContext { + class C2WorkersPool; + + public: + C2GEMMContext(ThreadPool* pool) : threadPool_(pool), workersPool_(pool) {} + int max_num_threads() const { + CHECK(threadPool_); + return threadPool_->getNumThreads(); + } + C2WorkersPool* workers_pool() { + return &workersPool_; + } + + ThreadPool* threadPool() { + return threadPool_; + } + + private: + class C2WorkersPool { + public: + C2WorkersPool(ThreadPool* pool) : pool_(pool) {} + void Execute(const std::vector<gemmlowp::Task*>& tasks) { + class C2Task : public Task { + public: + C2Task(gemmlowp::Task* task) : task_(task){}; + virtual void Run() override { + CHECK(task_); + task_->Run(); + } + + private: + gemmlowp::Task* task_; + }; + std::vector<std::shared_ptr<Task>> c2tasks; + c2tasks.reserve(tasks.size()); + std::vector<gemmlowp::Allocator> allocators(tasks.size()); + + for (size_t i = 0; i < tasks.size(); ++i) { + auto* task = tasks[i]; + CHECK(task); + task->local_allocator = &allocators[i]; + c2tasks.push_back(std::shared_ptr<Task>(new C2Task(task))); + } + CHECK(pool_); + pool_->withPool([&](WorkersPool* pool) { pool->Execute(c2tasks); }); + for (auto* t : tasks) { + delete t; + } + } + + private: + ThreadPool* pool_; + }; + ThreadPool* threadPool_; + C2WorkersPool workersPool_; +}; + +enum class Activation : uint8_t { NONE = 0, RELU = 1 }; + +inline std::pair<uint8_t, uint8_t> +activationLimits(float scale, int32_t zero_point, Activation Ac) { + switch (Ac) { + case Activation::NONE: + return {std::numeric_limits<uint8_t>::min(), + std::numeric_limits<uint8_t>::max()}; + case Activation::RELU: + return {QuantizeUint8(scale, zero_point, 0.0), + std::numeric_limits<uint8_t>::max()}; + default: + __builtin_unreachable(); + } +} + +} // namespace int8 +} // namespace caffe2 + +#endif // CAFFE2_INT8_UTILS_H_ |
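A minimal sketch of how activationLimits could be called, using the Y_scale = 1.0 and Y_zero_point = 127 pair that the tests above pass to the Int8Conv operators (values below are plain arithmetic, not additional library API):

  // QuantizeUint8(1.0f, 127, 0.0f) == 127, so a fused ReLU clamps at the
  // zero point, while NONE leaves the full uint8 range available.
  const auto relu_limits =
      caffe2::int8::activationLimits(1.0f, 127, caffe2::int8::Activation::RELU);
  // relu_limits == {127, 255}
  const auto none_limits =
      caffe2::int8::activationLimits(1.0f, 127, caffe2::int8::Activation::NONE);
  // none_limits == {0, 255}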