diff options
author | Chunseok Lee <chunseok.lee@samsung.com> | 2018-05-04 17:57:16 +0900 |
---|---|---|
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2018-05-04 17:57:16 +0900 |
commit | 07659ccd9fe7b1cf1547cc6cad78bcf489f0a361 (patch) | |
tree | cf3a123812b7f1ad8b50d7d0ace891e0c03c6110 /runtimes/nn/depend/external/gemmlowp/public | |
parent | da6f7a3e8360a49fd073a6e0031a4da134d9d984 (diff) | |
download | nnfw-07659ccd9fe7b1cf1547cc6cad78bcf489f0a361.tar.gz nnfw-07659ccd9fe7b1cf1547cc6cad78bcf489f0a361.tar.bz2 nnfw-07659ccd9fe7b1cf1547cc6cad78bcf489f0a361.zip |
Imported Upstream version 0.1 upstream/0.1 submit/tizen/20180504.091146
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/public')
4 files changed, 474 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/public/bit_depth.h b/runtimes/nn/depend/external/gemmlowp/public/bit_depth.h new file mode 100644 index 000000000..6cb4ecf0d --- /dev/null +++ b/runtimes/nn/depend/external/gemmlowp/public/bit_depth.h @@ -0,0 +1,62 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// bit_depth.h: defines the settings controlling LHS/RHS bit depth + +#ifndef GEMMLOWP_PUBLIC_BIT_DEPTH_H_ +#define GEMMLOWP_PUBLIC_BIT_DEPTH_H_ + +namespace gemmlowp { + +// The range of allowed values for an operand; the static_asserts below require +// 0 <= tMinValue < tMaxValue <= 255. +template <int tMinValue, int tMaxValue> +struct OperandRange { + static const int kMinValue = tMinValue; + static const int kMaxValue = tMaxValue; + static_assert(0 <= kMinValue, ""); + static_assert(kMinValue < kMaxValue, ""); + static_assert(kMaxValue <= 255, ""); +}; + +using Uint8Range = OperandRange<0, 255>; +using Uint8RangeExcludingZero = OperandRange<1, 255>; + +template <typename tLhsRange, typename tRhsRange> +struct BitDepthParams { + using LhsRange = tLhsRange; + using RhsRange = tRhsRange; +}; + +// Default: LHS and RHS are 8bit. +using DefaultL8R8BitDepthParams = BitDepthParams<Uint8Range, Uint8Range>; + +// Variant: LHS may not take the value 0. 
This allows using +// faster kernels using signed arithmetic, see +// NEON_64bit_GEMM_Int8Operands_Int32Accumulators_AccumTwoWithin16Bits +using L8R8WithLhsNonzeroBitDepthParams = + BitDepthParams<Uint8RangeExcludingZero, Uint8Range>; + +// Deprecated: when gemmlowp used to allow requantizing 8bit +// inputs to less-than-8-bit depths, the public setting allowing +// that was DefaultL7R5BitDepthParams. That requantization +// feature has been removed, but as the whole point of that +// requantization was to make less-than-8-bit an internal +// optimization without any impact on the API (other than lowering +// accuracy), we can temporarily support users who were using it +// by mapping it to the default 8bit behavior. +using DefaultL7R5BitDepthParams = DefaultL8R8BitDepthParams; + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_BIT_DEPTH_H_ diff --git a/runtimes/nn/depend/external/gemmlowp/public/gemmlowp.h b/runtimes/nn/depend/external/gemmlowp/public/gemmlowp.h new file mode 100644 index 000000000..05b0f4714 --- /dev/null +++ b/runtimes/nn/depend/external/gemmlowp/public/gemmlowp.h @@ -0,0 +1,87 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// gemmlowp.h: the main public interface header of gemmlowp. 
+ +#ifndef GEMMLOWP_PUBLIC_GEMMLOWP_H_ +#define GEMMLOWP_PUBLIC_GEMMLOWP_H_ +#include "../internal/dispatch_gemm_shape.h" +#include "bit_depth.h" +#include "map.h" +#include "output_stages.h" + +namespace gemmlowp { + +class GemmContext : public MultiThreadGemmContext {}; + +// Computes a general matrix product ("GEMM"). +// This is a version that supports per-channel quantization: lhs_offset and +// rhs_offset have caller-chosen types (LhsOffset, RhsOffset) and are forwarded +// unchanged to DispatchGemmShape. +template <typename InputScalar, typename OutputScalar, typename BitDepthParams, + MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, + typename LhsOffset, typename RhsOffset, typename OutputPipelineType, + typename GemmContextType> +void GemmWithOutputPipelinePC(GemmContextType* context, + const MatrixMap<const InputScalar, LhsOrder>& lhs, + const MatrixMap<const InputScalar, RhsOrder>& rhs, + MatrixMap<OutputScalar, ResultOrder>* result, + const LhsOffset& lhs_offset, + const RhsOffset& rhs_offset, + const OutputPipelineType& output_pipeline) { + DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>( + context, lhs, rhs, result, lhs_offset, rhs_offset, output_pipeline); +} + +// Computes a general matrix product ("GEMM"). +// This is the legacy version that does not support per-channel quantization: +// lhs_offset and rhs_offset are single ints, each wrapped in a VectorDup (one +// entry per LHS row / per RHS column) before dispatching. +// The meaning of the offsets +// is the same as in the standard EightBitIntGemm interface +// (which is also implemented in the eight_bit_int_gemm directory). 
+template <typename InputScalar, typename OutputScalar, typename BitDepthParams, + MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, + typename OutputPipelineType, typename GemmContextType> +void GemmWithOutputPipeline(GemmContextType* context, + const MatrixMap<const InputScalar, LhsOrder>& lhs, + const MatrixMap<const InputScalar, RhsOrder>& rhs, + MatrixMap<OutputScalar, ResultOrder>* result, + int lhs_offset, int rhs_offset, + const OutputPipelineType& output_pipeline) { + typedef VectorDup<const std::int32_t, VectorShape::Col> OffsetColDup; + typedef VectorDup<const std::int32_t, VectorShape::Row> OffsetRowDup; + const OffsetColDup lhs_offset_vector(lhs_offset, lhs.rows()); + const OffsetRowDup rhs_offset_vector(rhs_offset, rhs.cols()); + DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>( + context, lhs, rhs, result, lhs_offset_vector, rhs_offset_vector, + output_pipeline); +} + +// Computes a general matrix product ("GEMM") by forwarding to +// GemmWithOutputPipeline, with Scalar as both input and output type and the +// pipeline returned by MakeStandardOutputPipeline(result_offset, +// result_mult_int, result_shift). +// The meaning of the offsets, result_mult_int and result_shift +// parameters is the same as in the standard EightBitIntGemm interface +// (which is also implemented in the eight_bit_int_gemm directory). 
+template <typename Scalar, typename BitDepthParams, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder, typename GemmContextType> +void Gemm(GemmContextType* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + GemmWithOutputPipeline<Scalar, Scalar, BitDepthParams>( + context, lhs, rhs, result, lhs_offset, rhs_offset, + MakeStandardOutputPipeline(result_offset, result_mult_int, result_shift)); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_GEMMLOWP_H_ diff --git a/runtimes/nn/depend/external/gemmlowp/public/map.h b/runtimes/nn/depend/external/gemmlowp/public/map.h new file mode 100644 index 000000000..3073e05f5 --- /dev/null +++ b/runtimes/nn/depend/external/gemmlowp/public/map.h @@ -0,0 +1,140 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// map.h: a minimalist view-existing-buffer-as-a-matrix class, +// which is how gemmlowp interfaces with external matrix data. + +#ifndef GEMMLOWP_PUBLIC_MAP_H_ +#define GEMMLOWP_PUBLIC_MAP_H_ + +#include "../internal/common.h" + +namespace gemmlowp { + +// The two storage orders allowed to map buffers as matrices: ColMajor +// means column-major, RowMajor means row-major. 
+enum class MapOrder { ColMajor, RowMajor }; + +// A MatrixMap is a view of an existing buffer as a matrix. It does not own +// the buffer. When constructed without an explicit stride, the stride is +// rows for ColMajor and cols for RowMajor. +template <typename tScalar, MapOrder tOrder> +class MatrixMap { + public: + typedef tScalar Scalar; + static const MapOrder kOrder = tOrder; + + protected: + Scalar* data_; // not owned. + int rows_, cols_, stride_; + + public: + MatrixMap() : data_(nullptr), rows_(0), cols_(0), stride_(0) {} + MatrixMap(Scalar* data, int rows, int cols) + : data_(data), + rows_(rows), + cols_(cols), + stride_(kOrder == MapOrder::ColMajor ? rows : cols) {} + MatrixMap(Scalar* data, int rows, int cols, int stride) + : data_(data), rows_(rows), cols_(cols), stride_(stride) {} + MatrixMap(const MatrixMap& other) + : data_(other.data_), + rows_(other.rows_), + cols_(other.cols_), + stride_(other.stride_) {} + + int rows() const { return rows_; } + int cols() const { return cols_; } + int stride() const { return stride_; } + int rows_stride() const { return kOrder == MapOrder::ColMajor ? 1 : stride_; } + int cols_stride() const { return kOrder == MapOrder::RowMajor ? 1 : stride_; } + Scalar* data() const { return data_; } + Scalar* data(int row, int col) const { + return data_ + row * rows_stride() + col * cols_stride(); + } + Scalar& operator()(int row, int col) const { return *data(row, col); } + + MatrixMap block(int start_row, int start_col, int block_rows, + int block_cols) const { + assert(start_row >= 0); + assert(start_row + block_rows <= rows_); + assert(start_col >= 0); + assert(start_col + block_cols <= cols_); + + return MatrixMap(data(start_row, start_col), block_rows, block_cols, + stride_); + } +}; + +enum class VectorShape { Col, Row }; + +// A VectorMap is a view of an existing buffer as a vector. It does not own +// the buffer. +template <typename tScalar, VectorShape tShape> +class VectorMap { + public: + typedef tScalar Scalar; + static const VectorShape kShape = tShape; + + protected: + Scalar* data_; // not owned. 
+ int size_; + + public: + VectorMap() : data_(nullptr), size_(0) {} + VectorMap(Scalar* data, int size) : data_(data), size_(size) {} + VectorMap(const VectorMap& other) : data_(other.data_), size_(other.size_) {} + + int size() const { return size_; } + Scalar* data() const { return data_; } + Scalar* data(int index) const { return data_ + index; } + Scalar& operator()(int index) const { return *data(index); } + + VectorMap block(int start, int len) const { + assert(start >= 0); + assert(start + len <= size_); + + return VectorMap(data(start), len); + } +}; + +// A VectorDup is a (duplicated value) vector where all components are the same. +// NOTE(review): operator() is a const member returning Scalar& bound to the +// data_ member, so calling it is only well-formed when Scalar is +// const-qualified (e.g. VectorDup<const std::int32_t, VectorShape::Col>). +template <typename tScalar, VectorShape tShape> +class VectorDup { + public: + typedef tScalar Scalar; + static const VectorShape kShape = tShape; + + protected: + Scalar data_; + int size_; + + public: + VectorDup() : data_(0), size_(0) {} + VectorDup(Scalar data, int size) : data_(data), size_(size) {} + VectorDup(const VectorDup& other) : data_(other.data_), size_(other.size_) {} + + int size() const { return size_; } + Scalar& operator()(int) const { return data_; } + + VectorDup block(int start, int len) const { + assert(start >= 0); + assert(start + len <= size_); + + return VectorDup(data_, len); + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_MAP_H_ diff --git a/runtimes/nn/depend/external/gemmlowp/public/output_stages.h b/runtimes/nn/depend/external/gemmlowp/public/output_stages.h new file mode 100644 index 000000000..23bcdc05f --- /dev/null +++ b/runtimes/nn/depend/external/gemmlowp/public/output_stages.h @@ -0,0 +1,185 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// output_stages.h: public definitions of the output stages that can +// be assembled into an output pipeline, to control how internal +// 32-bit accumulators are transformed to obtain the final uint8 +// result matrix entries. + +#ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_ +#define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_ + +#include <tuple> + +#include "../internal/common.h" + +namespace gemmlowp { + +// This output stage takes int32 values and still returns int32 values, +// but "quantized down" to the uint8 scale; in other words, its output +// is typically what one would then clamp to [0..255] and cast to uint8 +// (see OutputStageSaturatingCastToUint8). +// +// This "quantization down" process depends on 3 parameters, +// result_offset, result_mult_int, result_shift, +// and the result is: +// ((input + result_offset) * result_mult_int + rounding) >> result_shift +// where +// rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1)); +struct OutputStageQuantizeDownInt32ToUint8Scale { + std::int32_t result_offset; + std::int32_t result_mult_int; + std::int32_t result_shift; +}; + +// This output stage takes int32 values and still returns int32 values, +// but "quantized down" to the uint8 scale; in other words, its output +// is typically what one would then clamp to [0..255] and cast to uint8 +// (see OutputStageSaturatingCastToUint8). 
+// +// This "quantization down" process depends on 3 parameters, +// result_offset, result_mult_int, result_shift, +// and the result is: +// ((input + result_offset) * result_mult_int + rounding) >> result_shift +// where +// rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1)); +// +// Difference from OutputStageQuantizeDownInt32ToUint8Scale here is that each +// row or column of the output (depending on tShape) has its own result_offset +// and result_mult_int numbers. +template <VectorShape tShape> +struct OutputStageQuantizeDownInt32ToUint8ScalePC { + VectorMap<const std::int32_t, tShape> result_offset; + VectorMap<const std::int32_t, tShape> result_mult_int; + std::int32_t result_shift; +}; + +// This output stage takes int32 values and still returns int32 values, +// but "quantized down" to the uint8 scale; in other words, its output +// is typically what one would then clamp to [0..255] and cast to uint8 +// (see OutputStageSaturatingCastToUint8). +// +// This "quantization down" process depends on 3 parameters, +// result_fixedpoint_multiplier, result_shift, result_offset_after_shift, +// and the result is: +// ((FixedPointMul(input, result_fixedpoint_multiplier) + +// rounding) >> result_shift) + result_offset_after_shift +// where +// rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1)); +// and where FixedPointMul(x, y) is the nearest integer to the following +// mathematical expression, evaluated without overflow or intermediate +// rounding: +// (x * y) / 2^31 +// In practice, it is expected that FixedPointMul will be implemented +// using hardware "rounding doubling int32 multiply high" instructions, +// such as VQRDMULH on ARM. See in fixedpoint.h the generic function, +// SaturatingRoundingDoublingHighMul. +// +// Notice that the other difference from +// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset +// is applied after the multiplier and shift, not before. 
This ensures +// that no matter what the multiplier and shift are, the result offset +// is effectively integral: offsetting the final result by an integer. +// The motivation for this is to faithfully support quantization schemes +// where the formula linking quantized values to the real mathematical +// values that they represent, is of the form +// +// real_value = scale * (quantized_value - zero_point) +// +// where scale is a real number (represented in quantized form by +// result_fixedpoint_multiplier and result_shift) and zero_point +// is an integer telling which quantized value corresponds to the +// real value 0, and is represented here by (the opposite of) +// result_offset_after_shift. +// The motivation for such a quantization scheme, designed to +// ensure that 0 is always a representable value, is that in +// many applications, we need to 0-pad arrays and that can only be +// done for quantized arrays if 0 is a representable value in +// quantized form. In particular, convolution-like operations +// are often implemented using 0-padding, or "im2col"-like +// expansions that implicitly rely on 0-padding. If 0 were not +// a representable value, such operations would have to pad +// using a nonzero value, introducing bias in the computation. +struct OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint { + std::int32_t result_fixedpoint_multiplier; + std::int32_t result_shift; + std::int32_t result_offset_after_shift; +}; + +// This output stage takes int32 values that are expected to be already +// on the final uint8 scale, but not necessarily in the [0..255] range. +// It clamps them to the [0..255] range and returns them cast to uint8. +struct OutputStageSaturatingCastToUint8 {}; + +// This output stage depends on a "bias vector" that should contain int32 +// entries, and be either a row-vector of the same number of columns as the +// result matrix, or a column-vector of the same number of rows as the +// result matrix. 
This output stage takes int32 values and adds to them +// the corresponding entry of the bias vector (broadcast in the other +// direction to fit the matrix's shape), outputting int32 values. +template <typename VectorType> +struct OutputStageBiasAddition { + VectorType bias_vector; +}; + +// This output stage clamps values between the specified min and max bounds. +// It can be used to implement "rectified linear unit" activation functions +// in neural networks. +struct OutputStageClamp { + std::int32_t min; + std::int32_t max; +}; + +// NOTE(review): OutputStageTanh is not described in this header; the field +// names suggest int32 encodings of the real value 0 and of the amplitude - +// confirm against the implementation. +struct OutputStageTanh { + std::int32_t real_zero_as_int32; + std::int32_t real_amplitude_as_int32; +}; + +// An output pipeline is just a std::tuple of output stages. +// This function generates a standard output pipeline consisting of two stages: +// OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8. +inline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale, + OutputStageSaturatingCastToUint8> +MakeStandardOutputPipeline(std::int32_t result_offset, + std::int32_t result_mult_int, + std::int32_t result_shift) { + OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage; + quantize_down_stage.result_offset = result_offset; + quantize_down_stage.result_mult_int = result_mult_int; + quantize_down_stage.result_shift = result_shift; + OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(quantize_down_stage, saturating_cast_stage); +} + +// An output pipeline is just a std::tuple of output stages. +// This function generates a standard output pipeline consisting of two stages: +// OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8. 
+template <VectorShape tShape> +inline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>, + OutputStageSaturatingCastToUint8> +MakeStandardOutputPipeline( + const VectorMap<const std::int32_t, tShape>& result_offset, + const VectorMap<const std::int32_t, tShape>& result_mult_int, + std::int32_t result_shift) { + OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage; + quantize_down_stage.result_offset = result_offset; + quantize_down_stage.result_mult_int = result_mult_int; + quantize_down_stage.result_shift = result_shift; + OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(quantize_down_stage, saturating_cast_stage); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_ |