summaryrefslogtreecommitdiff
path: root/runtimes/nn/depend/external/gemmlowp/public
diff options
context:
space:
mode:
authorChunseok Lee <chunseok.lee@samsung.com>2018-05-04 17:57:16 +0900
committerChunseok Lee <chunseok.lee@samsung.com>2018-05-04 17:57:16 +0900
commit07659ccd9fe7b1cf1547cc6cad78bcf489f0a361 (patch)
treecf3a123812b7f1ad8b50d7d0ace891e0c03c6110 /runtimes/nn/depend/external/gemmlowp/public
parentda6f7a3e8360a49fd073a6e0031a4da134d9d984 (diff)
downloadnnfw-07659ccd9fe7b1cf1547cc6cad78bcf489f0a361.tar.gz
nnfw-07659ccd9fe7b1cf1547cc6cad78bcf489f0a361.tar.bz2
nnfw-07659ccd9fe7b1cf1547cc6cad78bcf489f0a361.zip
Imported Upstream version 0.1upstream/0.1submit/tizen/20180504.091146
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/public')
-rw-r--r--runtimes/nn/depend/external/gemmlowp/public/bit_depth.h62
-rw-r--r--runtimes/nn/depend/external/gemmlowp/public/gemmlowp.h87
-rw-r--r--runtimes/nn/depend/external/gemmlowp/public/map.h140
-rw-r--r--runtimes/nn/depend/external/gemmlowp/public/output_stages.h185
4 files changed, 474 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/public/bit_depth.h b/runtimes/nn/depend/external/gemmlowp/public/bit_depth.h
new file mode 100644
index 000000000..6cb4ecf0d
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/public/bit_depth.h
@@ -0,0 +1,62 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// bit_depth.h: defines the settings controlling LHS/RHS bit depth
+
+#ifndef GEMMLOWP_PUBLIC_BIT_DEPTH_H_
+#define GEMMLOWP_PUBLIC_BIT_DEPTH_H_
+
+namespace gemmlowp {
+
+// The range of allowed values for an operand.
+template <int tMinValue, int tMaxValue>
+struct OperandRange {
+ static const int kMinValue = tMinValue;
+ static const int kMaxValue = tMaxValue;
+ static_assert(0 <= kMinValue, "");
+ static_assert(kMinValue < kMaxValue, "");
+ static_assert(kMaxValue <= 255, "");
+};
+
+using Uint8Range = OperandRange<0, 255>;
+using Uint8RangeExcludingZero = OperandRange<1, 255>;
+
+template <typename tLhsRange, typename tRhsRange>
+struct BitDepthParams {
+ using LhsRange = tLhsRange;
+ using RhsRange = tRhsRange;
+};
+
+// Default: LHS and RHS are 8bit.
+using DefaultL8R8BitDepthParams = BitDepthParams<Uint8Range, Uint8Range>;
+
+// Variant: LHS may not take the value 0. This allows using
+// faster kernels using signed arithmetic, see
+// NEON_64bit_GEMM_Int8Operands_Int32Accumulators_AccumTwoWithin16Bits
+using L8R8WithLhsNonzeroBitDepthParams =
+ BitDepthParams<Uint8RangeExcludingZero, Uint8Range>;
+
+// Deprecated: when gemmlowp used to allow requantizing 8bit
+// inputs to less-than-8-bit depths, the public setting allowing
+// that was DefaultL7R5BitDepthParams. That requantization
+// feature has been removed, but as the whole point of that
+// requantization was to make less-than-8-bit an internal
+// optimization without any impact on the API (other than lowering
+// accuracy), we can temporarily support users who were using it
+// by mapping it to the default 8bit behavior.
+using DefaultL7R5BitDepthParams = DefaultL8R8BitDepthParams;
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_PUBLIC_BIT_DEPTH_H_
diff --git a/runtimes/nn/depend/external/gemmlowp/public/gemmlowp.h b/runtimes/nn/depend/external/gemmlowp/public/gemmlowp.h
new file mode 100644
index 000000000..05b0f4714
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/public/gemmlowp.h
@@ -0,0 +1,87 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// gemmlowp.h: the main public interface header of gemmlowp.
+
+#ifndef GEMMLOWP_PUBLIC_GEMMLOWP_H_
+#define GEMMLOWP_PUBLIC_GEMMLOWP_H_
+#include "../internal/dispatch_gemm_shape.h"
+#include "bit_depth.h"
+#include "map.h"
+#include "output_stages.h"
+
+namespace gemmlowp {
+
+class GemmContext : public MultiThreadGemmContext {};
+
+// Computes a general matrix product ("GEMM").
+// This is a version that supports per channel quantization.
+template <typename InputScalar, typename OutputScalar, typename BitDepthParams,
+ MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder,
+ typename LhsOffset, typename RhsOffset, typename OutputPipelineType,
+ typename GemmContextType>
+void GemmWithOutputPipelinePC(GemmContextType* context,
+ const MatrixMap<const InputScalar, LhsOrder>& lhs,
+ const MatrixMap<const InputScalar, RhsOrder>& rhs,
+ MatrixMap<OutputScalar, ResultOrder>* result,
+ const LhsOffset& lhs_offset,
+ const RhsOffset& rhs_offset,
+ const OutputPipelineType& output_pipeline) {
+ DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>(
+ context, lhs, rhs, result, lhs_offset, rhs_offset, output_pipeline);
+}
+
+// Computes a general matrix product ("GEMM").
+// This is the legacy version that does not support per channel quantization.
+// The meaning of the offsets, result_mult_int and result_shift
+// parameters is the same as in the standard EightBitIntGemm interface
+// (which is also implemented in the eight_bit_int_gemm directory).
+template <typename InputScalar, typename OutputScalar, typename BitDepthParams,
+ MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder,
+ typename OutputPipelineType, typename GemmContextType>
+void GemmWithOutputPipeline(GemmContextType* context,
+ const MatrixMap<const InputScalar, LhsOrder>& lhs,
+ const MatrixMap<const InputScalar, RhsOrder>& rhs,
+ MatrixMap<OutputScalar, ResultOrder>* result,
+ int lhs_offset, int rhs_offset,
+ const OutputPipelineType& output_pipeline) {
+ typedef VectorDup<const std::int32_t, VectorShape::Col> OffsetColDup;
+ typedef VectorDup<const std::int32_t, VectorShape::Row> OffsetRowDup;
+ const OffsetColDup lhs_offset_vector(lhs_offset, lhs.rows());
+ const OffsetRowDup rhs_offset_vector(rhs_offset, rhs.cols());
+ DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>(
+ context, lhs, rhs, result, lhs_offset_vector, rhs_offset_vector,
+ output_pipeline);
+}
+
+// Computes a general matrix product ("GEMM").
+// The meaning of the offsets, result_mult_int and result_shift
+// parameters is the same as in the standard EightBitIntGemm interface
+// (which is also implemented in the eight_bit_int_gemm directory).
+template <typename Scalar, typename BitDepthParams, MapOrder LhsOrder,
+ MapOrder RhsOrder, MapOrder ResultOrder, typename GemmContextType>
+void Gemm(GemmContextType* context,
+ const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ GemmWithOutputPipeline<Scalar, Scalar, BitDepthParams>(
+ context, lhs, rhs, result, lhs_offset, rhs_offset,
+ MakeStandardOutputPipeline(result_offset, result_mult_int, result_shift));
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_PUBLIC_GEMMLOWP_H_
diff --git a/runtimes/nn/depend/external/gemmlowp/public/map.h b/runtimes/nn/depend/external/gemmlowp/public/map.h
new file mode 100644
index 000000000..3073e05f5
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/public/map.h
@@ -0,0 +1,140 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// map.h: a minimalist view-existing-buffer-as-a-matrix class,
+// which is how gemmlowp interfaces with external matrix data.
+
+#ifndef GEMMLOWP_PUBLIC_MAP_H_
+#define GEMMLOWP_PUBLIC_MAP_H_
+
+#include "../internal/common.h"
+
+namespace gemmlowp {
+
+// The two storage orders allowed to map buffers as matrices: ColMajor
+// means column-major, RowMajor means row-major.
+enum class MapOrder { ColMajor, RowMajor };
+
+// A MatrixMap is a view of an existing buffer as a matrix. It does not own
+// the buffer.
+template <typename tScalar, MapOrder tOrder>
+class MatrixMap {
+ public:
+ typedef tScalar Scalar;
+ static const MapOrder kOrder = tOrder;
+
+ protected:
+ Scalar* data_; // not owned.
+ int rows_, cols_, stride_;
+
+ public:
+ MatrixMap() : data_(nullptr), rows_(0), cols_(0), stride_(0) {}
+ MatrixMap(Scalar* data, int rows, int cols)
+ : data_(data),
+ rows_(rows),
+ cols_(cols),
+ stride_(kOrder == MapOrder::ColMajor ? rows : cols) {}
+ MatrixMap(Scalar* data, int rows, int cols, int stride)
+ : data_(data), rows_(rows), cols_(cols), stride_(stride) {}
+ MatrixMap(const MatrixMap& other)
+ : data_(other.data_),
+ rows_(other.rows_),
+ cols_(other.cols_),
+ stride_(other.stride_) {}
+
+ int rows() const { return rows_; }
+ int cols() const { return cols_; }
+ int stride() const { return stride_; }
+ int rows_stride() const { return kOrder == MapOrder::ColMajor ? 1 : stride_; }
+ int cols_stride() const { return kOrder == MapOrder::RowMajor ? 1 : stride_; }
+ Scalar* data() const { return data_; }
+ Scalar* data(int row, int col) const {
+ return data_ + row * rows_stride() + col * cols_stride();
+ }
+ Scalar& operator()(int row, int col) const { return *data(row, col); }
+
+ MatrixMap block(int start_row, int start_col, int block_rows,
+ int block_cols) const {
+ assert(start_row >= 0);
+ assert(start_row + block_rows <= rows_);
+ assert(start_col >= 0);
+ assert(start_col + block_cols <= cols_);
+
+ return MatrixMap(data(start_row, start_col), block_rows, block_cols,
+ stride_);
+ }
+};
+
+enum class VectorShape { Col, Row };
+
+// A VectorMap is a view of an existing buffer as a vector. It does not own
+// the buffer.
+template <typename tScalar, VectorShape tShape>
+class VectorMap {
+ public:
+ typedef tScalar Scalar;
+ static const VectorShape kShape = tShape;
+
+ protected:
+ Scalar* data_; // not owned.
+ int size_;
+
+ public:
+ VectorMap() : data_(nullptr), size_(0) {}
+ VectorMap(Scalar* data, int size) : data_(data), size_(size) {}
+ VectorMap(const VectorMap& other) : data_(other.data_), size_(other.size_) {}
+
+ int size() const { return size_; }
+ Scalar* data() const { return data_; }
+ Scalar* data(int index) const { return data_ + index; }
+ Scalar& operator()(int index) const { return *data(index); }
+
+ VectorMap block(int start, int len) const {
+ assert(start >= 0);
+ assert(start + len <= size_);
+
+ return VectorMap(data(start), len);
+ }
+};
+
+// A VectorDup is a (duplicated value) vector where all components are the same.
+template <typename tScalar, VectorShape tShape>
+class VectorDup {
+ public:
+ typedef tScalar Scalar;
+ static const VectorShape kShape = tShape;
+
+ protected:
+ Scalar data_;
+ int size_;
+
+ public:
+ VectorDup() : data_(0), size_(0) {}
+ VectorDup(Scalar data, int size) : data_(data), size_(size) {}
+ VectorDup(const VectorDup& other) : data_(other.data_), size_(other.size_) {}
+
+ int size() const { return size_; }
+ Scalar& operator()(int) const { return data_; }
+
+ VectorDup block(int start, int len) const {
+ assert(start >= 0);
+ assert(start + len <= size_);
+
+ return VectorDup(data_, len);
+ }
+};
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_PUBLIC_MAP_H_
diff --git a/runtimes/nn/depend/external/gemmlowp/public/output_stages.h b/runtimes/nn/depend/external/gemmlowp/public/output_stages.h
new file mode 100644
index 000000000..23bcdc05f
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/public/output_stages.h
@@ -0,0 +1,185 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// output_stages.h: public definitions of the output stages that can
+// be assembled into an output pipeline, to control how internal
+// 32-bit accumulators are transformed to obtain the final uint8
+// result matrix entries.
+
+#ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
+#define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
+
+#include <tuple>
+
+#include "../internal/common.h"
+
+namespace gemmlowp {
+
+// This output stage takes int32 values and returns still int32 values,
+// but "quantized down" to the uint8 scale; in other words, its output
+// is typically what one would then clamp to [0..255] and cast to uint8
+// (see OutputStageSaturatingCastToUint8).
+//
+// This "quantization down" process depends on 3 parameters,
+// result_offset, result_mult_int, result_shift,
+// and the result is:
+// ((input + result_offset) * result_mult_int + rounding) >> result_shift
+// where
+// rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
+struct OutputStageQuantizeDownInt32ToUint8Scale {
+ std::int32_t result_offset;
+ std::int32_t result_mult_int;
+ std::int32_t result_shift;
+};
+
+// This output stage takes int32 values and returns still int32 values,
+// but "quantized down" to the uint8 scale; in other words, its output
+// is typically what one would then clamp to [0..255] and cast to uint8
+// (see OutputStageSaturatingCastToUint8).
+//
+// This "quantization down" process depends on 3 parameters,
+// result_offset, result_mult_int, result_shift,
+// and the result is:
+// ((input + result_offset) * result_mult_int + rounding) >> result_shift
+// where
+// rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
+//
+// Difference from OutputStageQuantizeDownInt32ToUint8Scale here is that each
+// row or column of the output (depending on tShape) has its own result_offset
+// and result_mult_int numbers.
+template <VectorShape tShape>
+struct OutputStageQuantizeDownInt32ToUint8ScalePC {
+ VectorMap<const std::int32_t, tShape> result_offset;
+ VectorMap<const std::int32_t, tShape> result_mult_int;
+ std::int32_t result_shift;
+};
+
+// This output stage takes int32 values and returns still int32 values,
+// but "quantized down" to the uint8 scale; in other words, its output
+// is typically what one would then clamp to [0..255] and cast to uint8
+// (see OutputStageSaturatingCastToUint8).
+//
+// This "quantization down" process depends on 3 parameters,
+// result_offset, result_fixedpoint_multiplier, result_shift,
+// and the result is:
+// ((FixedPointMul(input, result_fixedpoint_multiplier) +
+// rounding) >> result_shift) + result_offset_after_shift
+// where
+// rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
+// and where FixedPointMul(x, y) is the nearest integer to the following
+// mathematical expression, evaluated without overflow or intermediate
+// rounding:
+// (x * y) / 2^31
+// In practice, it is expected that FixedPointMul will be implemented
+// using hardware "rounding doubling int32 multiply high" instructions,
+// such as VQRDMULH on ARM. See in fixedpoint.h the generic function,
+// SaturatingRoundingDoublingHighMul.
+//
+// Notice that the other difference from
+// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset
+// is applied after the multiplier and shift, not before. This ensures
+// that no matter what the multiplier and shift are, the result offset
+// is effectively integral: offsetting the final result by an integer.
+// The motivation for this is to faithfully support quantization schemes
+// where the formula linking quantized values to the real mathematical
+// values that they represent, is of the form
+//
+// real_value = scale * (quantized_value - zero_point)
+//
+// where scale is a real number (represented in quantized form by
+// result_fixedpoint_multiplier and result_shift) and zero_point
+// is an integer telling which quantized value correspond to the
+// real value 0, and is represented here by (the opposite of)
+// result_offset_after_shift.
+// The motivation for such a quantization scheme, designed to
+// ensure that 0 is always a representable value, is that in
+// many applications, we need to 0-pad arrays and that can only be
+// done for quantized arrays if 0 is a representable value in
+// quantized form. In particular, convolution-like operations
+// are often implemented using 0-padding, or "im2col"-like
+// expansions that implicitly rely on 0-padding. If 0 were not
+// a representable value, such operations would have to pad
+// using a nonzero value, introducing bias in the computation.
+struct OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint {
+ std::int32_t result_fixedpoint_multiplier;
+ std::int32_t result_shift;
+ std::int32_t result_offset_after_shift;
+};
+
+// This output stage takes int32 values that are expected to be already
+// on the final uint8 scale, but not necessarily in the [0..255] range.
+// It clamps them to the [0..255] range and returns them casted to uint8.
+struct OutputStageSaturatingCastToUint8 {};
+
+// This output stage depends on a "bias vector" that should contain int32
+// entries, and be either a row-vector of the same number of columns as the
+// result matrix, or a column-vector of the same number of rows as the
+// result matrix. This output stage takes int32 values and adds to them
+// the corresponding entry of the bias vector (broadcasted in the other
+// direction to fit the matrix's shape), outputting int32 values.
+template <typename VectorType>
+struct OutputStageBiasAddition {
+ VectorType bias_vector;
+};
+
+// This output stage clamps value between the specified min and max bounds.
+// It can be used to implement "rectified linear unit" activation functions
+// in neural networks.
+struct OutputStageClamp {
+ std::int32_t min;
+ std::int32_t max;
+};
+
+struct OutputStageTanh {
+ std::int32_t real_zero_as_int32;
+ std::int32_t real_amplitude_as_int32;
+};
+
+// An output pipeline is just a std::tuple of output stages.
+// This function generates a standard output pipeline consisting of two stages:
+// OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8.
+inline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
+ OutputStageSaturatingCastToUint8>
+MakeStandardOutputPipeline(std::int32_t result_offset,
+ std::int32_t result_mult_int,
+ std::int32_t result_shift) {
+ OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage;
+ quantize_down_stage.result_offset = result_offset;
+ quantize_down_stage.result_mult_int = result_mult_int;
+ quantize_down_stage.result_shift = result_shift;
+ OutputStageSaturatingCastToUint8 saturating_cast_stage;
+ return std::make_tuple(quantize_down_stage, saturating_cast_stage);
+}
+
+// An output pipeline is just a std::tuple of output stages.
+// This function generates a standard output pipeline consisting of two stages:
+// OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8.
+template <VectorShape tShape>
+inline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>,
+ OutputStageSaturatingCastToUint8>
+MakeStandardOutputPipeline(
+ const VectorMap<const std::int32_t, tShape>& result_offset,
+ const VectorMap<const std::int32_t, tShape>& result_mult_int,
+ std::int32_t result_shift) {
+ OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage;
+ quantize_down_stage.result_offset = result_offset;
+ quantize_down_stage.result_mult_int = result_mult_int;
+ quantize_down_stage.result_shift = result_shift;
+ OutputStageSaturatingCastToUint8 saturating_cast_stage;
+ return std::make_tuple(quantize_down_stage, saturating_cast_stage);
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_