diff options
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/output.h')
-rw-r--r-- | runtimes/nn/depend/external/gemmlowp/internal/output.h | 435 |
1 files changed, 0 insertions, 435 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/output.h b/runtimes/nn/depend/external/gemmlowp/internal/output.h deleted file mode 100644 index 8ccb8ee1f..000000000 --- a/runtimes/nn/depend/external/gemmlowp/internal/output.h +++ /dev/null @@ -1,435 +0,0 @@ -// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// output.h: processing the 32-bit accumulators output by the unpack -// stage, obtaining the final result matrix entries and storing them into -// the destination matrix. - -#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_ -#define GEMMLOWP_INTERNAL_OUTPUT_H_ - -#include <cmath> -#include <tuple> -#include <type_traits> - -#include "../fixedpoint/fixedpoint.h" -#include "../public/output_stages.h" -#include "simd_wrappers.h" - -namespace gemmlowp { - -template <typename OutputStage, typename InputBufferType> -struct OutputStageEvalBufferImpl { - // This generic template body should never be hit. - static_assert( - std::is_same<InputBufferType, void>::value, - "Unimplemented: missing implementation of this output pipeline stage " - "for this data type. This would happen if some architecture-specific " - "SIMD back-end (output_$arch.h) were incomplete."); -}; - -template <typename OutputStage, typename InputType> -struct OutputStageEvalImpl { - static constexpr int kRows = InputType::kRows; - static constexpr int kCols = InputType::kCols; - using InputBufferType = typename InputType::BufferType; - using BufferEvalImplType = - OutputStageEvalBufferImpl<OutputStage, InputBufferType>; - using OutputBufferType = typename BufferEvalImplType::OutputType; - using OutputScalarType = typename OutputBufferType::ScalarType; - using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>; - - OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {} - - OutputType Eval(InputType input, int, int) const { - OutputType output; - output.buf = buffer_eval_impl.Eval(input.buf); - return output; - } - - const BufferEvalImplType buffer_eval_impl; -}; - -template <int Size> -struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale, - RegisterBuffer<std::int32_t, Size>> { - using InputType = RegisterBuffer<std::int32_t, Size>; - using OutputType = RegisterBuffer<std::int32_t, Size>; - - typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage; - - OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} - - OutputType Eval(InputType input) const { - const int result_shift = output_stage.result_shift; - const std::int32_t result_mult_int = output_stage.result_mult_int; - using RegisterType = typename InputType::RegisterType; - const RegisterType result_offset = - Dup<RegisterType>(output_stage.result_offset); - OutputType output; - for (int i = 0; i < InputType::kRegisterCount; i++) { - output.reg[i] = RoundingDivideByPOT( - Mul(Add(input.reg[i], result_offset), result_mult_int), result_shift); - } - return output; - } - - const OutputStage& output_stage; -}; - -template <int Rows, int Cols, VectorShape Shape> -struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>, - RegisterBlock<std::int32_t, Rows, Cols>> { - typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; - typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; - typedef OutputStageQuantizeDownInt32ToUint8ScalePC<Shape> OutputStage; - - OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} - - OutputType Eval(InputType input, int row, int col) const { - OutputType output; - const int result_shift = output_stage.result_shift; - const int pos = Shape == VectorShape::Col ? row : col; - const auto result_mult_int = - LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos); - const auto result_offset = - LoadForBroadcasting<InputType>(output_stage.result_offset, pos); - const auto dividend = BroadcastMul<InputType>( - BroadcastAdd<InputType>(input, result_offset), result_mult_int); - for (int i = 0; i < InputType::kRegisterCount; i++) { - output.buf.reg[i] = - RoundingDivideByPOT(dividend.buf.reg[i], result_shift); - } - return output; - } - - const OutputStage& output_stage; -}; - -template <int Size> -struct OutputStageEvalBufferImpl< - OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, - RegisterBuffer<std::int32_t, Size>> { - typedef RegisterBuffer<std::int32_t, Size> InputType; - typedef RegisterBuffer<std::int32_t, Size> OutputType; - - typedef OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint OutputStage; - - OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} - - OutputType Eval(InputType input) const { - OutputType output; - using RegisterType = typename InputType::RegisterType; - const RegisterType result_offset_after_shift = - Dup<RegisterType>(output_stage.result_offset_after_shift); - for (int i = 0; i < InputType::kRegisterCount; i++) { - const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul( - input.reg[i], output_stage.result_fixedpoint_multiplier); - output.reg[i] = - Add(RoundingDivideByPOT(mulhigh_val, output_stage.result_shift), - result_offset_after_shift); - } - return output; - } - - const OutputStage& output_stage; -}; - -// Implementation of OutputStageSaturatingCastToUint8 for scalar data -template <int Size> -struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, - RegisterBuffer<std::int32_t, Size>> { - typedef RegisterBuffer<std::int32_t, Size> InputType; - typedef RegisterBuffer<std::uint8_t, Size> OutputType; - static_assert(InputType::kRegisterLanes == 1, - "This path is only for scalar values"); - - typedef OutputStageSaturatingCastToUint8 OutputStage; - - OutputStageEvalBufferImpl(const OutputStage&) {} - - OutputType Eval(InputType input) const { - OutputType output; - for (int i = 0; i < InputType::kRegisterCount; i++) { - std::int32_t data = input.reg[i]; - output.reg[i] = data > 255 ? 255 : data < 0 ? 0 : data; - } - return output; - } -}; - -template <int Rows, int Cols, typename VectorType> -struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, - RegisterBlock<std::int32_t, Rows, Cols>> { - typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; - typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; - typedef OutputStageBiasAddition<VectorType> OutputStage; - - OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} - - OutputType Eval(InputType input, int row, int col) const { - const int pos = VectorType::kShape == VectorShape::Row ? col : row; - return BroadcastAdd<InputType>( - input, LoadForBroadcasting<InputType>(output_stage.bias_vector, pos)); - } - - const OutputStage& output_stage; -}; - -template <int Size> -struct OutputStageEvalBufferImpl<OutputStageClamp, - RegisterBuffer<std::int32_t, Size>> { - typedef RegisterBuffer<std::int32_t, Size> InputType; - typedef RegisterBuffer<std::int32_t, Size> OutputType; - - typedef OutputStageClamp OutputStage; - - OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} - - OutputType Eval(InputType input) const { - using RegisterType = typename InputType::RegisterType; - const RegisterType min = Dup<RegisterType>(output_stage.min); - const RegisterType max = Dup<RegisterType>(output_stage.max); - OutputType output; - for (int i = 0; i < InputType::kRegisterCount; i++) { - output.reg[i] = Min(Max(input.reg[i], min), max); - } - return output; - } - - const OutputStage& output_stage; -}; - -template <int Size> -struct OutputStageEvalBufferImpl<OutputStageTanh, - RegisterBuffer<std::int32_t, Size>> { - typedef RegisterBuffer<std::int32_t, Size> InputType; - typedef RegisterBuffer<std::int32_t, Size> OutputType; - using RegisterType = typename InputType::RegisterType; - typedef RegisterType DataType; - typedef OutputStageTanh OutputStage; - - OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) { - const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; - const std::int32_t real_amplitude_as_int32 = - output_stage.real_amplitude_as_int32; - - input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32; - input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32; - output_min = real_zero_as_int32 - real_amplitude_as_int32; - output_max = real_zero_as_int32 + real_amplitude_as_int32; - - double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32; - inverse_amplitude_neg_exponent = 0; - while (inverse_amplitude_normalized_double < 0.5) { - inverse_amplitude_normalized_double *= 2; - inverse_amplitude_neg_exponent++; - } - inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble( - inverse_amplitude_normalized_double); - - double amplitude_normalized_double = real_amplitude_as_int32; - amplitude_exponent = 0; - while (amplitude_normalized_double >= 1.0) { - amplitude_normalized_double *= 0.5; - amplitude_exponent++; - } - amplitude_normalized = - FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double); - } - - OutputType Eval(InputType input) const { - const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; - - typedef FixedPoint<DataType, 3> F3; - typedef FixedPoint<DataType, 0> F0; - - OutputType output; - - for (int i = 0; i < OutputType::kRegisterCount; i++) { - // fixed-point affine transformation - DataType input_centered = - Sub(input.reg[i], Dup<DataType>(real_zero_as_int32)); - F3 fixedpoint_input = - F3::FromRaw(input_centered) * inverse_amplitude_normalized; - // left shift - fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(), - 28 - inverse_amplitude_neg_exponent); - // fixed-point tanh and multiplication - F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized; - // right shift - DataType int32_output = - Add(Dup<DataType>(real_zero_as_int32), - ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent)); - - DataType mask_if_below_cutoff_min = - MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min)); - DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual( - input.reg[i], Dup<DataType>(input_cutoff_max)); - - output.reg[i] = SelectUsingMask( - mask_if_below_cutoff_min, Dup<DataType>(output_min), - SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max), - int32_output)); - } - return output; - } - - const OutputStage& output_stage; - std::int32_t input_cutoff_min, input_cutoff_max; - std::int32_t output_min, output_max; - FixedPoint<DataType, 0> inverse_amplitude_normalized; - int inverse_amplitude_neg_exponent; - FixedPoint<DataType, 0> amplitude_normalized; - int amplitude_exponent; -}; - -// OutputPipelineOutputType is a helper to determine the output data type of a -// pipeline, for a -// given input data type. It is a recursive template; see the explanation on -// OutputPipelineEvalImpl below. -template <typename OutputPipelineType, int FirstStage, typename InputType, - bool StopRecursion = - FirstStage == std::tuple_size<OutputPipelineType>::value> -struct OutputPipelineOutputType { - typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type - FirstStageType; - typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType - FirstStageOutputType; - typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1, - FirstStageOutputType>::Type Type; -}; - -template <typename OutputPipelineType, int FirstStage, typename InputType> -struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, - true> { - typedef InputType Type; -}; - -// OutputPipelineEvalImpl is a helper to implement the evaluation of -// the whole pipeline. It is a recursive template to implement compile-time -// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter -// is how we implement recursion: each specialization implements only -// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a -// helper to implement the termination of the recursion as a partial -// specialization below. -template <typename OutputPipelineType, int FirstStage, typename InputType, - bool StopRecursion = - FirstStage == std::tuple_size<OutputPipelineType>::value> -struct OutputPipelineEvalImpl { - typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type - FirstStageType; - typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType - FirstStageOutputType; - typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage, - InputType>::Type OutputType; - - OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline) - : head_impl(std::get<FirstStage>(output_pipeline)), - tail_impl(output_pipeline) {} - - OutputType Eval(InputType input, int row, int col) const { - // Evaluate the first stage. - FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col); - // Recurse into the remaining stages. - return tail_impl.Eval(first_stage_output, row, col); - } - - const OutputStageEvalImpl<FirstStageType, InputType> head_impl; - const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1, - FirstStageOutputType> - tail_impl; -}; - -// Specialization on 'StopRecursion' for terminating the recursion. -template <typename OutputPipelineType, int FirstStage, typename InputType> -struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> { - OutputPipelineEvalImpl(const OutputPipelineType&) {} - - InputType Eval(InputType input, int, int) const { - // Terminating the recursion. - return input; - } -}; - -template <typename RegisterBlockType, typename DstType> -struct StoreFinalOutputImpl { - static_assert(std::is_same<RegisterBlockType, void>::value, - "This generic impl should never be hit"); -}; - -template <typename ScalarType, int Rows, int Cols, typename DstType> -struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> { - using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>; - static void Run(const RegisterBlockType& src, DstType* dst, int row, - int col) { - for (int r = 0; r < Rows; r++) { - for (int c = 0; c < Cols; c++) { - *dst->data(row + r, col + c) = src.buf.reg[r + c * Rows]; - } - } - } -}; - -// StoreFinalOutput takes the final value at the end of the output pipeline and -// stores it into the destination matrix. It can be specialized for different -// data types; the generic implementation here is typically used only for plain -// old scalar (not SIMD) types. -template <typename RegisterBlockType, typename DstType> -void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) { - StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col); -} - -template <typename OutputPipelineType, typename InputType> -struct OutputPipelineExecutor { - OutputPipelineExecutor(const OutputPipelineType& output_pipeline) - : output_pipeline_eval_impl_(output_pipeline) {} - - // RunOutputPipeline is the entry point into the output pipeline evaluation - // code. It should be the only thing that unpack code calls. It takes the - // result - // of the unpack stage and stores it into the destination matrix. - template <typename DstType> - void Execute(InputType input, DstType* dst, int src_global_row, - int src_global_col, int dst_row, int dst_col) const { - // Statically assert that the output pipeline matches the given destination - // matrix's scalar type. - typedef typename OutputPipelineOutputType< - OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType - - ScalarOutputType; - typedef typename DstType::Scalar ScalarDstType; - static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value, - "mismatched destination scalar type and output pipeline"); - - // Evaluate the output pipeline. - auto output = - output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col); - // Store the result into the destination matrix. - StoreFinalOutput(output, dst, dst_row, dst_col); - } - - const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType> - output_pipeline_eval_impl_; -}; - -} // namespace gemmlowp - -#ifdef GEMMLOWP_NEON -#include "output_neon.h" -#elif defined(GEMMLOWP_SSE4) -#include "output_sse.h" -#endif - -#endif // GEMMLOWP_INTERNAL_OUTPUT_H_ |