diff options
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/output_sse.h')
-rw-r--r-- | runtimes/nn/depend/external/gemmlowp/internal/output_sse.h | 354 |
1 files changed, 354 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/output_sse.h b/runtimes/nn/depend/external/gemmlowp/internal/output_sse.h new file mode 100644 index 000000000..5c0625398 --- /dev/null +++ b/runtimes/nn/depend/external/gemmlowp/internal/output_sse.h @@ -0,0 +1,354 @@ +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// output_sse.h: optimized SSE4.2 specializations of the templates in output.h. + +#ifndef GEMMLOWP_INTERNAL_OUTPUT_SSE_H_ +#define GEMMLOWP_INTERNAL_OUTPUT_SSE_H_ + +#include "output.h" + +#include <smmintrin.h> + +namespace gemmlowp { + +template <> +struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, + RegBufferInt32<4>> { + typedef RegBufferInt32<4> InputType; + typedef RegBufferUint8<4> OutputType; + + typedef OutputStageSaturatingCastToUint8 OutputStage; + + OutputStageEvalBufferImpl(const OutputStage&) {} + + OutputType Eval(InputType input) const { + OutputType output; + __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[0]); + __m128i res_8 = _mm_packus_epi16(res_16, res_16); + output.reg[0] = _mm_cvtsi128_si32(res_8); + return output; + } +}; + +template <> +struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, + RegBufferInt32<8>> { + typedef RegBufferInt32<8> InputType; + typedef RegBufferUint8<8> OutputType; + + typedef OutputStageSaturatingCastToUint8 OutputStage; + + OutputStageEvalBufferImpl(const OutputStage&) {} + + OutputType Eval(InputType input) const { + OutputType output; + __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[1]); + __m128i res_8 = _mm_packus_epi16(res_16, res_16); + output.reg[0] = _mm_extract_epi32(res_8, 0); + output.reg[1] = _mm_extract_epi32(res_8, 1); + return output; + } +}; + +template <> +struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, + RegBufferInt32<16>> { + typedef RegBufferInt32<16> InputType; + typedef RegBufferUint8<16> OutputType; + + typedef OutputStageSaturatingCastToUint8 OutputStage; + + OutputStageEvalBufferImpl(const OutputStage&) {} + + OutputType Eval(InputType input) const { + OutputType output; + __m128i res_16_0 = _mm_packs_epi32(input.reg[0], input.reg[1]); + __m128i res_16_1 = _mm_packs_epi32(input.reg[2], input.reg[3]); + output.reg[0] = _mm_packus_epi16(res_16_0, res_16_1); + return output; + } +}; + +template <> +struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, + RegBufferInt32<32>> { + typedef RegBufferInt32<32> InputType; + typedef RegBufferUint8<32> OutputType; + + typedef OutputStageSaturatingCastToUint8 OutputStage; + + OutputStageEvalBufferImpl(const OutputStage&) {} + + OutputType Eval(InputType input) const { + OutputType output; + __m128i res_16_0 = _mm_packs_epi32(input.reg[0], input.reg[1]); + __m128i res_16_1 = _mm_packs_epi32(input.reg[2], input.reg[3]); + output.reg[0] = _mm_packus_epi16(res_16_0, res_16_1); + __m128i res_16_2 = _mm_packs_epi32(input.reg[4], input.reg[5]); + __m128i res_16_3 = _mm_packs_epi32(input.reg[6], input.reg[7]); + output.reg[1] = _mm_packus_epi16(res_16_2, res_16_3); + return output; + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockInt32<4, 1>, DstType> { + static void Run(const RegBlockInt32<4, 1>& src, DstType* dst, int row, + int col) { + if (DstType::kOrder == MapOrder::ColMajor) { + StoreInt32x4(dst->data(row, col), src.buf.reg[0]); + } else { + *dst->data(row + 0, col) = GetLane<0>(src.buf.reg[0]); + *dst->data(row + 1, col) = GetLane<1>(src.buf.reg[0]); + *dst->data(row + 2, col) = GetLane<2>(src.buf.reg[0]); + *dst->data(row + 3, col) = GetLane<3>(src.buf.reg[0]); + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockInt32<8, 1>, DstType> { + static void Run(const RegBlockInt32<8, 1>& src, DstType* dst, int row, + int col) { + if (DstType::kOrder == MapOrder::ColMajor) { + StoreInt32x4(dst->data(row, col), src.buf.reg[0]); + StoreInt32x4(dst->data(row + 4, col), src.buf.reg[1]); + } else { + *dst->data(row + 0, col) = GetLane<0>(src.buf.reg[0]); + *dst->data(row + 1, col) = GetLane<1>(src.buf.reg[0]); + *dst->data(row + 2, col) = GetLane<2>(src.buf.reg[0]); + *dst->data(row + 3, col) = GetLane<3>(src.buf.reg[0]); + *dst->data(row + 4, col) = GetLane<0>(src.buf.reg[1]); + *dst->data(row + 5, col) = GetLane<1>(src.buf.reg[1]); + *dst->data(row + 6, col) = GetLane<2>(src.buf.reg[1]); + *dst->data(row + 7, col) = GetLane<3>(src.buf.reg[1]); + } + } +}; + +inline RegBlockInt32<4, 4> Transpose(const RegBlockInt32<4, 4>& src) { + __m128i t0 = _mm_unpacklo_epi32(src.buf.reg[0], src.buf.reg[1]); + __m128i t1 = _mm_unpacklo_epi32(src.buf.reg[2], src.buf.reg[3]); + __m128i t2 = _mm_unpackhi_epi32(src.buf.reg[0], src.buf.reg[1]); + __m128i t3 = _mm_unpackhi_epi32(src.buf.reg[2], src.buf.reg[3]); + + RegBlockInt32<4, 4> result; + result.buf.reg[0] = _mm_unpacklo_epi64(t0, t1); + result.buf.reg[1] = _mm_unpackhi_epi64(t0, t1); + result.buf.reg[2] = _mm_unpacklo_epi64(t2, t3); + result.buf.reg[3] = _mm_unpackhi_epi64(t2, t3); + return result; +} + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockInt32<4, 4>, DstType> { + static void Run(const RegBlockInt32<4, 4>& src, DstType* dst, int row, + int col) { + if (DstType::kOrder == MapOrder::ColMajor) { + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row, col + i), src.buf.reg[i]); + } + } else { + const auto transpose = Transpose(src); + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row + i, col), transpose.buf.reg[i]); + } + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> { + static void Run(const RegBlockInt32<8, 4>& src, DstType* dst, int row, + int col) { + if (DstType::kOrder == MapOrder::ColMajor) { + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row, col + i), src.buf.reg[2 * i]); + StoreInt32x4(dst->data(row + 4, col + i), src.buf.reg[2 * i + 1]); + } + } else { + RegBlockInt32<4, 4> top; + top.buf.reg[0] = src.buf.reg[0]; + top.buf.reg[1] = src.buf.reg[2]; + top.buf.reg[2] = src.buf.reg[4]; + top.buf.reg[3] = src.buf.reg[6]; + const auto transpose_top = Transpose(top); + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row + i, col), transpose_top.buf.reg[i]); + } + RegBlockInt32<4, 4> bottom; + bottom.buf.reg[0] = src.buf.reg[1]; + bottom.buf.reg[1] = src.buf.reg[3]; + bottom.buf.reg[2] = src.buf.reg[5]; + bottom.buf.reg[3] = src.buf.reg[7]; + const auto transpose_bottom = Transpose(bottom); + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row + 4 + i, col), transpose_bottom.buf.reg[i]); + } + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> { + static void Run(const RegBlockInt32<8, 8>& src, DstType* dst, int row, + int col) { + if (DstType::kOrder == MapOrder::ColMajor) { + for (int i = 0; i < 8; i++) { + StoreInt32x4(dst->data(row, col + i), src.buf.reg[2 * i]); + StoreInt32x4(dst->data(row + 4, col + i), src.buf.reg[2 * i + 1]); + } + } else { + RegBlockInt32<4, 4> top_left; + top_left.buf.reg[0] = src.buf.reg[0]; + top_left.buf.reg[1] = src.buf.reg[2]; + top_left.buf.reg[2] = src.buf.reg[4]; + top_left.buf.reg[3] = src.buf.reg[6]; + const auto transpose_top_left = Transpose(top_left); + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row + i, col), transpose_top_left.buf.reg[i]); + } + RegBlockInt32<4, 4> bottom_left; + bottom_left.buf.reg[0] = src.buf.reg[1]; + bottom_left.buf.reg[1] = src.buf.reg[3]; + bottom_left.buf.reg[2] = src.buf.reg[5]; + bottom_left.buf.reg[3] = src.buf.reg[7]; + const auto transpose_bottom_left = Transpose(bottom_left); + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row + 4 + i, col), + transpose_bottom_left.buf.reg[i]); + } + RegBlockInt32<4, 4> top_right; + top_right.buf.reg[0] = src.buf.reg[8]; + top_right.buf.reg[1] = src.buf.reg[10]; + top_right.buf.reg[2] = src.buf.reg[12]; + top_right.buf.reg[3] = src.buf.reg[14]; + const auto transpose_top_right = Transpose(top_right); + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row + i, col + 4), + transpose_top_right.buf.reg[i]); + } + RegBlockInt32<4, 4> bottom_right; + bottom_right.buf.reg[0] = src.buf.reg[9]; + bottom_right.buf.reg[1] = src.buf.reg[11]; + bottom_right.buf.reg[2] = src.buf.reg[13]; + bottom_right.buf.reg[3] = src.buf.reg[15]; + const auto transpose_bottom_right = Transpose(bottom_right); + for (int i = 0; i < 4; i++) { + StoreInt32x4(dst->data(row + 4 + i, col + 4), + transpose_bottom_right.buf.reg[i]); + } + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockInt32<1, 4>, DstType> { + static void Run(const RegBlockInt32<1, 4>& src, DstType* dst, int row, + int col) { + if (DstType::kOrder == MapOrder::ColMajor) { + *dst->data(row, col + 0) = GetLane<0>(src.buf.reg[0]); + *dst->data(row, col + 1) = GetLane<1>(src.buf.reg[0]); + *dst->data(row, col + 2) = GetLane<2>(src.buf.reg[0]); + *dst->data(row, col + 3) = GetLane<3>(src.buf.reg[0]); + } else { + StoreInt32x4(dst->data(row, col), src.buf.reg[0]); + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockUint8<4, 1>, DstType> { + static void Run(const RegBlockUint8<4, 1>& src, DstType* dst, int row, + int col) { + const std::uint32_t src_reg = src.buf.reg[0]; + for (int i = 0; i < 4; i++) { + *dst->data(row + i, col) = (src_reg >> (8 * i)); + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockUint8<8, 1>, DstType> { + static void Run(const RegBlockUint8<8, 1>& src, DstType* dst, int row, + int col) { + for (int i = 0; i < 4; i++) { + *dst->data(row + i, col) = (src.buf.reg[0] >> (8 * i)); + } + for (int i = 0; i < 4; i++) { + *dst->data(row + 4 + i, col) = (src.buf.reg[1] >> (8 * i)); + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockUint8<1, 4>, DstType> { + static void Run(const RegBlockUint8<1, 4>& src, DstType* dst, int row, + int col) { + for (int i = 0; i < 4; i++) { + *dst->data(row, col + i) = (src.buf.reg[0] >> (8 * i)); + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockUint8<4, 4>, DstType> { + static void Run(const RegBlockUint8<4, 4>& src, DstType* dst, int row, + int col) { + std::uint8_t buf[16]; + StoreUint8x16(buf, src.buf.reg[0]); + for (int c = 0; c < 4; c++) { + for (int r = 0; r < 4; r++) { + *dst->data(row + r, col + c) = buf[r + 4 * c]; + } + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockUint8<8, 4>, DstType> { + static void Run(const RegBlockUint8<8, 4>& src, DstType* dst, int row, + int col) { + std::uint8_t buf[32]; + StoreUint8x16(buf, src.buf.reg[0]); + StoreUint8x16(buf + 16, src.buf.reg[1]); + for (int c = 0; c < 4; c++) { + for (int r = 0; r < 8; r++) { + *dst->data(row + r, col + c) = buf[r + 8 * c]; + } + } + } +}; + +template <typename DstType> +struct StoreFinalOutputImpl<RegBlockUint8<8, 8>, DstType> { + static void Run(const RegBlockUint8<8, 8>& src, DstType* dst, int row, + int col) { + std::uint8_t buf[64]; + StoreUint8x16(buf, src.buf.reg[0]); + StoreUint8x16(buf + 16, src.buf.reg[1]); + StoreUint8x16(buf + 32, src.buf.reg[2]); + StoreUint8x16(buf + 48, src.buf.reg[3]); + for (int c = 0; c < 8; c++) { + for (int r = 0; r < 8; r++) { + *dst->data(row + r, col + c) = buf[r + 8 * c]; + } + } + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_OUTPUT_SSE_H_ |