diff options
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h')
-rw-r--r-- | runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h | 128 |
1 files changed, 0 insertions, 128 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h b/runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h deleted file mode 100644 index 52163c4e5..000000000 --- a/runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// pack_SSE.h: optimized SSE specializations of the templates in pack.h. - -#ifndef GEMMLOWP_INTERNAL_PACK_SSE_H_ -#define GEMMLOWP_INTERNAL_PACK_SSE_H_ - -#include <smmintrin.h> -#include "pack.h" - -namespace gemmlowp { - -// TODO: Add DepthMajorUint8SideMap - -typedef SideMap<const std::uint8_t, SideMapOrder::WidthMajor> - WidthMajorUint8SideMap; - -template <int Cells> -using WidthMajorSideFormatNCells4x2 = - KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, Cells>; - -template <int Cells> -class PackingRegisterBlock< - WidthMajorUint8SideMap, - PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > > - : public PackingRegisterBlockBase< - WidthMajorUint8SideMap, - PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > > { - public: - typedef WidthMajorSideFormatNCells4x2<Cells> KernelSideFormat; - typedef typename KernelSideFormat::Cell CellFormat; - static const int kCells = KernelSideFormat::kCells; - static const int kCellWidth = CellFormat::kWidth; - static const int kKernelWidth = CellFormat::kWidth * kCells; - static const int kCellDepth = CellFormat::kDepth; - static const int kCellSize = CellFormat::kSize; - - void Pack(PackedSideBlock<KernelSideFormat>* dst, int start_width) { - std::uint8_t* dst_ptr = dst->current_data(); - const int width_stride = this->complete_src_.width_stride(); - int depth_step = 8; - - __m128i one = _mm_set1_epi16(1); - for (int cell_start_depth = 0; cell_start_depth < kRegisterSize; - cell_start_depth += depth_step) { - for (int cell_start_width = 0; cell_start_width < kKernelWidth; - cell_start_width += kCellWidth) { - std::int32_t* cell_sums_of_each_slice_ptr = - dst->sums_of_each_slice() + start_width + cell_start_width; - const std::uint8_t* src_data = - this->complete_src_.data(cell_start_width, cell_start_depth); - - __m128i xmm1 = - _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src_data[0])); - __m128i xmm2 = _mm_loadl_epi64( - reinterpret_cast<const __m128i*>(&src_data[1 * width_stride])); - __m128i xmm3 = _mm_loadl_epi64( - reinterpret_cast<const __m128i*>(&src_data[2 * width_stride])); - __m128i xmm4 = _mm_loadl_epi64( - reinterpret_cast<const __m128i*>(&src_data[3 * width_stride])); - - __m128i xmm5 = _mm_unpacklo_epi16(xmm1, xmm2); - __m128i xmm8 = _mm_shuffle_epi32(xmm5, 0x31); - - __m128i xmm6 = _mm_unpacklo_epi16(xmm3, xmm4); - __m128i xmm7 = _mm_shuffle_epi32(xmm6, 0x80); - - __m128i xmm9 = _mm_blend_epi16(xmm5, xmm7, 0xcc); - __m128i xmm10 = _mm_blend_epi16(xmm8, xmm6, 0xcc); - - _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst_ptr[0]), xmm9); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(&dst_ptr[kCellSize * kCells]), xmm10); - - __m128i xmm11 = _mm_shuffle_epi32(xmm9, 0xee); - __m128i xmm12 = _mm_shuffle_epi32(xmm10, 0xee); - - _mm_storel_epi64( - reinterpret_cast<__m128i*>(&dst_ptr[2 * kCellSize * kCells]), - xmm11); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(&dst_ptr[3 * kCellSize * kCells]), - xmm12); - - xmm1 = _mm_cvtepu8_epi16(xmm9); - xmm2 = _mm_madd_epi16(xmm1, one); - __m128i sums_of_each_slice_xmm = _mm_loadu_si128( - reinterpret_cast<const __m128i*>(&cell_sums_of_each_slice_ptr[0])); - sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2); - - xmm1 = _mm_cvtepu8_epi16(xmm10); - xmm2 = _mm_madd_epi16(xmm1, one); - sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2); - - xmm1 = _mm_cvtepu8_epi16(xmm11); - xmm2 = _mm_madd_epi16(xmm1, one); - sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2); - - xmm1 = _mm_cvtepu8_epi16(xmm12); - xmm2 = _mm_madd_epi16(xmm1, one); - sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2); - - _mm_storeu_si128( - reinterpret_cast<__m128i*>(&cell_sums_of_each_slice_ptr[0]), - sums_of_each_slice_xmm); - dst_ptr += kCellSize; - } - dst_ptr += 3 * kCellSize * kCells; - } - dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth); - } -}; - -} // namespace gemmlowp - -#endif // GEMMLOWP_INTERNAL_PACK_SSE_H_ |