summaryrefslogtreecommitdiff
path: root/runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h
diff options
context:
space:
mode:
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h')
-rw-r--r--runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h128
1 files changed, 128 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h b/runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h
new file mode 100644
index 000000000..52163c4e5
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/internal/pack_sse.h
@@ -0,0 +1,128 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// pack_sse.h: optimized SSE specializations of the templates in pack.h.
+
+#ifndef GEMMLOWP_INTERNAL_PACK_SSE_H_
+#define GEMMLOWP_INTERNAL_PACK_SSE_H_
+
+#include <smmintrin.h>
+#include "pack.h"
+
+namespace gemmlowp {
+
+// TODO: Add DepthMajorUint8SideMap
+
+// Map over const uint8 data using the WidthMajor side-map order.
+using WidthMajorUint8SideMap =
+    SideMap<const std::uint8_t, SideMapOrder::WidthMajor>;
+
+// Kernel side format made of NumCells cells, each a
+// CellFormat<4, 2, CellOrder::WidthMajor>.
+template <int NumCells>
+using WidthMajorSideFormatNCells4x2 =
+    KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, NumCells>;
+
+template <int Cells>  /*
+  Specialization of PackingRegisterBlock for a WidthMajorUint8SideMap source
+  and a PackedSideBlock whose format is WidthMajorSideFormatNCells4x2<Cells>.
+  Pack() is written with SSE intrinsics, including the SSE4.1 ones
+  _mm_blend_epi16 and _mm_cvtepu8_epi16.
+  NOTE(review): kRegisterSize is not defined in this header; presumably it
+  comes from pack.h - confirm.
+*/
+class PackingRegisterBlock<
+ WidthMajorUint8SideMap,
+ PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > >
+ : public PackingRegisterBlockBase<
+ WidthMajorUint8SideMap,
+ PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > > {
+ public:
+ typedef WidthMajorSideFormatNCells4x2<Cells> KernelSideFormat;
+ typedef typename KernelSideFormat::Cell CellFormat;
+ static const int kCells = KernelSideFormat::kCells;
+ static const int kCellWidth = CellFormat::kWidth;
+ static const int kKernelWidth = CellFormat::kWidth * kCells;  // total width covered by all cells
+ static const int kCellDepth = CellFormat::kDepth;
+ static const int kCellSize = CellFormat::kSize;
+ /*
+  Packs the data of complete_src_ into dst, writing from dst->current_data(),
+  and adds the byte sums of each source row to dst->sums_of_each_slice()
+  starting at index start_width. Afterwards dst is advanced by
+  kCells * kRegisterSize / kCellDepth cells.
+  NOTE(review): every 8-byte load/store below assumes kCellSize == 8 and
+  kCellWidth == 4 (a 4x2 cell) - confirm against CellFormat in the kernel
+  headers.
+ */
+ void Pack(PackedSideBlock<KernelSideFormat>* dst, int start_width) {
+ std::uint8_t* dst_ptr = dst->current_data();
+ const int width_stride = this->complete_src_.width_stride();
+ int depth_step = 8;  // each iteration reads 8 bytes per source row
+ // Eight 16-bit ones; _mm_madd_epi16 by this adds adjacent 16-bit lanes.
+ __m128i one = _mm_set1_epi16(1);
+ for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
+ cell_start_depth += depth_step) {
+ for (int cell_start_width = 0; cell_start_width < kKernelWidth;
+ cell_start_width += kCellWidth) {
+ std::int32_t* cell_sums_of_each_slice_ptr =
+ dst->sums_of_each_slice() + start_width + cell_start_width;
+ const std::uint8_t* src_data =
+ this->complete_src_.data(cell_start_width, cell_start_depth);
+ // Load 8 bytes from each of four rows spaced width_stride bytes apart.
+ __m128i xmm1 =
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src_data[0]));
+ __m128i xmm2 = _mm_loadl_epi64(
+ reinterpret_cast<const __m128i*>(&src_data[1 * width_stride]));
+ __m128i xmm3 = _mm_loadl_epi64(
+ reinterpret_cast<const __m128i*>(&src_data[2 * width_stride]));
+ __m128i xmm4 = _mm_loadl_epi64(
+ reinterpret_cast<const __m128i*>(&src_data[3 * width_stride]));
+ /* Treat each row as four byte pairs (16-bit lanes) numbered 0..3. The
+    unpack/shuffle/blend sequence below gathers equal-numbered pairs of
+    all four rows into 64 bits: xmm9 = {pair 0 | pair 2} and
+    xmm10 = {pair 1 | pair 3} (low half | high half), each half ordered
+    row 0, row 1, row 2, row 3. */
+ __m128i xmm5 = _mm_unpacklo_epi16(xmm1, xmm2);  // rows 0/1 interleaved pair by pair
+ __m128i xmm8 = _mm_shuffle_epi32(xmm5, 0x31);  // 32-bit lanes {1,0,3,0} of xmm5
+ // Same for rows 2 and 3.
+ __m128i xmm6 = _mm_unpacklo_epi16(xmm3, xmm4);  // rows 2/3 interleaved pair by pair
+ __m128i xmm7 = _mm_shuffle_epi32(xmm6, 0x80);  // 32-bit lanes {0,0,0,2} of xmm6
+ // Mask 0xcc takes 16-bit lanes 2,3,6,7 from the second operand.
+ __m128i xmm9 = _mm_blend_epi16(xmm5, xmm7, 0xcc);
+ __m128i xmm10 = _mm_blend_epi16(xmm8, xmm6, 0xcc);
+ // Store the low halves (pairs 0 and 1) kCellSize * kCells bytes apart.
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst_ptr[0]), xmm9);
+ _mm_storel_epi64(
+ reinterpret_cast<__m128i*>(&dst_ptr[kCellSize * kCells]), xmm10);
+ // 0xee copies the high 64 bits (pairs 2 and 3) into the low half.
+ __m128i xmm11 = _mm_shuffle_epi32(xmm9, 0xee);
+ __m128i xmm12 = _mm_shuffle_epi32(xmm10, 0xee);
+ // Store pairs 2 and 3 at the next two kCellSize * kCells offsets.
+ _mm_storel_epi64(
+ reinterpret_cast<__m128i*>(&dst_ptr[2 * kCellSize * kCells]),
+ xmm11);
+ _mm_storel_epi64(
+ reinterpret_cast<__m128i*>(&dst_ptr[3 * kCellSize * kCells]),
+ xmm12);
+ /* Accumulate the sums: zero-extend the low 8 bytes of each packed
+    register to 16 bits, add adjacent lanes (one 32-bit result per row)
+    and add that to the four running sums loaded from dst. */
+ xmm1 = _mm_cvtepu8_epi16(xmm9);
+ xmm2 = _mm_madd_epi16(xmm1, one);
+ __m128i sums_of_each_slice_xmm = _mm_loadu_si128(
+ reinterpret_cast<const __m128i*>(&cell_sums_of_each_slice_ptr[0]));
+ sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
+ // Pair 1 of each row.
+ xmm1 = _mm_cvtepu8_epi16(xmm10);
+ xmm2 = _mm_madd_epi16(xmm1, one);
+ sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
+ // Pair 2 of each row.
+ xmm1 = _mm_cvtepu8_epi16(xmm11);
+ xmm2 = _mm_madd_epi16(xmm1, one);
+ sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
+ // Pair 3 of each row.
+ xmm1 = _mm_cvtepu8_epi16(xmm12);
+ xmm2 = _mm_madd_epi16(xmm1, one);
+ sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
+ // Write the four updated sums back.
+ _mm_storeu_si128(
+ reinterpret_cast<__m128i*>(&cell_sums_of_each_slice_ptr[0]),
+ sums_of_each_slice_xmm);
+ dst_ptr += kCellSize;  // advance to the next cell of this group
+ }
+ dst_ptr += 3 * kCellSize * kCells;  // skip the three groups already written through the offsets above
+ }
+ dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
+ }
+};
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_PACK_SSE_H_