diff options
Diffstat (limited to 'inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp')
-rw-r--r-- | inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp | 683 |
1 files changed, 683 insertions, 0 deletions
diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp new file mode 100644 index 000000000..887b4683c --- /dev/null +++ b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp @@ -0,0 +1,683 @@ +// Copyright (C) 2018 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ie_preprocess_data.hpp" +#include "ie_preprocess_data_sse42.hpp" + +#include <nmmintrin.h> // SSE 4.2 + +#include <stdint.h> + +namespace InferenceEngine { +namespace Resize { + +static inline int ceil(double value) { + __m128d t = _mm_set_sd(value); + int i = _mm_cvtsd_si32(t); + return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t)); +} + + +static inline int floor(double value) { + __m128d t = _mm_set_sd(value); + int i = _mm_cvtsd_si32(t); + return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t, i))); +} + +static inline int16_t mulq15(int16_t a, int16_t b) { + return static_cast<int16_t>(((1 << 14) + (int32_t)a * (int32_t)b) >> 15); +} + +static inline uint16_t mulq16(uint16_t a, uint16_t b) { + return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16); +} + +void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) { + Border border = {BORDER_REPLICATE, 0}; + + auto dstDims = outBlob->getTensorDesc().getDims(); + auto srcDims = inBlob->getTensorDesc().getDims(); + + auto dwidth = static_cast<const int>(dstDims[3]); + auto dheight = static_cast<const int>(dstDims[2]); + auto swidth = static_cast<const int>(srcDims[3]); + auto channels = static_cast<const int>(srcDims[1]); + + auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides(); + auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides(); + auto origSrcW = src_strides[2]; + auto origSrcH = src_strides[1] / src_strides[2]; + auto origDstW = dst_strides[2]; + auto origDstH = dst_strides[1] / dst_strides[2]; + + const int src_go_x = 0; + const int src_go_y = 0; + const int dst_go_x = 0; + const int dst_go_y = 0; + auto src_full_width = static_cast<const int>(srcDims[3]); + auto src_full_height = static_cast<const int>(srcDims[2]); + auto dst_full_width = static_cast<const int>(dstDims[3]); + auto dst_full_height = static_cast<const int>(dstDims[2]); + + const uint8_t *sptr = static_cast<uint8_t *>(inBlob->buffer()) + + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding(); + uint8_t *dptr = static_cast<uint8_t *>(outBlob->buffer()) + + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]); + auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]); + auto scale_x = static_cast<float>(src_full_width) / dst_full_width; + auto scale_y = static_cast<float>(src_full_height) / dst_full_height; + + const int BITS = 15; + const int SCALE = (1 << BITS); + const int alpha_clones_num = 4; + const int cols_block_size = 8; + const int kRowsBlockSize = 4; + + auto *pxofs1 = reinterpret_cast<int32_t *>(buffer); + auto *alpha = reinterpret_cast<int16_t *>(pxofs1 + dwidth); + auto *yofs = reinterpret_cast<int32_t *>(alpha + dwidth * alpha_clones_num); + auto *beta = reinterpret_cast<int16_t *>(yofs + dheight); + auto *tptr = reinterpret_cast<uint8_t *>(beta + dheight); + + auto tptr_ = tptr; + + tptr_[0] = (uint8_t) border.value; + tptr_[1] = (uint8_t) border.value; + tptr_[2] = (uint8_t) border.value; + tptr_[3] = (uint8_t) border.value; + tptr_[swidth + 0 + 4] = (uint8_t) border.value; + tptr_[swidth + 1 + 4] = (uint8_t) border.value; + tptr_[swidth + 2 + 4] = (uint8_t) border.value; + tptr_[swidth + 3 + 4] = (uint8_t) border.value; + tptr_[swidth * kRowsBlockSize + 0 + 4] = (uint8_t) border.value; + tptr_[swidth * kRowsBlockSize + 1 + 4] = (uint8_t) border.value; + tptr_[swidth * kRowsBlockSize + 2 + 4] = (uint8_t) border.value; + tptr_[swidth * kRowsBlockSize + 3 + 4] = (uint8_t) border.value; + + for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) { + auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5); + int32_t sx = floor(fx); + fx -= sx; + + int32_t sx0 = sx; + if (sx < 0 && border.type == BORDER_REPLICATE) { + fx = 0; + sx0 = 0; + } + + fx = fx * SCALE; + + if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) { + fx = 1.f * SCALE - 1; + sx0 = (std::max)(src_full_width - 2, 0); + } + + pxofs1[dx - dst_go_x] = kRowsBlockSize * (sx0 - src_go_x); + for (int i = 0; i < alpha_clones_num; i++) { + alpha[(dx - dst_go_x) * alpha_clones_num + i] = (int16_t) fx; + } + } + + for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) { + float fy = static_cast<float>((dy + 0.5) * scale_y - 0.5); + int32_t sy = floor(fy); + fy -= sy; + + int32_t sy0 = sy; + if (sy < 0 && border.type == BORDER_REPLICATE) { + fy = 0; + sy0 = 0; + } + + fy = fy * SCALE; + + if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) { + fy = 1.f * SCALE - 1; + sy0 = (std::max)(src_full_height - 2, 0); + } + + yofs[dy - dst_go_y] = (sy0 - src_go_y) * sstep; + beta[dy - dst_go_y] = (int16_t) fy; + } + + if (swidth < cols_block_size || dwidth < cols_block_size || dheight < kRowsBlockSize) { + auto full_pass = [&](int c, int y) { + auto sptr_ = sptr + c * origSrcW * origSrcH; + auto dptr_ = dptr + c * origDstW * origDstH; + auto tptr_ = tptr; + + for (int x = 0; x < swidth; x++) { + int val0 = (yofs[y] < 0) ? border.value : sptr_[yofs[y] + x + 0]; + int val1 = (yofs[y] / sstep + 1 >= src_full_height - src_go_y) ? border.value : sptr_[yofs[y] + x + + sstep]; + + int res = val0 + mulq15(beta[y], (int16_t) (val1 - val0)); + tptr_[x + 4] = (uint8_t) res; + } + + for (int x = 0; x < dwidth; x++) { + int val0 = tptr_[pxofs1[x] / kRowsBlockSize + 0 + 4]; + int val1 = tptr_[pxofs1[x] / kRowsBlockSize + 1 + 4]; + + int res = val0 + mulq15(alpha[x * alpha_clones_num], (int16_t) (val1 - val0)); + dptr_[y * dstep + x] = (uint8_t) res; + } + }; + + for (int c = 0; c < channels; c++) { + for (int y = 0; y < dheight; y++) { + full_pass(c, y); + } + } + + return; + } + + auto full_pass_vec = [&](const uint8_t* sptr_, uint8_t* dptr_, uint8_t* tptr_, int y) { + int32_t filtered_rows_id[4]; + for (int i = 0; i < 4; i++) { + filtered_rows_id[i] = (yofs[y + i] < 0) ? 0 : + (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) ? 0 : yofs[y + i]; + } + + __m128i b0 = _mm_set1_epi16(beta[y + 0]); + __m128i b1 = _mm_set1_epi16(beta[y + 1]); + __m128i b2 = _mm_set1_epi16(beta[y + 2]); + __m128i b3 = _mm_set1_epi16(beta[y + 3]); + + int x = 0; + vertical_pass: + for (; x <= swidth - cols_block_size; x += cols_block_size) { + __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0])), + *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1])), 1); + __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2])), + *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3])), 1); + __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0] + sstep)), + *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1] + sstep)), 1); + __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2] + sstep)), + *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3] + sstep)), 1); + + __m128i val0_0 = _mm_unpacklo_epi8(val0lo, _mm_setzero_si128()); + __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128()); + __m128i val0_2 = _mm_unpacklo_epi8(val0hi, _mm_setzero_si128()); + __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128()); + + __m128i val1_0 = _mm_unpacklo_epi8(val1lo, _mm_setzero_si128()); + __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128()); + __m128i val1_2 = _mm_unpacklo_epi8(val1hi, _mm_setzero_si128()); + __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128()); + + __m128i s0_0 = _mm_sub_epi16(val1_0, val0_0); + __m128i s0_1 = _mm_sub_epi16(val1_1, val0_1); + __m128i s0_2 = _mm_sub_epi16(val1_2, val0_2); + __m128i s0_3 = _mm_sub_epi16(val1_3, val0_3); + + __m128i t0 = _mm_mulhrs_epi16(s0_0, b0); + __m128i t1 = _mm_mulhrs_epi16(s0_1, b1); + __m128i t2 = _mm_mulhrs_epi16(s0_2, b2); + __m128i t3 = _mm_mulhrs_epi16(s0_3, b3); + + __m128i r0 = _mm_add_epi16(val0_0, t0); + __m128i r1 = _mm_add_epi16(val0_1, t1); + __m128i r2 = _mm_add_epi16(val0_2, t2); + __m128i r3 = _mm_add_epi16(val0_3, t3); + + __m128i q0 = _mm_packus_epi16(r0, r1); + __m128i q1 = _mm_packus_epi16(r2, r3); + + __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/); + __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/); + + __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15)); + __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15)); + + _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 0) * kRowsBlockSize + 4), q4); + _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 4) * kRowsBlockSize + 4), q5); + } + + if (x < swidth) { + x = swidth - cols_block_size; + goto vertical_pass; + } + + if (border.type == BORDER_CONSTANT) { + for (int i = 0; i < kRowsBlockSize; i++) { + if (yofs[y + i] < 0) { + for (x = 0; x < swidth; x++) { + int val0 = border.value; + int val1 = sptr_[yofs[y + i] + x + sstep]; + + int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0)); + tptr_[x * 4 + i + 4] = (uint8_t) res; + } + } + + if (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) { + for (x = 0; x < swidth; x++) { + int val0 = sptr_[yofs[y + i] + x]; + int val1 = border.value; + + int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0)); + tptr_[x * 4 + i + 4] = (uint8_t) res; + } + } + } + } + + x = 0; + horizontal_pass: + for (; x <= dwidth - cols_block_size; x += cols_block_size) { + __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 0) * alpha_clones_num)); + __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 2) * alpha_clones_num)); + __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 4) * alpha_clones_num)); + __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 6) * alpha_clones_num)); + + __m128i val_0 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 0] + 4)), + *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 1] + 4)), 1); + __m128i val_1 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 2] + 4)), + *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 3] + 4)), 1); + __m128i val_2 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 4] + 4)), + *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 5] + 4)), 1); + __m128i val_3 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 6] + 4)), + *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 7] + 4)), 1); + + val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0)); + val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0)); + val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0)); + val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0)); + + __m128i val0_0 = _mm_unpacklo_epi8(val_0, _mm_setzero_si128()); + __m128i val0_1 = _mm_unpacklo_epi8(val_1, _mm_setzero_si128()); + __m128i val0_2 = _mm_unpacklo_epi8(val_2, _mm_setzero_si128()); + __m128i val0_3 = _mm_unpacklo_epi8(val_3, _mm_setzero_si128()); + + __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128()); + __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128()); + __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128()); + __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128()); + + val1_0 = _mm_sub_epi16(val1_0, val0_0); + val1_1 = _mm_sub_epi16(val1_1, val0_1); + val1_2 = _mm_sub_epi16(val1_2, val0_2); + val1_3 = _mm_sub_epi16(val1_3, val0_3); + + __m128i t0 = _mm_mulhrs_epi16(val1_0, a10); + __m128i t1 = _mm_mulhrs_epi16(val1_1, a32); + __m128i t2 = _mm_mulhrs_epi16(val1_2, a54); + __m128i t3 = _mm_mulhrs_epi16(val1_3, a76); + + __m128i r0 = _mm_add_epi16(val0_0, t0); + __m128i r1 = _mm_add_epi16(val0_1, t1); + __m128i r2 = _mm_add_epi16(val0_2, t2); + __m128i r3 = _mm_add_epi16(val0_3, t3); + + __m128i q0 = _mm_packus_epi16(r0, r1); + __m128i q1 = _mm_packus_epi16(r2, r3); + + __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); + __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); + + __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/); + __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/); + + _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 0) * dstep + x), q4); + _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 1) * dstep + x), _mm_srli_si128(q4, 8)); + _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 2) * dstep + x), q5); + _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 3) * dstep + x), _mm_srli_si128(q5, 8)); + } + + if (x < dwidth) { + x = dwidth - cols_block_size; + goto horizontal_pass; + } + }; + + for (int c = 0; c < channels; c++) { + for (int y = 0; y <= dheight - kRowsBlockSize; y += kRowsBlockSize) { + auto sptr_ = sptr + c * origSrcW * origSrcH; + auto dptr_ = dptr + c * origDstW * origDstH; + auto tptr_ = tptr; + + full_pass_vec(sptr_, dptr_, tptr_, y); + + if (y + kRowsBlockSize > dheight - kRowsBlockSize) + full_pass_vec(sptr_, dptr_, tptr_, dheight - kRowsBlockSize); + } + } +} + +void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) { + auto dstDims = outBlob->getTensorDesc().getDims(); + auto srcDims = inBlob->getTensorDesc().getDims(); + + auto dwidth = static_cast<const int>(dstDims[3]); + auto dheight = static_cast<const int>(dstDims[2]); + auto swidth = static_cast<const int>(srcDims[3]); + auto sheight = static_cast<const int>(srcDims[2]); + auto channels = static_cast<const int>(srcDims[1]); + + auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides(); + auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides(); + auto origSrcW = src_strides[2]; + auto origSrcH = src_strides[1] / src_strides[2]; + auto origDstW = dst_strides[2]; + auto origDstH = dst_strides[1] / dst_strides[2]; + + const int src_go_x = 0; + const int src_go_y = 0; + const int dst_go_x = 0; + const int dst_go_y = 0; + + auto src_full_width = static_cast<const int>(srcDims[3]); + auto src_full_height = static_cast<const int>(srcDims[2]); + auto dst_full_width = static_cast<const int>(dstDims[3]); + auto dst_full_height = static_cast<const int>(dstDims[2]); + + auto sptr = static_cast<uint8_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto dptr = static_cast<uint8_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]); + auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]); + + float scale_x = static_cast<float>(src_full_width) / dst_full_width; + float scale_y = static_cast<float>(src_full_height) / dst_full_height; + + int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, scale_x); + int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y); + + auto* xsi = reinterpret_cast<uint16_t*>(buffer); + auto* ysi = xsi + dwidth; + auto* xalpha = ysi + dheight; + auto* yalpha = xalpha + dwidth*x_max_count + 8*16; + + computeResizeAreaTab(src_go_x, dst_go_x, src_full_width, dwidth, scale_x, xsi, xalpha, x_max_count); + computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count); + + int vest_sum_size = 2*swidth; + uint16_t* vert_sum = yalpha + dheight*y_max_count; + uint16_t* alpha0 = vert_sum + vest_sum_size; + uint16_t* alpha1 = alpha0 + dwidth; + uint16_t* alpha2 = alpha1 + dwidth; + uint16_t* alpha3 = alpha2 + dwidth; + uint16_t* sxid0 = alpha3 + dwidth; + uint16_t* sxid1 = sxid0 + 4*dwidth; + uint16_t* sxid2 = sxid1 + 4*dwidth; + uint16_t* sxid3 = sxid2 + 4*dwidth; + + uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3}; + uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3}; + generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid); + + auto full_pass = [&](int c, int y) { + uint8_t* pdst_row = dptr + (y * dstep) + c * origDstW * origDstH; + uint16_t* vert_sum_ = vert_sum; + + int ysi_row = ysi[y]; + + memset(vert_sum_, 0, swidth * sizeof(uint16_t)); + + for (int dy = 0; dy < y_max_count; dy++) { + uint16_t yalpha_dy = yalpha[y * y_max_count + dy]; + const uint8_t *sptr_dy = sptr + ((ysi_row + dy) * sstep) + c * origSrcW * origSrcH; + if (ysi_row + dy >= sheight) break; + + int x = 0; + + __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy); + for (; x <= swidth - 16; x += 16) { + __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x)); + + // sptr_dy[x] << 8 + __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval); + __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval); + + __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0)); + __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8)); + + vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo)); + vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo); + _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi); + } + + for (; x < swidth; x++) { + vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8)); + } + } + + if (x_max_count == 2) { + int x = 0; + for (; x <= dwidth - 8; x += 8) { + __m128i res = _mm_set1_epi16(1 << (8 - 1)); + + int id0 = xsi[x]; + + __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0)); + __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8)); + + __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2)); + __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8)); + + __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2)); + __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8)); + + __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0), + _mm_shuffle_epi8(chunk1, sx0_id1)); + __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0), + _mm_shuffle_epi8(chunk1, sx1_id1)); + + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0)); + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1)); + + res = _mm_srli_epi16(res, 8); + res = _mm_packus_epi16(res, res); + _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res); + } + + for (; x < dwidth; x++) { + uint16_t res = 1 << (8 - 1); + int id = xsi[x]; + res += mulq16(alpha0[x], vert_sum_[id + 0]); + res += mulq16(alpha1[x], vert_sum_[id + 1]); + pdst_row[x] = saturateU32toU8(res >> 8); + } + } else if (x_max_count == 3) { + int x = 0; + for (; x <= dwidth - 8; x += 8) { + __m128i res = _mm_set1_epi16(1 << (8 - 1)); + + int id0 = xsi[x]; + + __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0)); + __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8)); + __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16)); + + __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3)); + __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8)); + __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16)); + + __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3)); + __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8)); + __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16)); + + __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3)); + __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8)); + __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16)); + + __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0), + _mm_shuffle_epi8(chunk1, sx0_id1)), + _mm_shuffle_epi8(chunk2, sx0_id2)); + __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0), + _mm_shuffle_epi8(chunk1, sx1_id1)), + _mm_shuffle_epi8(chunk2, sx1_id2)); + __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0), + _mm_shuffle_epi8(chunk1, sx2_id1)), + _mm_shuffle_epi8(chunk2, sx2_id2)); + + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0)); + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1)); + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2)); + + res = _mm_srli_epi16(res, 8); + res = _mm_packus_epi16(res, res); + _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res); + } + + for (; x < dwidth; x++) { + uint16_t res = 1 << (8 - 1); + int id = xsi[x]; + res += mulq16(alpha0[x], vert_sum_[id + 0]); + res += mulq16(alpha1[x], vert_sum_[id + 1]); + res += mulq16(alpha2[x], vert_sum_[id + 2]); + pdst_row[x] = saturateU32toU8(res >> 8); + } + } else if (x_max_count == 4) { + int x = 0; + for (; x <= dwidth - 8; x += 8) { + __m128i res = _mm_set1_epi16(1 << (8 - 1)); + + int id0 = xsi[x]; + + __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0)); + __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8)); + __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16)); + __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24)); + + __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4)); + __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8)); + __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16)); + __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24)); + + __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4)); + __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8)); + __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16)); + __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24)); + + __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4)); + __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8)); + __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16)); + __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24)); + + __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4)); + __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8)); + __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16)); + __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24)); + + __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0), + _mm_shuffle_epi8(chunk1, sx0_id1)), + _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2), + _mm_shuffle_epi8(chunk3, sx0_id3))); + __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0), + _mm_shuffle_epi8(chunk1, sx1_id1)), + _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2), + _mm_shuffle_epi8(chunk3, sx1_id3))); + __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0), + _mm_shuffle_epi8(chunk1, sx2_id1)), + _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2), + _mm_shuffle_epi8(chunk3, sx2_id3))); + __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0), + _mm_shuffle_epi8(chunk1, sx3_id1)), + _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2), + _mm_shuffle_epi8(chunk3, sx3_id3))); + + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0)); + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1)); + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2)); + res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3)); + + res = _mm_srli_epi16(res, 8); + res = _mm_packus_epi16(res, res); + _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res); + } + + for (; x < dwidth; x++) { + uint16_t res = 1 << (8 - 1); + int id = xsi[x]; + res += mulq16(alpha0[x], vert_sum_[id + 0]); + res += mulq16(alpha1[x], vert_sum_[id + 1]); + res += mulq16(alpha2[x], vert_sum_[id + 2]); + res += mulq16(alpha3[x], vert_sum_[id + 3]); + pdst_row[x] = saturateU32toU8(res >> 8); + } + } else if (x_max_count <= 7) { + int x = 0; + for (; x <= dwidth - 8; x += 8) { + __m128i res = _mm_set1_epi16(1 << (16 - 8 - 1)); + for (int i = 0; i < x_max_count; i++) { + __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i], + xalpha[x * x_max_count + x_max_count * 1 + i], + xalpha[x * x_max_count + x_max_count * 2 + i], + xalpha[x * x_max_count + x_max_count * 3 + i], + xalpha[x * x_max_count + x_max_count * 4 + i], + xalpha[x * x_max_count + x_max_count * 5 + i], + xalpha[x * x_max_count + x_max_count * 6 + i], + xalpha[x * x_max_count + x_max_count * 7 + i]); + __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i], + vert_sum_[xsi[x + 1] + i], + vert_sum_[xsi[x + 2] + i], + vert_sum_[xsi[x + 3] + i], + vert_sum_[xsi[x + 4] + i], + vert_sum_[xsi[x + 5] + i], + vert_sum_[xsi[x + 6] + i], + vert_sum_[xsi[x + 7] + i]); + + res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum)); + } + res = _mm_srli_epi16(res, 8); + res = _mm_packus_epi16(res, res); + _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res); + } + + for (; x < dwidth; x++) { + uint16_t res = 1 << (8 - 1); + for (int i = 0; i < x_max_count; i++) { + uint16_t a = xalpha[x * x_max_count + i]; + int sx = xsi[x] + i; + + res += mulq16(a, vert_sum_[sx]); + } + pdst_row[x] = saturateU32toU8(res >> 8); + } + } else { + for (int x = 0; x < dwidth; x++) { + uint16_t res = 1 << (8 - 1); + __m128i vres = _mm_setzero_si128(); + int id = xsi[x]; + + int i = 0; + for (; i <= x_max_count - 8; i += 8) { + __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i)); + __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i)); + + vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s)); + } + vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2)); + vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4)); + vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8)); + res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7)); + + for (; i < x_max_count; i++) { + uint16_t a = xalpha[x * x_max_count + i]; + uint16_t s = vert_sum_[id + i]; + + res += mulq16(a, s); + } + + pdst_row[x] = saturateU32toU8(res >> 8); + } + } + }; + + for (int c = 0; c < channels; c++) { + for (int y = 0; y < dheight; y++) { + full_pass(c, y); + } + } +} + +} // namespace Resize +} // namespace InferenceEngine |