summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorJim Bankoski <jimbankoski@google.com>2016-07-08 10:06:54 -0700
committerJim Bankoski <jimbankoski@google.com>2016-07-12 05:53:00 -0700
commit88e695146500982ac0a997c9da2a486992f28400 (patch)
tree42a0597a4bf48e0ea3bb75fd041010001ad29200 /vpx_dsp
parent45ed7effed521aef24b9f4ba8d395e824fe6d649 (diff)
downloadlibvpx-88e695146500982ac0a997c9da2a486992f28400.tar.gz
libvpx-88e695146500982ac0a997c9da2a486992f28400.tar.bz2
libvpx-88e695146500982ac0a997c9da2a486992f28400.zip
deblock filter : moved from vp8 code branch
The deblocking filters used in vp8 have been moved to vpx_dsp for use by both vp8 and vp9. Change-Id: I5209d76edafc894b550f751fc76d3aa6799b392d
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/deblock.c204
-rw-r--r--vpx_dsp/mips/deblock_msa.c683
-rw-r--r--vpx_dsp/mips/macros_msa.h4
-rw-r--r--vpx_dsp/vpx_dsp.mk3
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl12
-rw-r--r--vpx_dsp/x86/deblock_sse2.asm661
6 files changed, 1567 insertions, 0 deletions
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
new file mode 100644
index 000000000..411bc7754
--- /dev/null
+++ b/vpx_dsp/deblock.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+
+/* Table of small non-negative 16-bit values (0..14 as listed below).
+ * vpx_mbpost_proc_down_c() and vpx_mbpost_proc_down_msa() add an entry of
+ * this table, instead of a fixed constant, as the rounding term before their
+ * final ">> 4". Those readers form indices from a start offset of at most 63,
+ * a per-column offset of at most 127 and a per-row offset of at most 127, so
+ * the table has to keep at least 63 + 127 + 127 + 1 = 318 entries.
+ */
+const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
+ 14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+ 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
+ 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
+ 8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
+ 4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
+ 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
+ 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
+ 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
+ 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
+ 8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
+ 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
+ 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
+ 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
+ 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
+ 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
+ 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
+ 13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, };
+
+/* Deblocks `size` rows of `cols` pixels, reading from src_ptr and writing to
+ * dst_ptr. Each row is first filtered vertically (using the two source rows
+ * above and the two below) into dst, and then filtered horizontally in place
+ * in dst. A pixel is smoothed only when it differs from all four neighbours
+ * by less than the per-column limit f[col]; otherwise it is copied unchanged.
+ *
+ * The source must be readable two rows above the first row and two rows
+ * below the last one. The destination row must be writable from index -2 to
+ * cols + 1, because the edge pixels are replicated there before the
+ * horizontal pass.
+ */
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
+                                            unsigned char *dst_ptr,
+                                            int src_pixels_per_line,
+                                            int dst_pixels_per_line, int cols,
+                                            unsigned char *f, int size) {
+  /* Filtered pixels wait here for two columns before being stored, so the
+   * horizontal pass keeps reading unfiltered left neighbours. */
+  unsigned char pending[4];
+  int y;
+
+  for (y = 0; y < size; ++y) {
+    const unsigned char *in = src_ptr;
+    unsigned char *out = dst_ptr;
+    int x;
+
+    /* Vertical pass: src row -> dst row. */
+    for (x = 0; x < cols; ++x) {
+      const int limit = f[x];
+      const int up2 = in[x - 2 * src_pixels_per_line];
+      const int up1 = in[x - src_pixels_per_line];
+      const int dn1 = in[x + src_pixels_per_line];
+      const int dn2 = in[x + 2 * src_pixels_per_line];
+      int px = in[x];
+
+      if (abs(px - up2) < limit && abs(px - up1) < limit &&
+          abs(px - dn1) < limit && abs(px - dn2) < limit) {
+        const int above = (up2 + up1 + 1) >> 1;
+        const int below = (dn2 + dn1 + 1) >> 1;
+        const int around = (above + below + 1) >> 1;
+        px = (around + px + 1) >> 1;
+      }
+
+      out[x] = (unsigned char)px;
+    }
+
+    /* Horizontal pass, in place on the dst row. Replicate the edge pixels
+     * so the two-pixel neighbourhood exists at both ends. */
+    out[-2] = out[-1] = out[0];
+    out[cols] = out[cols + 1] = out[cols - 1];
+
+    for (x = 0; x < cols; ++x) {
+      const int limit = f[x];
+      const int lf2 = out[x - 2];
+      const int lf1 = out[x - 1];
+      const int rt1 = out[x + 1];
+      const int rt2 = out[x + 2];
+      int px = out[x];
+
+      if (abs(px - lf2) < limit && abs(px - lf1) < limit &&
+          abs(px - rt1) < limit && abs(px - rt2) < limit) {
+        const int left = (lf2 + lf1 + 1) >> 1;
+        const int right = (rt2 + rt1 + 1) >> 1;
+        const int around = (left + right + 1) >> 1;
+        px = (around + px + 1) >> 1;
+      }
+
+      pending[x & 3] = (unsigned char)px;
+
+      if (x >= 2) {
+        out[x - 2] = pending[(x - 2) & 3];
+      }
+    }
+
+    /* Flush the two results still held back (x == cols here). */
+    out[x - 2] = pending[(x - 2) & 3];
+    out[x - 1] = pending[(x - 1) & 3];
+
+    /* Advance to the next row. */
+    src_ptr += src_pixels_per_line;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+
+/* Horizontal macroblock post-processing filter, applied in place.
+ *
+ * For each of `rows` rows starting at `src` (row stride `pitch`), a window of
+ * 15 horizontally adjacent pixels centred on the current pixel is tracked
+ * through its running sum and sum of squares. Where
+ * 15 * sumsq - sum * sum < flimit the pixel is replaced by
+ * (8 + sum + pixel) >> 4; otherwise it is left unchanged.
+ *
+ * Buffer requirements: each row must be writable from index -8 to
+ * cols + 16, because the first pixel is replicated into the 8 bytes to its
+ * left and the last pixel into the 17 bytes to its right.
+ */
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
+                                 int cols, int flimit) {
+  int r, c, i;
+
+  unsigned char *s = src;
+  /* Ring buffer of results: a pixel is only stored 8 columns after it was
+   * computed, so the window keeps reading unfiltered pixels. */
+  unsigned char d[16];
+
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum = 0;
+
+    for (i = -8; i < 0; i++) {
+      s[i] = s[0];
+    }
+
+    /* 17 avoids valgrind warning - we buffer values in c in d
+     * and only write them when we've read 8 ahead...
+     */
+    for (i = 0; i < 17; i++) {
+      s[i + cols] = s[cols - 1];
+    }
+
+    /* Prime the window with s[-8..6]; the first loop iteration below
+     * slides it to s[-7..7]. */
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum += s[i];
+    }
+
+    for (c = 0; c < cols + 8; c++) {
+      /* x * y == s[c + 7]^2 - s[c - 8]^2, the change in the sum of squares. */
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum += x;
+      sumsq += x * y;
+
+      d[c & 15] = s[c];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+
+      /* Only store once a result exists for column c - 8. Without this
+       * guard the first 8 iterations copied stale or never-written d[]
+       * entries (d[15] on the first row) into the left border s[-8..-1].
+       * This matches the guard in vpx_mbpost_proc_down_c(). */
+      if (c >= 8) {
+        s[c - 8] = d[(c - 8) & 15];
+      }
+    }
+
+    s += pitch;
+  }
+}
+
+/* Vertical macroblock post-processing filter, applied in place.
+ *
+ * For each of the `cols` columns of `dst` (row stride `pitch`), a window of
+ * 15 vertically adjacent pixels is tracked through its running sum and sum
+ * of squares. Where 15 * sumsq - sum * sum < flimit the pixel is replaced by
+ * (rv + sum + pixel) >> 4, with rv taken from vpx_rv[] as a varying rounding
+ * term; otherwise it is left unchanged.
+ *
+ * Buffer requirements: every column must be writable from 8 rows above
+ * `dst` to 16 rows below the last row, because the first pixel is replicated
+ * into the 8 rows above and the last pixel into the 17 rows below.
+ */
+void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
+                            int flimit) {
+  int r, c, i;
+  /* Random start offset (0..63) into the rounding table. rand() is used
+   * because the previous rand_r(&seed) call read an uninitialized seed and
+   * rand_r() is not part of standard C. */
+  const int16_t *rv3 = &vpx_rv[63 & rand()];
+
+  for (c = 0; c < cols; c++) {
+    unsigned char *s = &dst[c];
+    int sumsq = 0;
+    int sum = 0;
+    /* Ring buffer of results: a pixel is only stored 8 rows after it was
+     * computed, so the window keeps reading unfiltered pixels. */
+    unsigned char d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i < 0; i++) {
+      s[i * pitch] = s[0];
+    }
+
+    /* 17 avoids valgrind warning - we buffer values in c in d
+     * and only write them when we've read 8 ahead...
+     */
+    for (i = 0; i < 17; i++) {
+      s[(i + rows) * pitch] = s[(rows - 1) * pitch];
+    }
+
+    /* Prime the window with rows -8..6; the first loop iteration below
+     * slides it to rows -7..7. */
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum += s[i * pitch];
+    }
+
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+      /* Store the result computed 8 rows ago, once one exists. */
+      if (r >= 8) {
+        s[-8 * pitch] = d[(r - 8) & 15];
+      }
+      s += pitch;
+    }
+  }
+}
+
+#if CONFIG_POSTPROC
+/* Runs vpx_mbpost_proc_across_ip() and then vpx_mbpost_proc_down() over the
+ * y_* plane of `post` (buffer, stride, height, width), passing q2mbl(q) as
+ * the limit to both.
+ * NOTE(review): this helper is static and nothing visible in this file calls
+ * it; YV12_BUFFER_CONFIG, q2mbl() and the un-suffixed filter names are not
+ * declared by anything visible here either - confirm it still builds when
+ * CONFIG_POSTPROC is set and whether it is needed at all.
+ */
+static void vpx_de_mblock(YV12_BUFFER_CONFIG *post,
+ int q) {
+ vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+ vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+}
+
+#endif
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
new file mode 100644
index 000000000..616721d8e
--- /dev/null
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -0,0 +1,683 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+/* Rounding table defined in vpx_dsp/deblock.c. Declared const so that this
+ * declaration is compatible with the `const int16_t vpx_rv[]` definition. */
+extern const int16_t vpx_rv[];
+
+/* Rearranges the eight vectors in0..in7 into the sixteen vectors out0..out15
+ * by interleaving bytes, then halfwords, then words. Judging by the name and
+ * the interleave sequence this is an 8x16 byte transpose - TODO confirm the
+ * exact lane layout against the MSA interleave semantics.
+ * Each odd-numbered output is derived from the even-numbered output before it
+ * with __msa_ilvl_d(x, x). The caller in this file stores only 8 bytes of
+ * each output (ST8x1_UB).
+ * The macro declares its own temp0..temp9, so those names must not be passed
+ * as arguments.
+ */
+#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7, \
+ out8, out9, out10, out11, \
+ out12, out13, out14, out15) \
+{ \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
+ temp0, temp1, temp2, temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
+ ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
+ temp0, temp1, temp2, temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out8, out10); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out12, out14); \
+ out0 = (v16u8)temp6; \
+ out2 = (v16u8)temp7; \
+ out4 = (v16u8)temp8; \
+ out6 = (v16u8)temp9; \
+ out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
+ out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
+ out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
+ out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
+ out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
+ out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
+}
+
+/* Vector form of the per-pixel deblock step of
+ * vpx_post_proc_down_and_across_mb_row_c().
+ * First `out` is set to aver(src_in, aver(aver(above2_in, above1_in),
+ * aver(below2_in, below1_in))). Then a mask is built that is set in every
+ * lane where the absolute difference between src_in and each of the four
+ * neighbours is below `ref`. Finally __msa_bmz_v() combines `out` and src_in
+ * under that mask - presumably keeping the averaged value where the mask is
+ * set and src_in elsewhere, as in the C reference; confirm against the
+ * bmz.v definition.
+ * The macro declares its own temp0/temp1, so those names must not be passed
+ * as arguments.
+ */
+#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \
+ below1_in, below2_in, ref, out) \
+{ \
+ v16u8 temp0, temp1; \
+ \
+ temp1 = __msa_aver_u_b(above2_in, above1_in); \
+ temp0 = __msa_aver_u_b(below2_in, below1_in); \
+ temp1 = __msa_aver_u_b(temp1, temp0); \
+ out = __msa_aver_u_b(src_in, temp1); \
+ temp0 = __msa_asub_u_b(src_in, above2_in); \
+ temp1 = __msa_asub_u_b(src_in, above1_in); \
+ temp0 = (temp0 < ref); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below1_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below2_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ out = __msa_bmz_v(out, src_in, temp0); \
+}
+
+/* Rearranges the sixteen vectors in0..in15 in place by interleaving bytes,
+ * then halfwords, words and doublewords; results are written to in0..in11
+ * only (in12..in15 are read but never assigned). Judging by the name this
+ * transposes a 12-column x 16-row byte tile into twelve 16-byte rows - TODO
+ * confirm the exact lane layout against the MSA interleave semantics.
+ * The macro declares its own temp0..temp9, so those names must not be passed
+ * as arguments.
+ * The interleave of in8..in11 into temp4/temp5 used to be issued twice in a
+ * row with identical operands; the redundant second copy has been dropped.
+ */
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15) \
+{ \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
+ ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
+ ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
+ ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
+ ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
+ ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \
+ temp2, temp3, temp4, temp5); \
+ ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \
+ temp6, temp7, temp8, temp9); \
+ ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
+ in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
+ ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
+ in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
+}
+
+/* Rearranges the vectors in0..in7 in place into in0..in11 by interleaving
+ * bytes, then halfwords, then words; in8..in11 are outputs only. Judging by
+ * the name this transposes a 12-column x 8-row byte tile - TODO confirm the
+ * exact lane layout against the MSA interleave semantics.
+ * Each odd-numbered output is derived from the even-numbered output before it
+ * with __msa_ilvl_d(x, x).
+ * The macro declares its own temp0..temp7, so those names must not be passed
+ * as arguments.
+ */
+#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \
+ in6, in7, in8, in9, in10, in11) \
+{ \
+ v8i16 temp0, temp1, temp2, temp3; \
+ v8i16 temp4, temp5, temp6, temp7; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
+ temp4 = __msa_ilvr_h(temp5, temp4); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
+ temp5 = __msa_ilvr_h(temp7, temp6); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ in0 = (v16u8)temp0; \
+ in2 = (v16u8)temp1; \
+ in4 = (v16u8)temp2; \
+ in6 = (v16u8)temp3; \
+ in8 = (v16u8)temp6; \
+ in10 = (v16u8)temp7; \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
+}
+
+/* MSA deblock of an 8-row band; the dispatcher in this file calls it when
+ * size == 8.
+ * Phase 1 filters vertically: for each group of 16 columns it loads source
+ * rows -2..9, applies VPX_AVER_IF_RETAIN with the limits f[col..col+15] and
+ * stores 8 result rows to dst.
+ * Phase 2 filters horizontally in place on dst: 8 rows are loaded starting
+ * 2 bytes left of dst, transposed so that columns become vectors, filtered
+ * with each column's limit splatted from f, transposed back and stored
+ * 8 bytes per row.
+ */
+static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *f_orig = f;
+ uint8_t *p_dst_st = dst_ptr;
+ uint16_t col;
+ uint64_t out0, out1, out2, out3;
+ v16u8 above2, above1, below2, below1, src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
+ v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
+
+ /* Phase 1: vertical filter, 16 columns per iteration. The five row
+ * registers are rotated through the macro arguments so only one new row
+ * is loaded per output row. */
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+
+ p_dst += 16;
+ p_src += 16;
+ f += 16;
+ }
+
+ /* One more group is filtered from full 16-byte loads, but only the low
+ * 8 bytes of each result row are stored.
+ * NOTE(review): the condition is true whenever cols >= 16, not only when
+ * 8 columns are left over, and is false for cols == 8; `cols % 16` may
+ * have been intended - confirm before changing. */
+ if (0 != (cols / 16)) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ out0 = __msa_copy_u_d((v2i64) inter0, 0);
+ out1 = __msa_copy_u_d((v2i64) inter1, 0);
+ out2 = __msa_copy_u_d((v2i64) inter2, 0);
+ out3 = __msa_copy_u_d((v2i64) inter3, 0);
+ SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64) inter4, 0);
+ out1 = __msa_copy_u_d((v2i64) inter5, 0);
+ out2 = __msa_copy_u_d((v2i64) inter6, 0);
+ out3 = __msa_copy_u_d((v2i64) inter7, 0);
+ SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+ }
+
+ /* Phase 2: horizontal filter on dst, 8 columns per iteration. Loads start
+ * 2 bytes left of the row so the two left neighbours are available. */
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+
+ for (col = 0; col < (cols / 8); ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11);
+ /* In the first group the first column stands in for its two missing
+ * left neighbours. */
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ /* In the last group the last column stands in for its two missing
+ * right neighbours. */
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
+ inter9, inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9);
+ /* Each pair of rows for the next group is loaded before the stores that
+ * free the registers it lands in, so every row is read before the
+ * current group's results overwrite its leading bytes. */
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
+/* MSA deblock of a 16-row band; the dispatcher in this file calls it when
+ * size == 16.
+ * Phase 1 filters vertically: for each group of 16 columns it loads source
+ * rows -2..17, applies VPX_AVER_IF_RETAIN with the limits f[col..col+15] and
+ * stores 16 result rows to dst. Only cols / 16 groups are processed; there
+ * is no tail handling for a remainder.
+ * Phase 2 filters horizontally in place on dst: 16 rows are loaded starting
+ * 2 bytes left of dst, transposed so that columns become vectors, filtered
+ * with each column's limit splatted from f, transposed back and stored
+ * 8 bytes per row.
+ */
+static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *p_dst_st = dst_ptr;
+ uint8_t *f_orig = f;
+ uint16_t col;
+ v16u8 above2, above1, below2, below1;
+ v16u8 src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
+ v16u8 inter7, inter8, inter9, inter10, inter11;
+ v16u8 inter12, inter13, inter14, inter15;
+
+ /* Phase 1: vertical filter, 16 columns per iteration. The five row
+ * registers are rotated through the macro arguments so only one new row
+ * is loaded per output row. */
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ src = LD_UB(p_src + 10 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+ below1 = LD_UB(p_src + 11 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+ below2 = LD_UB(p_src + 12 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+ above2 = LD_UB(p_src + 13 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+ above1 = LD_UB(p_src + 14 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+ src = LD_UB(p_src + 15 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+ below1 = LD_UB(p_src + 16 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+ below2 = LD_UB(p_src + 17 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+ ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
+ p_dst + 8 * dst_stride, dst_stride);
+ p_src += 16;
+ p_dst += 16;
+ f += 16;
+ }
+
+ /* Phase 2: horizontal filter on dst, 8 columns per iteration. Loads start
+ * 2 bytes left of the row so the two left neighbours are available. */
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+ LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15);
+
+ for (col = 0; col < cols / 8; ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
+ inter7, inter8, inter9, inter10, inter11, inter12, inter13,
+ inter14, inter15);
+ /* In the first group the first column stands in for its two missing
+ * left neighbours. */
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ /* In the last group the last column stands in for its two missing
+ * right neighbours. */
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ /* above2/above1 are reused here as the 15th and 16th output rows. */
+ VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15, above2, above1);
+
+ /* Each pair of rows for the next group is loaded before the stores that
+ * free the registers it lands in, so every row is read before the
+ * current group's results overwrite its leading bytes. */
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
+ ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
+ ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
+ LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
+ ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
+ ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
+ LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
+ ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
+ ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
+ LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
+ ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
+ ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
+/* MSA entry point for the row deblocker. Dispatches on the band height:
+ * size == 8 goes to the chroma kernel, size == 16 to the luma kernel.
+ * Any other size is silently ignored and dst is left untouched.
+ */
+void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t src_stride,
+                                              int32_t dst_stride, int32_t cols,
+                                              uint8_t *f, int32_t size) {
+  switch (size) {
+    case 8:
+      postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols,
+                                      f);
+      break;
+    case 16:
+      postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
+      break;
+    default:
+      /* Unsupported band height: nothing to do. */
+      break;
+  }
+}
+
+/* MSA counterpart of vpx_mbpost_proc_across_ip_c(): in-place horizontal
+ * filter over `rows` rows of src_ptr (row stride `pitch`).
+ * Per row it replicates the first pixel into the 8 bytes to its left and the
+ * last pixel into the 17 bytes to its right, seeds the running sum and sum
+ * of squares, and then walks the row 16 pixels at a time, replacing a pixel
+ * by (8 + sum + pixel) >> 4 in the lanes where 15 * sum_sq - sum * sum is
+ * below flimit.
+ * Only cols >> 4 groups of 16 pixels are processed; there is no tail
+ * handling for a remainder.
+ */
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
+ int32_t rows, int32_t cols, int32_t flimit) {
+ int32_t row, col, cnt;
+ uint8_t *src_dup = src_ptr;
+ v16u8 src0, src, tmp_orig;
+ v16u8 tmp = {0};
+ v16i8 zero = {0};
+ v8u16 sum_h, src_r_h, src_l_h;
+ v4u32 src_r_w, src_l_w;
+ v4i32 flimit_vec;
+
+ flimit_vec = __msa_fill_w(flimit);
+ for (row = rows; row--;) {
+ int32_t sum_sq = 0;
+ int32_t sum = 0;
+ /* Left border: 8 copies of the first pixel. */
+ src0 = (v16u8) __msa_fill_b(src_dup[0]);
+ ST8x1_UB(src0, (src_dup - 8));
+
+ /* Right border: 16 + 1 copies of the last pixel. */
+ src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]);
+ ST_UB(src0, src_dup + cols);
+ src_dup[cols + 16] = src_dup[cols - 1];
+ /* tmp_orig is all zero except lane 15, which takes lane 15 of `tmp`
+ * (zero on the first row, otherwise left over from the previous row). */
+ tmp_orig = (v16u8) __msa_ldi_b(0);
+ tmp_orig[15] = tmp[15];
+ /* Seed the sums from the 15 pixels at offsets -8..6: lane 15 (offset 7)
+ * is cleared before summing. */
+ src = LD_UB(src_dup - 8);
+ src[15] = 0;
+ ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
+ src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
+ sum_sq = HADD_SW_S32(src_r_w);
+ sum_sq += HADD_SW_S32(src_l_w);
+ sum_h = __msa_hadd_u_h(src, src);
+ sum = HADD_UH_U32(sum_h);
+ {
+ v16u8 src7, src8, src_r, src_l;
+ v16i8 mask;
+ v8u16 add_r, add_l;
+ v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
+ v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
+ v4i32 sub0, sub1, sub2, sub3;
+ v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+ v4i32 mul0, mul1, mul2, mul3;
+ v4i32 total0, total1, total2, total3;
+ v8i16 const8 = __msa_fill_h(8);
+
+ /* src7 and src8 hold the pixels 7 to the right and 8 to the left of
+ * each lane's pixel, i.e. the ones entering and leaving the window. */
+ src7 = LD_UB(src_dup + 7);
+ src8 = LD_UB(src_dup - 8);
+ for (col = 0; col < (cols >> 4); ++col) {
+ ILVRL_B2_UB(src7, src8, src_r, src_l);
+ HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
+
+ /* Running window sum for each of the 16 lanes, built as a prefix
+ * sum of the per-lane differences and carried on in `sum`. */
+ sum_r[0] = sum + sub_r[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
+ }
+ sum_l[0] = sum_r[7] + sub_l[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
+ }
+ sum = sum_l[7];
+ src = LD_UB(src_dup + 16 * col);
+ ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ /* Candidate output: (8 + sum + pixel) >> 4 in every lane. */
+ src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4);
+ src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4);
+ tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7);
+
+ /* Running sum of squares per lane: each step adds the product of
+ * the sum and the difference of the entering and leaving pixels. */
+ HADD_UB2_UH(src_r, src_l, add_r, add_l);
+ UNPCK_SH_SW(sub_r, sub0, sub1);
+ UNPCK_SH_SW(sub_l, sub2, sub3);
+ ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
+ ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
+ MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
+ mul2, mul3);
+ sum_sq0[0] = sum_sq + mul0[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
+ }
+ sum_sq1[0] = sum_sq0[3] + mul1[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
+ }
+ sum_sq2[0] = sum_sq1[3] + mul2[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
+ }
+ sum_sq3[0] = sum_sq2[3] + mul3[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
+ }
+ sum_sq = sum_sq3[3];
+
+ /* Per-lane test 15 * sum_sq - sum * sum < flimit, packed down to a
+ * byte mask that selects between the candidate and the original. */
+ UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
+ UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
+ total0 = sum_sq0 * __msa_ldi_w(15);
+ total0 -= sum0_w * sum0_w;
+ total1 = sum_sq1 * __msa_ldi_w(15);
+ total1 -= sum1_w * sum1_w;
+ total2 = sum_sq2 * __msa_ldi_w(15);
+ total2 -= sum2_w * sum2_w;
+ total3 = sum_sq3 * __msa_ldi_w(15);
+ total3 -= sum3_w * sum3_w;
+ total0 = (total0 < flimit_vec);
+ total1 = (total1 < flimit_vec);
+ total2 = (total2 < flimit_vec);
+ total3 = (total3 < flimit_vec);
+ PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+ mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
+ tmp = __msa_bmz_v(tmp, src, (v16u8) mask);
+
+ /* On the first group, overwrite the 8 left-border bytes with the
+ * upper half of tmp_orig. NOTE(review): presumably this mirrors the
+ * border bytes the C version wrote at the time - confirm it is still
+ * wanted. */
+ if (col == 0) {
+ uint64_t src_d;
+
+ src_d = __msa_copy_u_d((v2i64) tmp_orig, 1);
+ SD(src_d, (src_dup - 8));
+ }
+
+ /* Load the next group's window edges before this group's result is
+ * stored, so they are read unfiltered. */
+ src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
+ src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
+ ST_UB(tmp, (src_dup + (16 * col)));
+ }
+
+ src_dup += pitch;
+ }
+ }
+}
+
+/* MSA counterpart of vpx_mbpost_proc_down_c(): in-place vertical filter over
+ * dst_ptr (row stride `pitch`), handling 16 columns per outer iteration.
+ * For every column it tracks the sum and sum of squares of a 15-row window
+ * and, in the lanes where 15 * sumsq - sum * sum is below flimit, replaces
+ * the pixel by (rv + sum + pixel) >> 4 with rv taken from vpx_rv[].
+ * Only cols >> 4 groups of 16 columns are processed; there is no tail
+ * handling for a remainder.
+ * Each processed column group must be writable from 8 rows above dst_ptr to
+ * 16 rows below the last row, because edge rows are replicated there.
+ */
+void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
+                              int32_t cols, int32_t flimit) {
+  int32_t row, col, cnt, i;
+  /* Random start offset (0..63) into the rounding table. rand() is used
+   * because the previous rand_r(&seed) call read an uninitialized seed and
+   * rand_r() is not part of standard C. */
+  const int16_t *rv3 = &vpx_rv[63 & rand()];
+  v4i32 flimit_vec;
+  v16u8 dst7, dst8, dst_r_b, dst_l_b;
+  v16i8 mask;
+  v8u16 add_r, add_l;
+  v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
+  v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
+
+  flimit_vec = __msa_fill_w(flimit);
+
+  for (col = 0; col < (cols >> 4); ++col) {
+    uint8_t *dst_tmp = &dst_ptr[col << 4];
+    v16u8 dst;
+    v16i8 zero = {0};
+    /* Ring buffer of result rows: a row is only stored 8 rows after it was
+     * computed, so the window keeps reading unfiltered rows. */
+    v16u8 tmp[16];
+    v8i16 mult0, mult1, rv2_0, rv2_1;
+    v8i16 sum0_h = {0};
+    v8i16 sum1_h = {0};
+    v4i32 mul0 = {0};
+    v4i32 mul1 = {0};
+    v4i32 mul2 = {0};
+    v4i32 mul3 = {0};
+    v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+    v4i32 add0, add1, add2, add3;
+    const int16_t *rv2[16];
+
+    dst = LD_UB(dst_tmp);
+    /* One rounding-table pointer per column, offset as in the C version. */
+    for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
+      rv2[i] = rv3 + ((cnt * 17) & 127);
+      ++i;
+    }
+    /* Replicate the first row into the 8 rows above it. */
+    for (cnt = -8; cnt < 0; ++cnt) {
+      ST_UB(dst, dst_tmp + cnt * pitch);
+    }
+
+    /* Replicate the last row into the 17 rows below it. */
+    dst = LD_UB((dst_tmp + (rows - 1) * pitch));
+    for (cnt = rows; cnt < rows + 17; ++cnt) {
+      ST_UB(dst, dst_tmp + cnt * pitch);
+    }
+    /* Seed the per-column sum (sum0_h/sum1_h) and sum of squares
+     * (mul0..mul3) from rows -8..6. */
+    for (cnt = -8; cnt <= 6; ++cnt) {
+      dst = LD_UB(dst_tmp + (cnt * pitch));
+      UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
+      MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
+      mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0);
+      mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0);
+      mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1);
+      mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1);
+      ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
+    }
+
+    for (row = 0; row < (rows + 8); ++row) {
+      /* Gather this row's rounding term for each of the 16 columns. */
+      for (i = 0; i < 8; ++i) {
+        rv2_0[i] = *(rv2[i] + (row & 127));
+        rv2_1[i] = *(rv2[i + 8] + (row & 127));
+      }
+      /* Rows entering (+7) and leaving (-8) the window. */
+      dst7 = LD_UB(dst_tmp + (7 * pitch));
+      dst8 = LD_UB(dst_tmp - (8 * pitch));
+      ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
+
+      HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
+      UNPCK_SH_SW(sub_r, sub0, sub1);
+      UNPCK_SH_SW(sub_l, sub2, sub3);
+      sum0_h += sub_r;
+      sum1_h += sub_l;
+
+      HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
+
+      /* (a + b) * (a - b) == a^2 - b^2: update the sum of squares. */
+      ILVRL_H2_SW(zero, add_r, add0, add1);
+      ILVRL_H2_SW(zero, add_l, add2, add3);
+      mul0 += add0 * sub0;
+      mul1 += add1 * sub1;
+      mul2 += add2 * sub2;
+      mul3 += add3 * sub3;
+      dst = LD_UB(dst_tmp);
+      ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
+      /* Candidate output: (rv + sum + pixel) >> 4 in every lane. */
+      dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
+      dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
+      tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7);
+
+      /* Per-lane test 15 * sumsq - sum * sum < flimit, packed down to a
+       * byte mask that selects between the candidate and the original. */
+      UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
+      UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
+      total0 = mul0 * __msa_ldi_w(15);
+      total0 -= sum0_w * sum0_w;
+      total1 = mul1 * __msa_ldi_w(15);
+      total1 -= sum1_w * sum1_w;
+      total2 = mul2 * __msa_ldi_w(15);
+      total2 -= sum2_w * sum2_w;
+      total3 = mul3 * __msa_ldi_w(15);
+      total3 -= sum3_w * sum3_w;
+      total0 = (total0 < flimit_vec);
+      total1 = (total1 < flimit_vec);
+      total2 = (total2 < flimit_vec);
+      total3 = (total3 < flimit_vec);
+      PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+      mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
+      tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask);
+
+      /* Store the row computed 8 iterations ago, once one exists. */
+      if (row >= 8) {
+        ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
+      }
+
+      dst_tmp += pitch;
+    }
+  }
+}
diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index 91e3615cf..ea59eafe9 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -1060,6 +1060,7 @@
ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
@@ -1074,6 +1075,7 @@
out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of word elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@@ -1137,6 +1139,7 @@
out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) { \
@@ -1215,6 +1218,7 @@
out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
}
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index c73692a37..bd6d9382e 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -52,8 +52,11 @@ endif # CONFIG_VP9_HIGHBITDEPTH
ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += add_noise.c
+DSP_SRCS-yes += deblock.c
DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
endif # CONFIG_POSTPROC
DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 66d466a7f..8736e4698 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1894,6 +1894,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
specialize qw/vpx_plane_add_noise sse2 msa/;
+
+ add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+ specialize qw/vpx_mbpost_proc_down sse2 msa/;
+ $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm;
+
+ add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+ specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
+ $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm;
+
+ add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
+ specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
+
}
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
new file mode 100644
index 000000000..6df360df4
--- /dev/null
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -0,0 +1,661 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;FIRST_2_ROWS: in xmm0 = centre pixels, xmm1/xmm3 = first neighbour pair; out xmm5 = avg(xmm1,xmm3), xmm7 = 0xFF in bytes where a neighbour differs from the centre by >= flimit
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0 ; two working copies of the centre pixels for the abs-diffs
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3 ; xmm5 = rounded byte average of the two neighbours
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1 ; xmm4 = |xmm0 - xmm1| (one of the two saturated differences is always 0)
+ paddusb xmm6, xmm3 ; xmm6 = |xmm0 - xmm3|
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1 ; zero, used as the compare operand below
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4 ; saturates to 0 where abs-diff >= flimit
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1 ; 0xFF in exactly those bytes
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2 ; xmm7 = combined mask; SECOND_2_ROWS ORs more into it and uses it to keep the original pixel
+%endmacro
+;SECOND_2_ROWS: in xmm0 = centre pixels, xmm1/xmm3 = second neighbour pair, xmm5/xmm7 as left by FIRST_2_ROWS; out xmm0 = filtered pixel, or the original where the mask is set
+%macro SECOND_2_ROWS 0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1 ; keep xmm1 for the abs-diff; pavgb overwrites it next
+ pavgb xmm1, xmm3 ; xmm1 = rounded average of this neighbour pair
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2 ; xmm6 = |centre - first neighbour of this pair|
+ paddusb xmm4, xmm3 ; xmm4 = |centre - second neighbour of this pair|
+
+ pavgb xmm5, xmm1 ; average of the two pair-averages
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1 ; 0xFF where the abs-diff is >= flimit
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2 ; accumulate into the mask started by FIRST_2_ROWS
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0 ; xmm5 = filtered value: neighbour average averaged with the centre
+
+ ;decide if or not to use filtered value
+ pand xmm0, xmm7 ; original pixels where any neighbour differs by >= flimit
+ pandn xmm7, xmm5 ; filtered pixels everywhere else
+ paddusb xmm0, xmm7 ; merge; the two operands are non-zero in disjoint bytes
+%endmacro
+;UPDATE_FLIMIT: copy the next 16 threshold bytes from [rbx] to the stack slot at [rsp] and advance rbx; clobbers xmm2
+%macro UPDATE_FLIMIT 0
+ movdqa xmm2, XMMWORD PTR [rbx] ; aligned load: rbx must be 16-byte aligned
+ movdqa [rsp], xmm2 ; aligned store: rsp must be 16-byte aligned at every use
+ add rbx, 16
+%endmacro
+
+;void vpx_post_proc_down_and_across_mb_row_sse2
+;(
+; unsigned char *src_ptr, - input rows; rows -2..+2 around each row are read
+; unsigned char *dst_ptr, - output rows; written by the vertical pass, then filtered horizontally in place
+; int src_pixels_per_line, - source stride
+; int dst_pixels_per_line, - destination stride
+; int cols, - pixels per row
+; unsigned char *flimits, - per-pixel thresholds, loaded 16 at a time with movdqa
+; int size - number of rows to process
+;)
+global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vpx_post_proc_down_and_across_mb_row_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 ; 16-byte scratch slot that holds the current thresholds
+
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+ xor rdx, rdx ;col
+.nextcol:
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+
+ FIRST_2_ROWS
+
+ ;load above 2 rows
+ neg rax ; negative stride, to address the rows above
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
+
+ SECOND_2_ROWS
+
+ movdqu XMMWORD PTR [rdi], xmm0 ; vertically filtered pixels go to dst
+
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .downdone
+ UPDATE_FLIMIT
+ jmp .nextcol
+
+.downdone:
+ ; done with the all cols, start the across filtering in place
+ sub rsi, rdx ; rewind both pointers to the start of the row
+ sub rdi, rdx
+
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rdi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ mov rdx, -8
+ movq [rdi+rdx], mm1 ; writes dst[-8..-1]
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(4)
+ movq mm1, [rdi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rdi+rdx], mm1 ; writes dst[cols..cols+7]
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-16]; preload so the first delayed store below rewrites what is already there
+ movq mm1, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0 ; hold this block's result (low half) until the next iteration
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0 ; high half
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .acrossdone
+ UPDATE_FLIMIT
+ jmp .acrossnextcol
+
+.acrossdone:
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+
+ cmp edx, dword arg(4)
+ jne .throw_last_8 ; the last block ran past cols: its upper 8 results are not stored
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+ ; done with this row
+ add rsi,rax ;next src line
+ movsxd rax, dword arg(3) ;dst_pixels_per_line (sign-extended, like the initial stride load)
+ add rdi,rax ;next destination
+ movsxd rax, dword arg(2) ;src_pixels_per_line (sign-extended, like the initial stride load)
+
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
+
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret ; NOTE(review): mm0/mm1 are used above and no emms is issued here - confirm callers reset the MMX state
+%undef flimit
+
+;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vpx_rv)
+global sym(vpx_mbpost_proc_down_xmm) PRIVATE
+sym(vpx_mbpost_proc_down_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 128+16
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit4 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+ mov [rsp+128+8], eax
+ mov [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vpx_rv))]
+%endif
+
+ ;rows +=8;
+ add dword arg(2), 8
+
+ ;for(c=0; c<cols; c+=8)
+.loop_col:
+ mov rsi, arg(0) ; s
+ pxor xmm0, xmm0 ; zero, used for byte->word and word->dword unpacking
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ movsxd rdx, dword arg(2) ; rows+8, read as the 32-bit int it is
+ sub rdx, 9 ; (rows+8)-9 = index of the last real row
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+ movq xmm1, QWORD ptr[rdi] ; last row
+ mov rcx, 8
+.init_borderd: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq xmm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_border
+
+
+
+ lea rsi, [rsi + rax*8]; ; rsi = s - pitch*8 (rax is -pitch here)
+ neg rax ; back to +pitch
+
+ pxor xmm5, xmm5 ; xmm5 = running sum, 8 x 16-bit
+ pxor xmm6, xmm6 ; xmm6 = running sum of squares, columns 0-3 (32-bit)
+
+ pxor xmm7, xmm7 ; xmm7 = running sum of squares, columns 4-7 (32-bit)
+ mov rdi, rsi
+
+ mov rcx, 15 ; prime the window with the 15 rows s[-8*pitch] .. s[6*pitch]
+
+.loop_initvar:
+ movq xmm1, QWORD PTR [rdi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx ; row counter r = 0
+.loop_row:
+ movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
+ movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2 ; sum += entering row
+ psubw xmm5, xmm1 ; sum -= leaving row
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2 ; sumsq += entering row squared
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1 ; sumsq -= leaving row squared
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6 ; xmm3 = 15*sumsq (columns 0-3)
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1 ; low 16 bits of sum*sum
+
+ pmulhw xmm4, xmm4 ; high 16 bits of sum*sum
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4 ; 32-bit sum*sum, columns 0-3
+ punpckhwd xmm2, xmm4 ; 32-bit sum*sum, columns 4-7
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7 ; xmm4 = 15*sumsq (columns 4-7)
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31 ; all ones where 15*sumsq - sum*sum < flimit
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0 ; 8-byte mask: 0xFF = use the filtered value
+
+ movq xmm1, QWORD PTR [rsi+rax*8] ; the row being filtered (rsi is 8 rows above it)
+
+ movq xmm2, xmm1 ; keep the original pixels for the unmasked bytes
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vpx_rv))]
+ movdqu xmm4, [rax + rcx*2] ;8 int16 entries starting at vpx_rv[rcx]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movdqu xmm4, [r8 + rcx*2] ;8 int16 entries starting at vpx_rv[rcx]
+%else
+ movdqu xmm4, [sym(vpx_rv) + rcx*2]
+%endif
+
+ paddw xmm1, xmm4 ; pixel + sum + vpx_rv term
+ ;paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3 ; filtered bytes where the mask is set
+
+ pandn xmm3, xmm2 ; original bytes elsewhere
+ por xmm1, xmm3
+
+ and rcx, 15
+ movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+ cmp edx, 8
+ jl .skip_assignment
+
+ mov rcx, rdx
+ sub rcx, 8
+ and rcx, 15
+ movq mm0, [rsp + rcx*8] ;d[rcx*8]
+ movq [rsi], mm0 ; write back row r-8, which was read as the leaving row at the top of this iteration
+
+.skip_assignment:
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+ mov rcx, 8 ; rcx is free here; .loop_col reloads it before use
+ add arg(0), rcx ; s += 8 at full pointer width (a dword add would drop the carry on 64-bit)
+ sub dword arg(3), 8 ; cols -= 8; sub sets the flags for the jg below
+ jg .loop_col ; loop while cols > 0 (same as cmp cols,0 unless the subtraction overflows)
+
+ add rsp, 128+16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret ; NOTE(review): mm0 is used above and no emms is issued here - confirm callers reset the MMX state
+%undef flimit4
+
+
+;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vpx_mbpost_proc_across_ip_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rsi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+
+ mov rdi, -8
+ movq [rsi+rdi], mm1
+
+ ; dup the last byte into the right border
+ movsxd rax, dword arg(3) ; cols goes in rax: rdx must stay 0, it is sumsq in .ip_var_loop
+ movq mm1, [rsi + rax + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rsi+rax], mm1
+
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al ; ax = s[i]*s[i]; the upper half of eax is still 0 from movzx
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx ; rcx = column index c
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8 ; run 8 columns past the end to flush the delayed stores
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; (new+old)*(new-old) = new^2 - old^2 for each of the 4 columns
+
+ paddd xmm6, xmm2 ; lane 0: sum for column c
+ paddd xmm7, xmm1 ; lane 0: sumsq for column c
+
+ pshufd xmm6, xmm6, 0 ; broadcast lane 0 to all four lanes
+ pshufd xmm7, xmm7, 0 ; broadcast lane 0 to all four lanes
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000 (squared deltas)
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 (plain deltas)
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 (plain deltas)
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 0000 10--5 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 0000 10--5 (plain delta)
+
+ paddd xmm7, xmm3 ; xmm7 = sumsq for columns c..c+3
+ paddd xmm6, xmm4 ; xmm6 = sum for columns c..c+3
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3 ; sum*sum
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7 ; 15*sumsq
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31 ; all ones where 15*sumsq - sum*sum < flimit
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0 ; 4-byte mask: 0xFF = use the filtered value
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1 ; keep the original pixels for the unmasked bytes
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [GLOBAL(four8s)] ; rounding bias before the shift
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5 ; filtered bytes where the mask is set
+
+ pandn xmm5, xmm2 ; original bytes elsewhere
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0 ; store the result from two iterations ago; s[c-8..c-5] was already read above
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12 ; carry the last column's sumsq into lane 0 for the next group
+
+ psrldq xmm6, 12 ; carry the last column's sum into lane 0 for the next group
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret ; NOTE(review): mm0/mm1 are used above and no emms is issued here - confirm callers reset the MMX state
+%undef flimit4
+
+
+SECTION_RODATA
+align 16
+four8s: ; four dwords of 8: the bias added before the >>4 in vpx_mbpost_proc_across_ip_xmm
+ times 4 dd 8