deblock filter : moved from vp8 code branch

The deblocking filters used in vp8 have been moved to vpx_dsp for use by both vp8 and vp9. Change-Id: I5209d76edafc894b550f751fc76d3aa6799b392d
author: Jim Bankoski <jimbankoski@google.com> 2016-07-08 10:06:54 -0700
committer: Jim Bankoski <jimbankoski@google.com> 2016-07-12 05:53:00 -0700
commit: 88e695146500982ac0a997c9da2a486992f28400 (patch)
tree: 42a0597a4bf48e0ea3bb75fd041010001ad29200 /vpx_dsp
parent: 45ed7effed521aef24b9f4ba8d395e824fe6d649 (diff)
download: libvpx-88e695146500982ac0a997c9da2a486992f28400.tar.gz
libvpx-88e695146500982ac0a997c9da2a486992f28400.tar.bz2
libvpx-88e695146500982ac0a997c9da2a486992f28400.zip
6 files changed, 1567 insertions, 0 deletions
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
new file mode 100644
index 000000000..411bc7754
--- /dev/null
+++ b/vpx_dsp/deblock.c
@@ -0,0 +1,204 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+
+const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
+    14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+    8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
+    13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
+    8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
+    4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
+    4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
+    10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
+    5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
+    10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
+    8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
+    2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
+    1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
+    7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
+    5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
+    10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
+    4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
+    13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, };
+
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
+                                            unsigned char *dst_ptr,
+                                            int src_pixels_per_line,
+                                            int dst_pixels_per_line, int cols,
+                                            unsigned char *f, int size) {
+  unsigned char *p_src, *p_dst;
+  int row;
+  int col;
+  unsigned char v;
+  unsigned char d[4];
+
+  for (row = 0; row < size; row++) {
+    /* post_proc_down for one row */
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+      unsigned char p_above1 = p_src[col - src_pixels_per_line];
+      unsigned char p_below1 = p_src[col + src_pixels_per_line];
+      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+
+      v = p_src[col];
+
+      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
+          && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+        unsigned char k1, k2, k3;
+        k1 = (p_above2 + p_above1 + 1) >> 1;
+        k2 = (p_below2 + p_below1 + 1) >> 1;
+        k3 = (k1 + k2 + 1) >> 1;
+        v = (k3 + v + 1) >> 1;
+      }
+
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    p_src[-2] = p_src[-1] = p_src[0];
+    p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
+
+    for (col = 0; col < cols; col++) {
+      v = p_src[col];
+
+      if ((abs(v - p_src[col - 2]) < f[col])
+          && (abs(v - p_src[col - 1]) < f[col])
+          && (abs(v - p_src[col + 1]) < f[col])
+          && (abs(v - p_src[col + 2]) < f[col])) {
+        unsigned char k1, k2, k3;
+        k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+        k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+        k3 = (k1 + k2 + 1) >> 1;
+        v = (k3 + v + 1) >> 1;
+      }
+
+      d[col & 3] = v;
+
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 3];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 3];
+    p_dst[col - 1] = d[(col - 1) & 3];
+
+    /* next row */
+    src_ptr += src_pixels_per_line;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
+                                 int cols, int flimit) {
+  int r, c, i;
+
+  unsigned char *s = src;
+  unsigned char d[16];
+
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum = 0;
+
+    for (i = -8; i < 0; i++)
+      s[i] = s[0];
+
+    /* 17 avoids valgrind warning - we buffer values in c in d
+     * and only write them when we've read 8 ahead...
+     */
+    for (i = 0; i < 17; i++)
+      s[i + cols] = s[cols - 1];
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum += s[i];
+      d[i + 8] = 0;
+    }
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum += x;
+      sumsq += x * y;
+
+      d[c & 15] = s[c];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+
+      s[c - 8] = d[(c - 8) & 15];
+    }
+
+    s += pitch;
+  }
+}
+
+void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
+                            int flimit) {
+  int r, c, i;
+  unsigned int seed;
+  const int16_t *rv3 = &vpx_rv[63 & rand_r(&seed)];
+
+  for (c = 0; c < cols; c++) {
+    unsigned char *s = &dst[c];
+    int sumsq = 0;
+    int sum = 0;
+    unsigned char d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i < 0; i++)
+      s[i * pitch] = s[0];
+
+    /* 17 avoids valgrind warning - we buffer values in c in d
+     * and only write them when we've read 8 ahead...
+     */
+    for (i = 0; i < 17; i++)
+      s[(i + rows) * pitch] = s[(rows - 1) * pitch];
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum += s[i * pitch];
+    }
+
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+      if (r >= 8)
+        s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+
+#if CONFIG_POSTPROC
+static void vpx_de_mblock(YV12_BUFFER_CONFIG *post,
+    int q) {
+  vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+      post->y_width, q2mbl(q));
+  vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+      post->y_width, q2mbl(q));
+}
+
+#endif
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
new file mode 100644
index 000000000..616721d8e
--- /dev/null
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -0,0 +1,683 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+extern int16_t vpx_rv[];
+
+#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                                out0, out1, out2, out3,                  \
+                                out4, out5, out6, out7,                  \
+                                out8, out9, out10, out11,                \
+                                out12, out13, out14, out15)              \
+{                                                                        \
+    v8i16 temp0, temp1, temp2, temp3, temp4;                             \
+    v8i16 temp5, temp6, temp7, temp8, temp9;                             \
+                                                                         \
+    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
+               temp0, temp1, temp2, temp3);                              \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                             \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                             \
+    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
+               temp0, temp1, temp2, temp3);                              \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_UB(temp5, temp4, out8, out10);                              \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_UB(temp5, temp4, out12, out14);                             \
+    out0 = (v16u8)temp6;                                                 \
+    out2 = (v16u8)temp7;                                                 \
+    out4 = (v16u8)temp8;                                                 \
+    out6 = (v16u8)temp9;                                                 \
+    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                \
+    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);             \
+    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);             \
+    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);             \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                \
+    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                \
+    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                \
+}
+
+#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in,    \
+                           below1_in, below2_in, ref, out)  \
+{                                                           \
+    v16u8 temp0, temp1;                                     \
+                                                            \
+    temp1 = __msa_aver_u_b(above2_in, above1_in);           \
+    temp0 = __msa_aver_u_b(below2_in, below1_in);           \
+    temp1 = __msa_aver_u_b(temp1, temp0);                   \
+    out = __msa_aver_u_b(src_in, temp1);                    \
+    temp0 = __msa_asub_u_b(src_in, above2_in);              \
+    temp1 = __msa_asub_u_b(src_in, above1_in);              \
+    temp0 = (temp0 < ref);                                  \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    temp1 = __msa_asub_u_b(src_in, below1_in);              \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    temp1 = __msa_asub_u_b(src_in, below2_in);              \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    out = __msa_bmz_v(out, src_in, temp0);                  \
+}
+
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                         in8, in9, in10, in11, in12, in13, in14, in15)  \
+{                                                                       \
+    v8i16 temp0, temp1, temp2, temp3, temp4;                            \
+    v8i16 temp5, temp6, temp7, temp8, temp9;                            \
+                                                                        \
+    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                       \
+    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                            \
+    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
+    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                            \
+    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                            \
+    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                            \
+    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                     \
+    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                     \
+    ILVRL_H2_SH(temp5, temp4, temp6, temp7);                            \
+    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);                   \
+    ILVRL_H2_SH(temp5, temp4, temp8, temp9);                            \
+    ILVRL_W2_SH(temp8, temp6, temp4, temp5);                            \
+    ILVRL_W2_SH(temp9, temp7, temp6, temp7);                            \
+    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);                       \
+    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);                   \
+    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);              \
+    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);              \
+    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
+    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                   \
+    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);              \
+    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);              \
+    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14,            \
+               temp2, temp3, temp4, temp5);                             \
+    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,  \
+               temp6, temp7, temp8, temp9);                             \
+    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);               \
+    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);              \
+    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);              \
+    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);               \
+    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);             \
+    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);             \
+}
+
+#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5,    \
+                                in6, in7, in8, in9, in10, in11)  \
+{                                                                \
+    v8i16 temp0, temp1, temp2, temp3;                            \
+    v8i16 temp4, temp5, temp6, temp7;                            \
+                                                                 \
+    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                \
+    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                     \
+    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                \
+    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                     \
+    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                     \
+    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                     \
+    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);                \
+    temp4 = __msa_ilvr_h(temp5, temp4);                          \
+    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);                \
+    temp5 = __msa_ilvr_h(temp7, temp6);                          \
+    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                     \
+    in0 = (v16u8)temp0;                                          \
+    in2 = (v16u8)temp1;                                          \
+    in4 = (v16u8)temp2;                                          \
+    in6 = (v16u8)temp3;                                          \
+    in8 = (v16u8)temp6;                                          \
+    in10 = (v16u8)temp7;                                         \
+    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);       \
+    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);       \
+    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);       \
+    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);       \
+    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);       \
+    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);      \
+}
+
+static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+                                            int32_t src_stride,
+                                            int32_t dst_stride, int32_t cols,
+                                            uint8_t *f) {
+  uint8_t *p_src = src_ptr;
+  uint8_t *p_dst = dst_ptr;
+  uint8_t *f_orig = f;
+  uint8_t *p_dst_st = dst_ptr;
+  uint16_t col;
+  uint64_t out0, out1, out2, out3;
+  v16u8 above2, above1, below2, below1, src, ref, ref_temp;
+  v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
+  v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
+
+  for (col = (cols / 16); col--;) {
+    ref = LD_UB(f);
+    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+    src = LD_UB(p_src);
+    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+    above2 = LD_UB(p_src + 3 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+    above1 = LD_UB(p_src + 4 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+    src = LD_UB(p_src + 5 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+    below1 = LD_UB(p_src + 6 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+    below2 = LD_UB(p_src + 7 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+    above2 = LD_UB(p_src + 8 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+    above1 = LD_UB(p_src + 9 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+           p_dst, dst_stride);
+
+    p_dst += 16;
+    p_src += 16;
+    f += 16;
+  }
+
+  if (0 != (cols / 16)) {
+    ref = LD_UB(f);
+    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+    src = LD_UB(p_src);
+    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+    above2 = LD_UB(p_src + 3 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+    above1 = LD_UB(p_src + 4 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+    src = LD_UB(p_src + 5 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+    below1 = LD_UB(p_src + 6 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+    below2 = LD_UB(p_src + 7 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+    above2 = LD_UB(p_src + 8 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+    above1 = LD_UB(p_src + 9 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+    out0 = __msa_copy_u_d((v2i64) inter0, 0);
+    out1 = __msa_copy_u_d((v2i64) inter1, 0);
+    out2 = __msa_copy_u_d((v2i64) inter2, 0);
+    out3 = __msa_copy_u_d((v2i64) inter3, 0);
+    SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+    out0 = __msa_copy_u_d((v2i64) inter4, 0);
+    out1 = __msa_copy_u_d((v2i64) inter5, 0);
+    out2 = __msa_copy_u_d((v2i64) inter6, 0);
+    out3 = __msa_copy_u_d((v2i64) inter7, 0);
+    SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+  }
+
+  f = f_orig;
+  p_dst = dst_ptr - 2;
+  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+         inter6, inter7);
+
+  for (col = 0; col < (cols / 8); ++col) {
+    ref = LD_UB(f);
+    f += 8;
+    VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
+                            inter6, inter7, inter8, inter9, inter10, inter11);
+    if (0 == col) {
+      above2 = inter2;
+      above1 = inter2;
+    } else {
+      above2 = inter0;
+      above1 = inter1;
+    }
+    src = inter2;
+    below1 = inter3;
+    below2 = inter4;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+    above2 = inter5;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+    above1 = inter6;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+    src = inter7;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+    below1 = inter8;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+    below2 = inter9;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+    if (col == (cols / 8 - 1)) {
+      above2 = inter9;
+    } else {
+      above2 = inter10;
+    }
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+    if (col == (cols / 8 - 1)) {
+      above1 = inter9;
+    } else {
+      above1 = inter11;
+    }
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+    TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
+                       inter9, inter2, inter3, inter4, inter5, inter6, inter7,
+                       inter8, inter9);
+    p_dst += 8;
+    LD_UB2(p_dst, dst_stride, inter0, inter1);
+    ST8x1_UB(inter2, p_dst_st);
+    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+    p_dst_st += 8;
+  }
+}
+
+static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+                                          int32_t src_stride,
+                                          int32_t dst_stride, int32_t cols,
+                                          uint8_t *f) {
+  uint8_t *p_src = src_ptr;
+  uint8_t *p_dst = dst_ptr;
+  uint8_t *p_dst_st = dst_ptr;
+  uint8_t *f_orig = f;
+  uint16_t col;
+  v16u8 above2, above1, below2, below1;
+  v16u8 src, ref, ref_temp;
+  v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
+  v16u8 inter7, inter8, inter9, inter10, inter11;
+  v16u8 inter12, inter13, inter14, inter15;
+
+  for (col = (cols / 16); col--;) {
+    ref = LD_UB(f);
+    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+    src = LD_UB(p_src);
+    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+    above2 = LD_UB(p_src + 3 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+    above1 = LD_UB(p_src + 4 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+    src = LD_UB(p_src + 5 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+    below1 = LD_UB(p_src + 6 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+    below2 = LD_UB(p_src + 7 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+    above2 = LD_UB(p_src + 8 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+    above1 = LD_UB(p_src + 9 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+    src = LD_UB(p_src + 10 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+    below1 = LD_UB(p_src + 11 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+    below2 = LD_UB(p_src + 12 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+    above2 = LD_UB(p_src + 13 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+    above1 = LD_UB(p_src + 14 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+    src = LD_UB(p_src + 15 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+    below1 = LD_UB(p_src + 16 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+    below2 = LD_UB(p_src + 17 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+           p_dst, dst_stride);
+    ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
+           p_dst + 8 * dst_stride, dst_stride);
+    p_src += 16;
+    p_dst += 16;
+    f += 16;
+  }
+
+  f = f_orig;
+  p_dst = dst_ptr - 2;
+  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+         inter6, inter7);
+  LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
+         inter12, inter13, inter14, inter15);
+
+  for (col = 0; col < cols / 8; ++col) {
+    ref = LD_UB(f);
+    f += 8;
+    TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
+                     inter7, inter8, inter9, inter10, inter11, inter12, inter13,
+                     inter14, inter15);
+    if (0 == col) {
+      above2 = inter2;
+      above1 = inter2;
+    } else {
+      above2 = inter0;
+      above1 = inter1;
+    }
+
+    src = inter2;
+    below1 = inter3;
+    below2 = inter4;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+    above2 = inter5;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+    above1 = inter6;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+    src = inter7;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+    below1 = inter8;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+    below2 = inter9;
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+    if (col == (cols / 8 - 1)) {
+      above2 = inter9;
+    } else {
+      above2 = inter10;
+    }
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+    if (col == (cols / 8 - 1)) {
+      above1 = inter9;
+    } else {
+      above1 = inter11;
+    }
+    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+    VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
+                            inter8, inter9, inter2, inter3, inter4, inter5,
+                            inter6, inter7, inter8, inter9, inter10, inter11,
+                            inter12, inter13, inter14, inter15, above2, above1);
+
+    p_dst += 8;
+    LD_UB2(p_dst, dst_stride, inter0, inter1);
+    ST8x1_UB(inter2, p_dst_st);
+    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+    LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
+    ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
+    ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
+    LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
+    ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
+    ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
+    LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
+    ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
+    ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
+    LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
+    ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
+    ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
+    p_dst_st += 8;
+  }
+}
+
+void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t src_stride,
+                                              int32_t dst_stride, int32_t cols,
+                                              uint8_t *f, int32_t size) {
+  if (8 == size) {
+    postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
+  } else if (16 == size) {
+    postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
+  }
+}
+
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
+                                   int32_t rows, int32_t cols, int32_t flimit) {
+  int32_t row, col, cnt;
+  uint8_t *src_dup = src_ptr;
+  v16u8 src0, src, tmp_orig;
+  v16u8 tmp = {0};
+  v16i8 zero = {0};
+  v8u16 sum_h, src_r_h, src_l_h;
+  v4u32 src_r_w, src_l_w;
+  v4i32 flimit_vec;
+
+  flimit_vec = __msa_fill_w(flimit);
+  for (row = rows; row--;) {
+    int32_t sum_sq = 0;
+    int32_t sum = 0;
+    src0 = (v16u8) __msa_fill_b(src_dup[0]);
+    ST8x1_UB(src0, (src_dup - 8));
+
+    src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]);
+    ST_UB(src0, src_dup + cols);
+    src_dup[cols + 16] = src_dup[cols - 1];
+    tmp_orig = (v16u8) __msa_ldi_b(0);
+    tmp_orig[15] = tmp[15];
+    src = LD_UB(src_dup - 8);
+    src[15] = 0;
+    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+    src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
+    src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
+    sum_sq = HADD_SW_S32(src_r_w);
+    sum_sq += HADD_SW_S32(src_l_w);
+    sum_h = __msa_hadd_u_h(src, src);
+    sum = HADD_UH_U32(sum_h);
+    {
+      v16u8 src7, src8, src_r, src_l;
+      v16i8 mask;
+      v8u16 add_r, add_l;
+      v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
+      v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
+      v4i32 sub0, sub1, sub2, sub3;
+      v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+      v4i32 mul0, mul1, mul2, mul3;
+      v4i32 total0, total1, total2, total3;
+      v8i16 const8 = __msa_fill_h(8);
+
+      src7 = LD_UB(src_dup + 7);
+      src8 = LD_UB(src_dup - 8);
+      for (col = 0; col < (cols >> 4); ++col) {
+        ILVRL_B2_UB(src7, src8, src_r, src_l);
+        HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
+
+        sum_r[0] = sum + sub_r[0];
+        for (cnt = 0; cnt < 7; ++cnt) {
+          sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
+        }
+        sum_l[0] = sum_r[7] + sub_l[0];
+        for (cnt = 0; cnt < 7; ++cnt) {
+          sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
+        }
+        sum = sum_l[7];
+        src = LD_UB(src_dup + 16 * col);
+        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+        src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4);
+        src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4);
+        tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7);
+
+        HADD_UB2_UH(src_r, src_l, add_r, add_l);
+        UNPCK_SH_SW(sub_r, sub0, sub1);
+        UNPCK_SH_SW(sub_l, sub2, sub3);
+        ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
+        ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
+        MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
+             mul2, mul3);
+        sum_sq0[0] = sum_sq + mul0[0];
+        for (cnt = 0; cnt < 3; ++cnt) {
+          sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
+        }
+        sum_sq1[0] = sum_sq0[3] + mul1[0];
+        for (cnt = 0; cnt < 3; ++cnt) {
+          sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
+        }
+        sum_sq2[0] = sum_sq1[3] + mul2[0];
+        for (cnt = 0; cnt < 3; ++cnt) {
+          sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
+        }
+        sum_sq3[0] = sum_sq2[3] + mul3[0];
+        for (cnt = 0; cnt < 3; ++cnt) {
+          sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
+        }
+        sum_sq = sum_sq3[3];
+
+        UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
+        UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
+        total0 = sum_sq0 * __msa_ldi_w(15);
+        total0 -= sum0_w * sum0_w;
+        total1 = sum_sq1 * __msa_ldi_w(15);
+        total1 -= sum1_w * sum1_w;
+        total2 = sum_sq2 * __msa_ldi_w(15);
+        total2 -= sum2_w * sum2_w;
+        total3 = sum_sq3 * __msa_ldi_w(15);
+        total3 -= sum3_w * sum3_w;
+        total0 = (total0 < flimit_vec);
+        total1 = (total1 < flimit_vec);
+        total2 = (total2 < flimit_vec);
+        total3 = (total3 < flimit_vec);
+        PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+        mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
+        tmp = __msa_bmz_v(tmp, src, (v16u8) mask);
+
+        if (col == 0) {
+          uint64_t src_d;
+
+          src_d = __msa_copy_u_d((v2i64) tmp_orig, 1);
+          SD(src_d, (src_dup - 8));
+        }
+
+        src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
+        src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
+        ST_UB(tmp, (src_dup + (16 * col)));
+      }
+
+      src_dup += pitch;
+    }
+  }
+}
+
+void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
+                              int32_t cols, int32_t flimit) {
+  int32_t row, col, cnt, i;
+  unsigned int seed;
+  const int16_t *rv3 = &vpx_rv[63 & rand_r(&seed)];
+  v4i32 flimit_vec;
+  v16u8 dst7, dst8, dst_r_b, dst_l_b;
+  v16i8 mask;
+  v8u16 add_r, add_l;
+  v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
+  v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
+
+  flimit_vec = __msa_fill_w(flimit);
+
+  for (col = 0; col < (cols >> 4); ++col) {
+    uint8_t *dst_tmp = &dst_ptr[col << 4];
+    v16u8 dst;
+    v16i8 zero = {0};
+    v16u8 tmp[16];
+    v8i16 mult0, mult1, rv2_0, rv2_1;
+    v8i16 sum0_h = {0};
+    v8i16 sum1_h = {0};
+    v4i32 mul0 = {0};
+    v4i32 mul1 = {0};
+    v4i32 mul2 = {0};
+    v4i32 mul3 = {0};
+    v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+    v4i32 add0, add1, add2, add3;
+    const int16_t *rv2[16];
+
+    dst = LD_UB(dst_tmp);
+    for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
+      rv2[i] = rv3 + ((cnt * 17) & 127);
+      ++i;
+    }
+    for (cnt = -8; cnt < 0; ++cnt) {
+      ST_UB(dst, dst_tmp + cnt * pitch);
+    }
+
+    dst = LD_UB((dst_tmp + (rows - 1) * pitch));
+    for (cnt = rows; cnt < rows + 17; ++cnt) {
+      ST_UB(dst, dst_tmp + cnt * pitch);
+    }
+    for (cnt = -8; cnt <= 6; ++cnt) {
+      dst = LD_UB(dst_tmp + (cnt * pitch));
+      UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
+      MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
+      mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0);
+      mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0);
+      mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1);
+      mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1);
+      ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
+    }
+
+    for (row = 0; row < (rows + 8); ++row) {
+      for (i = 0; i < 8; ++i) {
+        rv2_0[i] = *(rv2[i] + (row & 127));
+        rv2_1[i] = *(rv2[i + 8] + (row & 127));
+      }
+      dst7 = LD_UB(dst_tmp + (7 * pitch));
+      dst8 = LD_UB(dst_tmp - (8 * pitch));
+      ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
+
+      HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
+      UNPCK_SH_SW(sub_r, sub0, sub1);
+      UNPCK_SH_SW(sub_l, sub2, sub3);
+      sum0_h += sub_r;
+      sum1_h += sub_l;
+
+      HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
+
+      ILVRL_H2_SW(zero, add_r, add0, add1);
+      ILVRL_H2_SW(zero, add_l, add2, add3);
+      mul0 += add0 * sub0;
+      mul1 += add1 * sub1;
+      mul2 += add2 * sub2;
+      mul3 += add3 * sub3;
+      dst = LD_UB(dst_tmp);
+      ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
+      dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
+      dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
+      tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7);
+
+      UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
+      UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
+      total0 = mul0 * __msa_ldi_w(15);
+      total0 -= sum0_w * sum0_w;
+      total1 = mul1 * __msa_ldi_w(15);
+      total1 -= sum1_w * sum1_w;
+      total2 = mul2 * __msa_ldi_w(15);
+      total2 -= sum2_w * sum2_w;
+      total3 = mul3 * __msa_ldi_w(15);
+      total3 -= sum3_w * sum3_w;
+      total0 = (total0 < flimit_vec);
+      total1 = (total1 < flimit_vec);
+      total2 = (total2 < flimit_vec);
+      total3 = (total3 < flimit_vec);
+      PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+      mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
+      tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask);
+
+      if (row >= 8) {
+        ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
+      }
+
+      dst_tmp += pitch;
+    }
+  }
+}
diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index 91e3615cf..ea59eafe9 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -1060,6 +1060,7 @@
   ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
 }
 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
 
 /* Description : Interleave left half of halfword elements from vectors
@@ -1074,6 +1075,7 @@
   out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);     \
 }
 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
 
 /* Description : Interleave left half of word elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3
@@ -1137,6 +1139,7 @@
   out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);     \
 }
 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
 
 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
@@ -1215,6 +1218,7 @@
   out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
   out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
 }
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
 
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index c73692a37..bd6d9382e 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -52,8 +52,11 @@ endif  # CONFIG_VP9_HIGHBITDEPTH
 
 ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
 DSP_SRCS-yes += add_noise.c
+DSP_SRCS-yes += deblock.c
 DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
 DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
 endif # CONFIG_POSTPROC
 
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 66d466a7f..8736e4698 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1894,6 +1894,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
     add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
     specialize qw/vpx_plane_add_noise sse2 msa/;
+
+    add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    specialize qw/vpx_mbpost_proc_down sse2 msa/;
+    $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm;
+
+    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
+    $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm;
+
+    add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
+    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
+
 }
 
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
new file mode 100644
index 000000000..6df360df4
--- /dev/null
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -0,0 +1,661 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;macro in deblock functions
+%macro FIRST_2_ROWS 0
+        movdqa      xmm4,       xmm0
+        movdqa      xmm6,       xmm0
+        movdqa      xmm5,       xmm1
+        pavgb       xmm5,       xmm3
+
+        ;calculate absolute value
+        psubusb     xmm4,       xmm1
+        psubusb     xmm1,       xmm0
+        psubusb     xmm6,       xmm3
+        psubusb     xmm3,       xmm0
+        paddusb     xmm4,       xmm1
+        paddusb     xmm6,       xmm3
+
+        ;get threshold
+        movdqa      xmm2,       flimit
+        pxor        xmm1,       xmm1
+        movdqa      xmm7,       xmm2
+
+        ;get mask
+        psubusb     xmm2,       xmm4
+        psubusb     xmm7,       xmm6
+        pcmpeqb     xmm2,       xmm1
+        pcmpeqb     xmm7,       xmm1
+        por         xmm7,       xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+        movdqa      xmm6,       xmm0
+        movdqa      xmm4,       xmm0
+        movdqa      xmm2,       xmm1
+        pavgb       xmm1,       xmm3
+
+        ;calculate absolute value
+        psubusb     xmm6,       xmm2
+        psubusb     xmm2,       xmm0
+        psubusb     xmm4,       xmm3
+        psubusb     xmm3,       xmm0
+        paddusb     xmm6,       xmm2
+        paddusb     xmm4,       xmm3
+
+        pavgb       xmm5,       xmm1
+
+        ;get threshold
+        movdqa      xmm2,       flimit
+        pxor        xmm1,       xmm1
+        movdqa      xmm3,       xmm2
+
+        ;get mask
+        psubusb     xmm2,       xmm6
+        psubusb     xmm3,       xmm4
+        pcmpeqb     xmm2,       xmm1
+        pcmpeqb     xmm3,       xmm1
+
+        por         xmm7,       xmm2
+        por         xmm7,       xmm3
+
+        pavgb       xmm5,       xmm0
+
+        ;decide if or not to use filtered value
+        pand        xmm0,       xmm7
+        pandn       xmm7,       xmm5
+        paddusb     xmm0,       xmm7
+%endmacro
+
+%macro UPDATE_FLIMIT 0
+        movdqa      xmm2,       XMMWORD PTR [rbx]
+        movdqa      [rsp],      xmm2
+        add         rbx,        16
+%endmacro
+
+;void vpx_post_proc_down_and_across_mb_row_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned char *dst_ptr,
+;    int src_pixels_per_line,
+;    int dst_pixels_per_line,
+;    int cols,
+;    int *flimits,
+;    int size
+;)
+global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vpx_post_proc_down_and_across_mb_row_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+        ; put flimit on stack
+        mov         rbx,        arg(5)           ;flimits ptr
+        UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+        mov         rsi,        arg(0)           ;src_ptr
+        mov         rdi,        arg(1)           ;dst_ptr
+
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
+        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+        xor         rdx,        rdx              ;col
+.nextcol:
+        ;load current and next 2 rows
+        movdqu      xmm0,       XMMWORD PTR [rsi]
+        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
+        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
+
+        FIRST_2_ROWS
+
+        ;load above 2 rows
+        neg         rax
+        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
+        movdqu      xmm3,       XMMWORD PTR [rsi + rax]
+
+        SECOND_2_ROWS
+
+        movdqu      XMMWORD PTR [rdi], xmm0
+
+        neg         rax                          ; positive stride
+        add         rsi,        16
+        add         rdi,        16
+
+        add         rdx,        16
+        cmp         edx,        dword arg(4)     ;cols
+        jge         .downdone
+        UPDATE_FLIMIT
+        jmp         .nextcol
+
+.downdone:
+        ; done with the all cols, start the across filtering in place
+        sub         rsi,        rdx
+        sub         rdi,        rdx
+
+        mov         rbx,        arg(5) ; flimits
+        UPDATE_FLIMIT
+
+        ; dup the first byte into the left border 8 times
+        movq        mm1,   [rdi]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        mov         rdx,    -8
+        movq        [rdi+rdx], mm1
+
+        ; dup the last byte into the right border
+        movsxd      rdx,    dword arg(4)
+        movq        mm1,   [rdi + rdx + -1]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        movq        [rdi+rdx], mm1
+
+        xor         rdx,        rdx
+        movq        mm0,        QWORD PTR [rdi-16];
+        movq        mm1,        QWORD PTR [rdi-8];
+
+.acrossnextcol:
+        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
+        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
+        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
+
+        FIRST_2_ROWS
+
+        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
+        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
+
+        SECOND_2_ROWS
+
+        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
+        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
+        movdq2q     mm0,        xmm0
+        psrldq      xmm0,       8
+        movdq2q     mm1,        xmm0
+
+        add         rdx,        16
+        cmp         edx,        dword arg(4)     ;cols
+        jge         .acrossdone
+        UPDATE_FLIMIT
+        jmp         .acrossnextcol
+
+.acrossdone:
+        ; last 16 pixels
+        movq        QWORD PTR [rdi+rdx-16], mm0
+
+        cmp         edx,        dword arg(4)
+        jne         .throw_last_8
+        movq        QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+        ; done with this rwo
+        add         rsi,rax                      ;next src line
+        mov         eax, dword arg(3)            ;dst_pixels_per_line
+        add         rdi,rax                      ;next destination
+        mov         eax, dword arg(2)            ;src_pixels_per_line
+
+        mov         rbx,        arg(5)           ;flimits
+        UPDATE_FLIMIT
+
+        dec         rcx                          ;decrement count
+        jnz         .nextrow                     ;next row
+
+    add rsp, 16
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit
+
+;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
+;                            int pitch, int rows, int cols,int flimit)
+extern sym(vpx_rv)
+global sym(vpx_mbpost_proc_down_xmm) PRIVATE
+sym(vpx_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vpx_rv))]
+%endif
+
+    ;rows +=8;
+    add         dword arg(2), 8
+
+    ;for(c=0; c<cols; c+=8)
+.loop_col:
+            mov         rsi,        arg(0) ; s
+            pxor        xmm0,       xmm0        ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+
+            ; this copies the last row down into the border 8 rows
+            mov         rdi,        rsi
+            mov         rdx,        arg(2)
+            sub         rdx,        9
+            imul        rdx,        rax
+            lea         rdi,        [rdi+rdx]
+            movq        xmm1,       QWORD ptr[rdi]              ; first row
+            mov         rcx,        8
+.init_borderd:                                                  ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      xmm1
+
+            dec         rcx
+            jne         .init_borderd
+
+            neg         rax                                     ; rax = -pitch
+
+            ; this copies the first row up into the border 8 rows
+            mov         rdi,        rsi
+            movq        xmm1,       QWORD ptr[rdi]              ; first row
+            mov         rcx,        8
+.init_border:                                                   ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      xmm1
+
+            dec         rcx
+            jne         .init_border
+
+
+
+            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
+            neg         rax
+
+            pxor        xmm5,       xmm5
+            pxor        xmm6,       xmm6        ;
+
+            pxor        xmm7,       xmm7        ;
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movq        xmm1,       QWORD PTR [rdi];
+            punpcklbw   xmm1,       xmm0        ;
+
+            paddw       xmm5,       xmm1        ;
+            pmullw      xmm1,       xmm1        ;
+
+            movdqa      xmm2,       xmm1        ;
+            punpcklwd   xmm1,       xmm0        ;
+
+            punpckhwd   xmm2,       xmm0        ;
+            paddd       xmm6,       xmm1        ;
+
+            paddd       xmm7,       xmm2        ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
+            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   xmm1,       xmm0
+            punpcklbw   xmm2,       xmm0
+
+            paddw       xmm5,       xmm2
+            psubw       xmm5,       xmm1
+
+            pmullw      xmm2,       xmm2
+            movdqa      xmm4,       xmm2
+
+            punpcklwd   xmm2,       xmm0
+            punpckhwd   xmm4,       xmm0
+
+            paddd       xmm6,       xmm2
+            paddd       xmm7,       xmm4
+
+            pmullw      xmm1,       xmm1
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm0
+            psubd       xmm6,       xmm1
+
+            punpckhwd   xmm2,       xmm0
+            psubd       xmm7,       xmm2
+
+
+            movdqa      xmm3,       xmm6
+            pslld       xmm3,       4
+
+            psubd       xmm3,       xmm6
+            movdqa      xmm1,       xmm5
+
+            movdqa      xmm4,       xmm5
+            pmullw      xmm1,       xmm1
+
+            pmulhw      xmm4,       xmm4
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm4
+            punpckhwd   xmm2,       xmm4
+
+            movdqa      xmm4,       xmm7
+            pslld       xmm4,       4
+
+            psubd       xmm4,       xmm7
+
+            psubd       xmm3,       xmm1
+            psubd       xmm4,       xmm2
+
+            psubd       xmm3,       flimit4
+            psubd       xmm4,       flimit4
+
+            psrad       xmm3,       31
+            psrad       xmm4,       31
+
+            packssdw    xmm3,       xmm4
+            packsswb    xmm3,       xmm0
+
+            movq        xmm1,       QWORD PTR [rsi+rax*8]
+
+            movq        xmm2,       xmm1
+            punpcklbw   xmm1,       xmm0
+
+            paddw       xmm1,       xmm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vpx_rv))]
+            movdqu      xmm4,       [rax + rcx*2] ;vpx_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movdqu      xmm4,       [r8 + rcx*2] ;vpx_rv[rcx*2]
+%else
+            movdqu      xmm4,       [sym(vpx_rv) + rcx*2]
+%endif
+
+            paddw       xmm1,       xmm4
+            ;paddw     xmm1,       eight8s
+            psraw       xmm1,       4
+
+            packuswb    xmm1,       xmm0
+            pand        xmm1,       xmm3
+
+            pandn       xmm3,       xmm2
+            por         xmm1,       xmm3
+
+            and         rcx,        15
+            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+            cmp         edx,        8
+            jl          .skip_assignment
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+            and         rcx,        15
+            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
+            movq        [rsi],      mm0
+
+.skip_assignment:
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+        add         dword arg(0), 8 ; s += 8
+        sub         dword arg(3), 8 ; cols -= 8
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 128+16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
+;                                int pitch, int rows, int cols,int flimit)
+global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vpx_mbpost_proc_across_ip_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+    ; create flimit4 at [rsp]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp], eax
+    mov         [rsp+4], eax
+    mov         [rsp+8], eax
+    mov         [rsp+12], eax
+%define flimit4 [rsp]
+
+
+    ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+        xor         rdx,    rdx ;sumsq=0;
+        xor         rcx,    rcx ;sum=0;
+        mov         rsi,    arg(0); s
+
+
+        ; dup the first byte into the left border 8 times
+        movq        mm1,   [rsi]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+
+        mov         rdi,    -8
+        movq        [rsi+rdi], mm1
+
+        ; dup the last byte into the right border
+        movsxd      rdx,    dword arg(3)
+        movq        mm1,   [rsi + rdx + -1]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        movq        [rsi+rdx], mm1
+
+.ip_var_loop:
+        ;for(i=-8;i<=6;i++)
+        ;{
+        ;    sumsq += s[i]*s[i];
+        ;    sum   += s[i];
+        ;}
+        movzx       eax, byte [rsi+rdi]
+        add         ecx, eax
+        mul         al
+        add         edx, eax
+        add         rdi, 1
+        cmp         rdi, 6
+        jle         .ip_var_loop
+
+
+            ;mov         rax,    sumsq
+            ;movd        xmm7,   rax
+            movd        xmm7,   edx
+
+            ;mov         rax,    sum
+            ;movd        xmm6,   rax
+            movd        xmm6,   ecx
+
+            mov         rsi,    arg(0) ;s
+            xor         rcx,    rcx
+
+            movsxd      rdx,    dword arg(3) ;cols
+            add         rdx,    8
+            pxor        mm0,    mm0
+            pxor        mm1,    mm1
+
+            pxor        xmm0,   xmm0
+.nextcol4:
+
+            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
+            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
+
+            punpcklbw   xmm1,   xmm0                    ; expanding
+            punpcklbw   xmm2,   xmm0                    ; expanding
+
+            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
+            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
+
+            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
+            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
+
+            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
+            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
+
+            paddd       xmm6,   xmm2
+            paddd       xmm7,   xmm1
+
+            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
+            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
+
+            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
+            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
+
+            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
+
+            paddd       xmm6,   xmm4
+            paddd       xmm7,   xmm3
+
+            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
+            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            movdqa      xmm3,   xmm6
+            pmaddwd     xmm3,   xmm3
+
+            movdqa      xmm5,   xmm7
+            pslld       xmm5,   4
+
+            psubd       xmm5,   xmm7
+            psubd       xmm5,   xmm3
+
+            psubd       xmm5,   flimit4
+            psrad       xmm5,   31
+
+            packssdw    xmm5,   xmm0
+            packsswb    xmm5,   xmm0
+
+            movd        xmm1,   DWORD PTR [rsi+rcx]
+            movq        xmm2,   xmm1
+
+            punpcklbw   xmm1,   xmm0
+            punpcklwd   xmm1,   xmm0
+
+            paddd       xmm1,   xmm6
+            paddd       xmm1,   [GLOBAL(four8s)]
+
+            psrad       xmm1,   4
+            packssdw    xmm1,   xmm0
+
+            packuswb    xmm1,   xmm0
+            pand        xmm1,   xmm5
+
+            pandn       xmm5,   xmm2
+            por         xmm5,   xmm1
+
+            movd        [rsi+rcx-8],  mm0
+            movq        mm0,    mm1
+
+            movdq2q     mm1,    xmm5
+            psrldq      xmm7,   12
+
+            psrldq      xmm6,   12
+            add         rcx,    4
+
+            cmp         rcx,    rdx
+            jl          .nextcol4
+
+        ;s+=pitch;
+        movsxd rax, dword arg(1)
+        add    arg(0), rax
+
+        sub dword arg(2), 1 ;rows-=1
+        cmp dword arg(2), 0
+        jg .ip_row_loop
+
+    add         rsp, 16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+SECTION_RODATA
+align 16
+four8s:
+    times 4 dd 8
author	Jim Bankoski <jimbankoski@google.com>	2016-07-08 10:06:54 -0700
committer	Jim Bankoski <jimbankoski@google.com>	2016-07-12 05:53:00 -0700
commit	88e695146500982ac0a997c9da2a486992f28400 (patch)
tree	42a0597a4bf48e0ea3bb75fd041010001ad29200 /vpx_dsp
parent	45ed7effed521aef24b9f4ba8d395e824fe6d649 (diff)
download	libvpx-88e695146500982ac0a997c9da2a486992f28400.tar.gz libvpx-88e695146500982ac0a997c9da2a486992f28400.tar.bz2 libvpx-88e695146500982ac0a997c9da2a486992f28400.zip