From 6854dd70391fb9d4f3c91fdd12ac07aface61ad9 Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Fri, 27 Oct 2017 17:21:17 +0530 Subject: avcodec/mips: Improve hevc bi weighted copy, hz and vt mc msa functions Pack the data to halfwords before clipping. Use immediate unsigned saturation for the clip-to-max, saving one vector register. Signed-off-by: Kaustubh Raste Reviewed-by: Manojkumar Bhosale Signed-off-by: Michael Niedermayer --- libavcodec/mips/hevc_mc_biw_msa.c | 1570 +++++++++++++++++++------------------ 1 file changed, 793 insertions(+), 777 deletions(-) diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c index 458e73d..75c1c7a 100644 --- a/libavcodec/mips/hevc_mc_biw_msa.c +++ b/libavcodec/mips/hevc_mc_biw_msa.c @@ -29,8 +29,10 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { }; #define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \ - out0_r, out1_r, out0_l, out1_l) \ + out0, out1) \ { \ + v4i32 out0_r, out1_r, out0_l, out1_l; \ + \ ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \ ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \ \ @@ -40,37 +42,41 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \ \ SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ - \ - out0_r = CLIP_SW_0_255(out0_r); \ - out1_r = CLIP_SW_0_255(out1_r); \ - out0_l = CLIP_SW_0_255(out0_l); \ - out1_l = CLIP_SW_0_255(out1_l); \ + PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \ + CLIP_SH2_0_255(out0, out1); \ } -#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \ - wgt, rnd, offset, \ - out0_r, out1_r, out2_r, out3_r, \ - out0_l, out1_l, out2_l, out3_l) \ -{ \ - HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \ - out0_r, out1_r, out0_l, out1_l) \ - HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, \ - out2_r, out3_r, out2_l, out3_l) \ +#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \ + wgt, rnd, offset, out0, out1, out2, out3) \ +{ \ + HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \ + HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \ } -#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \ -{ \ - ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ - SRARI_H2_SH(out0, out1, rnd_val); \ - CLIP_SH2_0_255(out0, out1); \ +#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, \ + offset, out0, out1) \ +{ \ + v4i32 out0_r, out1_r, out0_l, out1_l; \ + \ + ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \ + ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \ + out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \ + out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \ + out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \ + out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \ + SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ + PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \ + CLIP_SH2_0_255_MAX_SATU(out0, out1); \ } -#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \ - vec0, vec1, vec2, vec3, rnd_val, \ - out0, out1, out2, out3) \ -{ \ - HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \ - HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \ +#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ + vec3, wgt, rnd, offset, out0, out1, \ + out2, out3) \ +{ \ + HEVC_BIW_RND_CLIP2_MAX_SATU(in0,
in1, vec0, vec1, wgt, rnd, offset, \ + out0, out1); \ + HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset, \ + out2, out3); \ } static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, @@ -86,93 +92,77 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, int32_t offset1, int32_t rnd_val) { + uint32_t loop_cnt, tp0, tp1, tp2, tp3; + uint64_t tpd0, tpd1, tpd2, tpd3; int32_t offset, weight; + v16u8 out0, out1; v16i8 zero = { 0 }; - v4i32 weight_vec, offset_vec, rnd_vec; + v16i8 src0 = { 0 }, src1 = { 0 }; + v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; + v8i16 dst0, dst1, dst2, dst3, weight_vec; + v4i32 dst0_r, dst0_l, offset_vec, rnd_vec; offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); offset_vec = __msa_fill_w(offset); - weight_vec = __msa_fill_w(weight); + weight_vec = (v8i16) __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); if (2 == height) { - v16i8 src0, src1; - v8i16 in0, in1, dst0; - v4i32 dst0_r, dst0_l; - - LD_SB2(src0_ptr, src_stride, src0, src1); - LD_SH2(src1_ptr, src2_stride, in0, in1); - in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); - src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0); + LW2(src0_ptr, src_stride, tp0, tp1); + INSERT_W2_SB(tp0, tp1, src0); + LD2(src1_ptr, src2_stride, tpd0, tpd1); + INSERT_D2_SH(tpd0, tpd1, in0); dst0 = (v8i16) __msa_ilvr_b(zero, src0); dst0 <<= 6; ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); - dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, - (v8i16) weight_vec); - dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, - (v8i16) weight_vec); + dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec); + dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec); SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); - dst0_r = CLIP_SW_0_255(dst0_r); - dst0_l = CLIP_SW_0_255(dst0_l); - - HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r); - ST4x2_UB(dst0_r, dst, dst_stride); + dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); + dst0 = CLIP_SH_0_255_MAX_SATU(dst0); + out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); + ST4x2_UB(out0, dst, dst_stride); } else if (4 == height) { - v16i8 src0, src1, src2, src3; - v8i16 in0, in1, in2, in3; - v8i16 dst0, dst1; - v4i32 dst0_r, dst1_r, dst0_l, dst1_l; - - LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); - LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); - ILVR_D2_SH(in1, in0, in3, in2, in0, in1); - ILVR_W2_SB(src1, src0, src3, src2, src0, src1); - ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); - dst0 <<= 6; - dst1 <<= 6; - HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst0_l, dst1_l); - - HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); - ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); + INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); + LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); + INSERT_D2_SH(tpd0, tpd1, in0); + INSERT_D2_SH(tpd2, tpd3, in1); + ILVRL_B2_SH(zero, src0, dst0, dst1); + SLLI_2V(dst0, dst1, 6); + HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec, + offset_vec, dst0, dst1); + out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); } else if (0 == height % 8) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 dst0, dst1, dst2, dst3; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; - 
for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_SB8(src0_ptr, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); - src0_ptr += (8 * src_stride); - LD_SH8(src1_ptr, src2_stride, - in0, in1, in2, in3, in4, in5, in6, in7); - src1_ptr += (8 * src2_stride); - - ILVR_D2_SH(in1, in0, in3, in2, in0, in1); - ILVR_D2_SH(in5, in4, in7, in6, in2, in3); - ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6, - src0, src1, src2, src3); - ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, - dst0, dst1, dst2, dst3); - + LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); + src0_ptr += 4 * src_stride; + INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); + LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); + src0_ptr += 4 * src_stride; + INSERT_W4_SB(tp0, tp1, tp2, tp3, src1); + LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); + src1_ptr += (4 * src2_stride); + INSERT_D2_SH(tpd0, tpd1, in0); + INSERT_D2_SH(tpd2, tpd3, in1); + LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); + src1_ptr += (4 * src2_stride); + INSERT_D2_SH(tpd0, tpd1, in2); + INSERT_D2_SH(tpd2, tpd3, in3); + ILVRL_B2_SH(zero, src0, dst0, dst1); + ILVRL_B2_SH(zero, src1, dst2, dst3); SLLI_4V(dst0, dst1, dst2, dst3, 6); - HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, - in0, in1, in2, in3, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); - - HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST4x8_UB(dst0_r, dst1_r, dst, dst_stride); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, + in3, weight_vec, rnd_vec, offset_vec, + dst0, dst1, dst2, dst3); + PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); + ST4x8_UB(out0, out1, dst, dst_stride); dst += (8 * dst_stride); } } @@ -193,11 +183,12 @@ static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, { uint32_t loop_cnt; int32_t offset, weight; + uint64_t tp0, tp1, tp2, tp3; + v16u8 out0, out1; v16i8 zero = { 0 }; - v16i8 src0, src1, src2, src3; + v16i8 src0 = { 0 }, src1 = { 0 }; v8i16 in0, in1, in2, in3; v8i16 dst0, dst1, dst2, dst3; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; v4i32 offset_vec, weight_vec, rnd_vec; offset = (offset0 + offset1) << rnd_val; @@ -209,23 +200,21 @@ static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, rnd_vec = __msa_fill_w(rnd_val + 1); for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); src0_ptr += (4 * src_stride); + INSERT_D2_SB(tp0, tp1, src0); + INSERT_D2_SB(tp2, tp3, src1); LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); src1_ptr += (4 * src2_stride); - ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, - dst0, dst1, dst2, dst3); - + ILVRL_B2_SH(zero, src0, dst0, dst1); + ILVRL_B2_SH(zero, src1, dst2, dst3); SLLI_4V(dst0, dst1, dst2, dst3, 6); - HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, - in0, in1, in2, in3, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); - - HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST6x4_UB(dst0_r, dst1_r, dst, dst_stride); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, + in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, + dst0, dst1, dst2, dst3); + PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); + ST6x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -243,8 +232,13 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, int32_t 
offset1, int32_t rnd_val) { + uint64_t tp0, tp1, tp2, tp3; int32_t offset, weight; + v16u8 out0, out1, out2; v16i8 zero = { 0 }; + v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }; + v8i16 in0, in1, in2, in3, in4, in5; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5; v4i32 offset_vec, weight_vec, rnd_vec; offset = (offset0 + offset1) << rnd_val; @@ -256,80 +250,59 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, rnd_vec = __msa_fill_w(rnd_val + 1); if (2 == height) { - v16i8 src0, src1; - v8i16 in0, in1, dst0, dst1; - v4i32 dst0_r, dst1_r, dst0_l, dst1_l; - - LD_SB2(src0_ptr, src_stride, src0, src1); + LD2(src0_ptr, src_stride, tp0, tp1); + INSERT_D2_SB(tp0, tp1, src0); LD_SH2(src1_ptr, src2_stride, in0, in1); + ILVRL_B2_SH(zero, src0, dst0, dst1); + SLLI_2V(dst0, dst1, 6); - ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1); - - dst0 <<= 6; - dst1 <<= 6; HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst0_l, dst1_l); + dst0, dst1); - HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); - ST8x2_UB(dst0_r, dst, dst_stride); + out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); + ST8x2_UB(out0, dst, dst_stride); } else if (6 == height) { - v16i8 src0, src1, src2, src3, src4, src5; - v8i16 in0, in1, in2, in3, in4, in5; - v8i16 dst0, dst1, dst2, dst3, dst4, dst5; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; - v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; - - LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); + LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); + src0_ptr += 4 * src_stride; + INSERT_D2_SB(tp0, tp1, src0); + INSERT_D2_SB(tp2, tp3, src1); + LD2(src0_ptr, src_stride, tp0, tp1); + INSERT_D2_SB(tp0, tp1, src2); + ILVRL_B2_SH(zero, src0, dst0, dst1); + ILVRL_B2_SH(zero, src1, dst2, dst3); + ILVRL_B2_SH(zero, src2, dst4, dst5); LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); - ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, - dst0, dst1, dst2, dst3); - ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5); - SLLI_4V(dst0, dst1, dst2, dst3, 6); - dst4 <<= 6; - dst5 <<= 6; - HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, - in0, in1, in2, in3, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); - HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, - weight_vec, rnd_vec, offset_vec, - dst4_r, dst5_r, dst4_l, dst5_l); - - HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, - dst4_l, dst4_r, dst5_l, dst5_r, - dst0_r, dst1_r, dst2_r); - ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + SLLI_2V(dst4, dst5, 6); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, dst0, dst1, + dst2, dst3); + HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, + offset_vec, dst4, dst5); + PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); - ST8x2_UB(dst2_r, dst, dst_stride); + ST8x2_UB(out2, dst, dst_stride); } else if (0 == height % 4) { uint32_t loop_cnt; - v16i8 src0, src1, src2, src3; - v8i16 in0, in1, in2, in3; - v8i16 dst0, dst1, dst2, dst3; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); src0_ptr += (4 * src_stride); + INSERT_D2_SB(tp0, tp1, src0); + INSERT_D2_SB(tp2, tp3, src1); + ILVRL_B2_SH(zero, src0, dst0, dst1); 
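+ /* ILVRL_B2_SH zero-extends the packed bytes to halfwords (right and left interleave with a zero vector); the SLLI_4V by 6 below then scales the 8-bit source up to the 14-bit precision of the bi-prediction intermediates. */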
+ ILVRL_B2_SH(zero, src1, dst2, dst3); LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); src1_ptr += (4 * src2_stride); - ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, - dst0, dst1, dst2, dst3); SLLI_4V(dst0, dst1, dst2, dst3, 6); - HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, - in0, in1, in2, in3, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); - - HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, + in3, weight_vec, rnd_vec, offset_vec, + dst0, dst1, dst2, dst3); + PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -351,11 +324,10 @@ static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, uint32_t loop_cnt; int32_t offset, weight; v16i8 zero = { 0 }; + v16u8 out0, out1, out2; v16i8 src0, src1, src2, src3; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 dst0, dst1, dst2, dst3, dst4, dst5; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; - v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; v4i32 offset_vec, weight_vec, rnd_vec; offset = (offset0 + offset1) << rnd_val; @@ -383,44 +355,37 @@ static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, dst4 <<= 6; dst5 <<= 6; - HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, - in0, in1, in2, in3, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); - HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, - weight_vec, rnd_vec, offset_vec, - dst4_r, dst5_r, dst4_l, dst5_l); - - HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, - dst4_l, dst4_r, dst5_l, dst5_r, - dst0_r, dst1_r, dst2_r); - ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, dst0, dst1, + dst2, dst3); + HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, + offset_vec, dst4, dst5); + PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); + ST12x4_UB(out0, out1, out2, dst, dst_stride); dst += (4 * dst_stride); } } -static void hevc_biwgt_copy_16multx4mult_msa(uint8_t *src0_ptr, - int32_t src_stride, - int16_t *src1_ptr, - int32_t src2_stride, - uint8_t *dst, - int32_t dst_stride, - int32_t height, - int32_t weight0, - int32_t weight1, - int32_t offset0, - int32_t offset1, - int32_t rnd_val, - int32_t width) +static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr, + int32_t src_stride, + int16_t *src1_ptr, + int32_t src2_stride, + uint8_t *dst, + int32_t dst_stride, + int32_t height, + int32_t weight0, + int32_t weight1, + int32_t offset0, + int32_t offset1, + int32_t rnd_val) { - uint32_t loop_cnt, cnt; - uint8_t *src0_ptr_tmp; - int16_t *src1_ptr_tmp; - uint8_t *dst_tmp; + uint32_t loop_cnt; int32_t offset, weight; + v16u8 out0, out1, out2, out3; v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v4i32 offset_vec, weight_vec, rnd_vec; offset = (offset0 + offset1) << rnd_val; @@ -431,79 +396,31 @@ static void hevc_biwgt_copy_16multx4mult_msa(uint8_t *src0_ptr, weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); - for (cnt = (width >> 4); cnt--;) { - src0_ptr_tmp = src0_ptr; - src1_ptr_tmp = src1_ptr; - dst_tmp = dst; - - for (loop_cnt = (height >> 2); 
loop_cnt--;) { - v16i8 src0, src1, src2, src3; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r; - v4i32 dst0_l, dst1_l, dst2_l, dst3_l; - - LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3); - src0_ptr_tmp += (4 * src_stride); - LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); - LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7); - src1_ptr_tmp += (4 * src2_stride); - - ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, - tmp0, tmp1, tmp2, tmp3); - ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, - tmp4, tmp5, tmp6, tmp7); - - SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); - SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); - HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, - in0, in1, in4, in5, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); - - HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, - dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); - dst_tmp += (2 * dst_stride); - - HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7, - in2, in3, in6, in7, - weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); - - HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r, - dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride); - dst_tmp += (2 * dst_stride); - } - - src0_ptr += 16; - src1_ptr += 16; - dst += 16; + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); + src1_ptr += (4 * src2_stride); + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1, + tmp2, tmp3); + ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5, + tmp6, tmp7); + SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); + SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); + HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5, + weight_vec, rnd_vec, offset_vec, tmp0, tmp1, + tmp4, tmp5); + HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7, + weight_vec, rnd_vec, offset_vec, tmp2, tmp3, + tmp6, tmp7); + PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1); + PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3); + ST_UB4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); } } -static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr, - int32_t src_stride, - int16_t *src1_ptr, - int32_t src2_stride, - uint8_t *dst, - int32_t dst_stride, - int32_t height, - int32_t weight0, - int32_t weight1, - int32_t offset0, - int32_t offset1, - int32_t rnd_val) -{ - hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, - src1_ptr, src2_stride, - dst, dst_stride, height, weight0, - weight1, offset0, offset1, rnd_val, 16); -} - static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, @@ -517,14 +434,55 @@ static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, int32_t offset1, int32_t rnd_val) { - hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, - src1_ptr, src2_stride, - dst, dst_stride, height, weight0, - weight1, offset0, offset1, rnd_val, 16); - hevc_biwgt_copy_8w_msa(src0_ptr + 16, src_stride, - src1_ptr + 16, src2_stride, - dst + 16, dst_stride, height, weight0, - weight1, offset0, offset1, rnd_val); + uint32_t loop_cnt; + int32_t offset, weight; + v16u8 out0, out1, out2, out3, out4, out5; + 
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 }; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5); + LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); + LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11); + src1_ptr += (4 * src2_stride); + + ILVRL_B2_SH(zero, src0, dst0, dst1); + ILVRL_B2_SH(zero, src1, dst2, dst3); + ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5); + ILVRL_B2_SH(zero, src4, dst6, dst7); + ILVRL_B2_SH(zero, src5, dst8, dst9); + ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11); + SLLI_4V(dst0, dst1, dst2, dst3, 6); + SLLI_4V(dst4, dst5, dst6, dst7, 6); + SLLI_4V(dst8, dst9, dst10, dst11, 6); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5, + weight_vec, rnd_vec, offset_vec, dst0, dst1, + dst2, dst3); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6, + weight_vec, rnd_vec, offset_vec, dst4, dst5, + dst6, dst7); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10, + in11, weight_vec, rnd_vec, offset_vec, + dst8, dst9, dst10, dst11); + PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); + PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); + ST_UB4(out0, out1, out3, out4, dst, dst_stride); + ST8x4_UB(out2, out5, dst + 16, dst_stride); + dst += (4 * dst_stride); + } } static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr, @@ -540,10 +498,52 @@ static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr, int32_t offset1, int32_t rnd_val) { - hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, - src1_ptr, src2_stride, - dst, dst_stride, height, weight0, - weight1, offset0, offset1, rnd_val, 32); + uint32_t loop_cnt; + int32_t offset, weight; + v16u8 out0, out1, out2, out3; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src0_ptr += src_stride; + LD_SB2(src0_ptr, 16, src2, src3); + src0_ptr += src_stride; + LD_SH4(src1_ptr, 8, in0, in1, in2, in3); + src1_ptr += src2_stride; + LD_SH4(src1_ptr, 8, in4, in5, in6, in7); + src1_ptr += src2_stride; + + ILVRL_B2_SH(zero, src0, tmp0, tmp4); + ILVRL_B2_SH(zero, src1, tmp1, tmp5); + ILVRL_B2_SH(zero, src2, tmp2, tmp6); + ILVRL_B2_SH(zero, src3, tmp3, tmp7); + SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); + SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); + HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, tmp0, tmp4, + tmp1, tmp5); + HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7, + weight_vec, 
rnd_vec, offset_vec, tmp2, tmp6, + tmp3, tmp7); + PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1); + PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3); + ST_UB2(out0, out1, dst, 16); + dst += dst_stride; + ST_UB2(out2, out3, dst, 16); + dst += dst_stride; + } } static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr, @@ -559,10 +559,43 @@ static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr, int32_t offset1, int32_t rnd_val) { - hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, - src1_ptr, src2_stride, - dst, dst_stride, height, weight0, - weight1, offset0, offset1, rnd_val, 48); + uint32_t loop_cnt; + int32_t offset, weight; + v16u8 out0, out1, out2; + v16i8 src0, src1, src2; + v16i8 zero = { 0 }; + v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (loop_cnt = 64; loop_cnt--;) { + LD_SB3(src0_ptr, 16, src0, src1, src2); + src0_ptr += src_stride; + LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5); + src1_ptr += src2_stride; + + ILVRL_B2_SH(zero, src0, dst0, dst1); + ILVRL_B2_SH(zero, src1, dst2, dst3); + ILVRL_B2_SH(zero, src2, dst4, dst5); + SLLI_4V(dst0, dst1, dst2, dst3, 6); + SLLI_2V(dst4, dst5, 6); + HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, dst0, dst1, + dst2, dst3); + HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, + offset_vec, dst4, dst5); + PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); + ST_UB2(out0, out1, dst, 16); + ST_UB(out2, dst + 32); + dst += dst_stride; + } } static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr, @@ -578,10 +611,46 @@ static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr, int32_t offset1, int32_t rnd_val) { - hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride, - src1_ptr, src2_stride, - dst, dst_stride, height, weight0, - weight1, offset0, offset1, rnd_val, 64); + uint32_t loop_cnt; + int32_t offset, weight; + v16u8 out0, out1, out2, out3; + v16i8 zero = { 0 }; + v16i8 src0, src1, src2, src3; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 offset_vec, weight_vec, rnd_vec; + + offset = (offset0 + offset1) << rnd_val; + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src0_ptr, 16, src0, src1, src2, src3); + src0_ptr += src_stride; + LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7); + src1_ptr += src2_stride; + + ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1, + tmp2, tmp3); + ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5, + tmp6, tmp7); + SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); + SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); + HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, tmp0, tmp4, + tmp1, tmp5); + HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7, + weight_vec, rnd_vec, offset_vec, tmp2, tmp6, + tmp3, tmp7); + PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1); + PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3); + ST_UB4(out0, out1, out2, out3, dst, 16); + dst += dst_stride; + 
} } static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, @@ -599,17 +668,16 @@ static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t rnd_val) { uint32_t loop_cnt; - int32_t offset, weight; + int32_t offset, weight, constant; v8i16 filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3; v16i8 mask1, mask2, mask3; v16i8 vec0, vec1, vec2, vec3; v8i16 dst0, dst1; v8i16 in0, in1, in2, in3; - v4i32 dst0_r, dst1_r, dst0_l, dst1_l; - v8i16 filter_vec, const_vec; + v8i16 filter_vec, out0, out1; v4i32 weight_vec, offset_vec, rnd_vec; - v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); src0_ptr -= 3; filter_vec = LD_SH(filter); @@ -622,9 +690,10 @@ static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset += constant; - const_vec = __msa_ldi_h(128); - const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -639,21 +708,19 @@ static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst0 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst0, dst0, dst0, dst0); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst1 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst1, dst1, dst1, dst1); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst0_l, dst1_l); + out0, out1); - HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); - ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride); + out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); } } @@ -673,25 +740,25 @@ static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t rnd_val) { uint32_t loop_cnt; - int32_t offset, weight; + int32_t offset, weight, constant; v8i16 filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3; v16i8 mask1, mask2, mask3; v16i8 vec0, vec1, vec2, vec3; v8i16 dst0, dst1, dst2, dst3; v8i16 in0, in1, in2, in3; - v4i32 dst0_r, dst1_r, dst0_l, dst1_l, dst2_r, dst3_r, dst2_l, dst3_l; - v8i16 filter_vec, const_vec; + v8i16 filter_vec, out0, out1, out2, out3; v4i32 weight_vec, offset_vec, rnd_vec; - v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); src0_ptr -= 3; offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset += constant; - const_vec = __msa_ldi_h(128); - const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -712,34 +779,28 @@ static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr, VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst0 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst0, dst0, dst0, dst0); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - 
dst1 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst1, dst1, dst1, dst1); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst2 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst2, dst2, dst2, dst2); + dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst3 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst3, dst3, dst3, dst3); + dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3, weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); + out0, out1, out2, out3); - HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST8x4_UB(dst0_r, dst1_r, dst, dst_stride); + PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -758,12 +819,85 @@ static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t offset1, int32_t rnd_val) { - hevc_hz_biwgt_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride, - dst, dst_stride, filter, height, - weight0, weight1, offset0, offset1, rnd_val); - hevc_hz_biwgt_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, - dst + 8, dst_stride, filter, height, - weight0, weight1, offset0, offset1, rnd_val); + uint32_t loop_cnt; + int32_t offset, weight, constant; + v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3; + v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; + v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3; + v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec; + v4i32 weight_vec, offset_vec, rnd_vec; + + src0_ptr -= 3; + + weight0 = weight0 & 0x0000FFFF; + weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset = (offset0 + offset1) << rnd_val; + offset += constant; + + offset_vec = __msa_fill_w(offset); + weight_vec = __msa_fill_w(weight); + rnd_vec = __msa_fill_w(rnd_val + 1); + + filter_vec = LD_SH(filter); + SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask0 = LD_SB(&ff_hevc_mask_arr[0]); + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + mask4 = LD_SB(&ff_hevc_mask_arr[16]); + mask5 = mask4 + 2; + mask6 = mask4 + 4; + mask7 = mask4 + 6; + + for (loop_cnt = 4; loop_cnt--;) { + LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); + LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, + vec3); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, + vec3); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2, + vec3); + dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, + vec3); + dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3, + weight_vec, rnd_vec, offset_vec, out0, out1, out2, 
+ out3); + PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3); + src0_ptr += (4 * src_stride); + LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3); + src1_ptr += (4 * src2_stride); + ILVR_D2_SH(in1, in0, in3, in2, in0, in1); + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, + vec3); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, + vec3); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, + offset_vec, out0, out1); + out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride); + dst += (4 * dst_stride); + } } static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, @@ -781,15 +915,14 @@ static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t rnd_val) { uint32_t loop_cnt; - int32_t offset, weight; + int32_t offset, weight, constant; v16i8 src0, src1, src2, src3; v8i16 in0, in1, in2, in3; v8i16 filt0, filt1, filt2, filt3; v16i8 mask1, mask2, mask3; - v8i16 filter_vec, const_vec; + v8i16 filter_vec, out0, out1, out2, out3; v16i8 vec0, vec1, vec2, vec3; v8i16 dst0, dst1, dst2, dst3; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; v4i32 weight_vec, offset_vec, rnd_vec; v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; @@ -797,9 +930,10 @@ static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset += constant; - const_vec = __msa_ldi_h(128); - const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -824,34 +958,28 @@ static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst0 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst0, dst0, dst0, dst0); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst1 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst1, dst1, dst1, dst1); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst2 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst2, dst2, dst2, dst2); + dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst3 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst3, dst3, dst3, dst3); + dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3, weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); + out0, out1, out2, out3); - HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST_SW2(dst0_r, dst1_r, dst, dst_stride); + PCKEV_B2_SH(out1, out0, out3, out2, 
out0, out1); + ST_SH2(out0, out1, dst, dst_stride); dst += (2 * dst_stride); } } @@ -872,25 +1000,26 @@ static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, { uint32_t loop_cnt; uint64_t dst_val0; - int32_t offset, weight; + int32_t offset, weight, constant; v16i8 src0, src1; v8i16 in0, in1, in2; v8i16 filt0, filt1, filt2, filt3; v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; v16i8 vec0, vec1, vec2, vec3; v8i16 dst0, dst1, dst2; - v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l; - v8i16 filter_vec, const_vec; + v4i32 dst2_r, dst2_l; + v8i16 filter_vec, out0, out1, out2; v4i32 weight_vec, offset_vec, rnd_vec; - v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); src0_ptr = src0_ptr - 3; offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset += constant; - const_vec = __msa_ldi_h(128); - const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -906,33 +1035,30 @@ static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, mask6 = mask0 + 12; mask7 = mask0 + 14; - for (loop_cnt = height; loop_cnt--;) { - LD_SB2(src0_ptr, 16, src0, src1); - src0_ptr += src_stride; - LD_SH2(src1_ptr, 8, in0, in1); - in2 = LD_SH(src1_ptr + 16); - src1_ptr += src2_stride; - XORI_B2_128_SB(src0, src1); + LD_SB2(src0_ptr, 16, src0, src1); + src0_ptr += src_stride; + LD_SH2(src1_ptr, 8, in0, in1); + in2 = LD_SH(src1_ptr + 16); + src1_ptr += src2_stride; + XORI_B2_128_SB(src0, src1); + for (loop_cnt = 31; loop_cnt--;) { VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst0 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst0, dst0, dst0, dst0); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); - dst1 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst1, dst1, dst1, dst1); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst2 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst2, dst2, dst2, dst2); + dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst0_l, dst1_l); + out0, out1); ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l); dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, @@ -940,16 +1066,44 @@ static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec); SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); - dst2_r = CLIP_SW_0_255(dst2_r); - dst2_l = CLIP_SW_0_255(dst2_l); + dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); + out2 = CLIP_SH_0_255(dst2_r); - HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); - HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r); - dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0); - ST_SW(dst0_r, dst); + LD_SB2(src0_ptr, 16, src0, src1); + src0_ptr += src_stride; + LD_SH2(src1_ptr, 8, in0, in1); + in2 = LD_SH(src1_ptr + 16); + src1_ptr += src2_stride; + XORI_B2_128_SB(src0, src1); + PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); + dst_val0 = __msa_copy_u_d((v2i64) out2, 0); + ST_SH(out0, dst); SD(dst_val0, dst + 16); dst 
+= dst_stride; } + + VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); + dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec, + out0, out1); + ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l); + dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec); + dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec); + SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); + dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); + out2 = CLIP_SH_0_255(dst2_r); + PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); + dst_val0 = __msa_copy_u_d((v2i64) out2, 0); + ST_SH(out0, dst); + SD(dst_val0, dst + 16); + dst += dst_stride; } static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, @@ -967,25 +1121,25 @@ static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t rnd_val) { uint32_t loop_cnt; - int32_t offset, weight; + int32_t offset, weight, constant; v16i8 src0, src1, src2; v8i16 in0, in1, in2, in3; v8i16 filt0, filt1, filt2, filt3; - v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; v16i8 vec0, vec1, vec2, vec3; v8i16 dst0, dst1, dst2, dst3; - v8i16 filter_vec, const_vec; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filter_vec, out0, out1, out2, out3; v4i32 weight_vec, offset_vec, rnd_vec; src0_ptr -= 3; offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset += constant; - const_vec = __msa_ldi_h(128); - const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -1012,34 +1166,28 @@ static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst0 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst0, dst0, dst0, dst0); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); - dst1 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst1, dst1, dst1, dst1); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst2 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst2, dst2, dst2, dst2); + dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst3 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst3, dst3, dst3, dst3); + dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3, weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst2_r, dst3_r, - dst0_l, dst1_l, dst2_l, dst3_l); + out0, out1, out2, out3); - 
HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r, - dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r); - ST_SW2(dst0_r, dst1_r, dst, 16); + PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); + ST_SH2(out0, out1, dst, 16); dst += dst_stride; } } @@ -1059,27 +1207,25 @@ static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t rnd_val) { uint32_t loop_cnt; - int32_t offset, weight; - uint64_t dst_val0; - v16i8 src0, src1, src2, src3; - v8i16 in0, in1, in2, in3, in4, in5; + int32_t offset, weight, constant; + v16i8 src0, src1, src2, src3, src4; + v8i16 in0, in1, in2, in3; v8i16 filt0, filt1, filt2, filt3; - v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; v16i8 vec0, vec1, vec2, vec3; - v8i16 dst0, dst1, dst2, dst3, dst4, dst5; - v8i16 filter_vec, const_vec; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; - v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l; + v8i16 dst0, dst1, dst2, dst3; + v8i16 filter_vec, out0, out1, out2, out3; v4i32 weight_vec, offset_vec, rnd_vec; src0_ptr -= 3; offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset += constant; - const_vec = __msa_ldi_h(128); - const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -1095,86 +1241,57 @@ static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr, mask6 = mask0 + 12; mask7 = mask0 + 14; - for (loop_cnt = height; loop_cnt--;) { - LD_SB3(src0_ptr, 16, src0, src1, src2); - src3 = LD_SB(src0_ptr + 40); + for (loop_cnt = 64; loop_cnt--;) { + LD_SB2(src0_ptr, 16, src0, src1); + src2 = LD_SB(src0_ptr + 24); + LD_SH4(src1_ptr, 8, in0, in1, in2, in3); + XORI_B3_128_SB(src0, src1, src2); + LD_SB2(src0_ptr + 32, 8, src3, src4); src0_ptr += src_stride; - LD_SH2(src1_ptr, 8, in0, in1); - in2 = LD_SH(src1_ptr + 16); - XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B2_128_SB(src3, src4); VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst0 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst0, dst0, dst0, dst0); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); - dst1 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst1, dst1, dst1, dst1); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst2 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst2, dst2, dst2, dst2); - VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, - vec0, vec1, vec2, vec3); - dst3 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst3, dst3, dst3, dst3); + dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst4 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst4, dst4, dst4, dst4); - VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, - vec0, vec1, vec2, vec3); - dst5 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst5, dst5, dst5, dst5); + dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); - 
HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, + HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3, weight_vec, rnd_vec, offset_vec, - dst0_r, dst1_r, dst0_l, dst1_l); + out0, out1, out2, out3); - ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l); - dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, - (v8i16) weight_vec); - dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, - (v8i16) weight_vec); - SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); - dst2_r = CLIP_SW_0_255(dst2_r); - dst2_l = CLIP_SW_0_255(dst2_l); - - HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); - HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r); - dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0); - ST_SW(dst0_r, dst); - SD(dst_val0, dst + 16); + PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); + ST_SH2(out0, out1, dst, 16); - LD_SH2(src1_ptr + 24, 8, in3, in4); - in5 = LD_SH(src1_ptr + 40); + LD_SH2(src1_ptr + 32, 8, in2, in3); src1_ptr += src2_stride; - HEVC_BIW_RND_CLIP2(dst3, dst4, in3, in4, + VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, + vec0, vec1, vec2, vec3); + dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, + filt3); + + HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3, weight_vec, rnd_vec, offset_vec, - dst3_r, dst4_r, dst3_l, dst4_l); + out0, out1); - ILVRL_H2_SW(dst5, in5, dst5_r, dst5_l); - dst5_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_r, - (v8i16) weight_vec); - dst5_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_l, - (v8i16) weight_vec); - SRAR_W2_SW(dst5_r, dst5_l, rnd_vec); - dst5_r = CLIP_SW_0_255(dst5_r); - dst5_l = CLIP_SW_0_255(dst5_l); - - HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r); - HEVC_PCK_SW_SB2(dst3_l, dst3_r, dst3_r); - dst_val0 = __msa_copy_u_d((v2i64) dst3_r, 0); - SD(dst_val0, dst + 24); - ST_SW(dst4_r, dst + 32); + out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); + ST_SH(out0, dst + 32); dst += dst_stride; } } @@ -1197,25 +1314,25 @@ static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr, uint8_t *dst_tmp; int16_t *src1_ptr_tmp; uint32_t loop_cnt, cnt; - int32_t offset, weight; + int32_t offset, weight, constant; v16i8 src0, src1, src2; v8i16 in0, in1, in2, in3; v8i16 filt0, filt1, filt2, filt3; - v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; v16i8 vec0, vec1, vec2, vec3; v8i16 dst0, dst1, dst2, dst3; - v8i16 filter_vec, const_vec; - v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; + v8i16 filter_vec, out0, out1, out2, out3; v4i32 weight_vec, offset_vec, rnd_vec; src0_ptr -= 3; offset = (offset0 + offset1) << rnd_val; weight0 = weight0 & 0x0000FFFF; weight = weight0 | (weight1 << 16); + constant = 128 * weight1; + constant <<= 6; + offset += constant; - const_vec = __msa_ldi_h(128); - const_vec <<= 6; offset_vec = __msa_fill_w(offset); weight_vec = __msa_fill_w(weight); rnd_vec = __msa_fill_w(rnd_val + 1); @@ -1246,34 +1363,28 @@ static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr, VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); - dst0 = const_vec; - DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, - dst0, dst0, dst0, dst0); + dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, + filt2, filt3); VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); - dst1 = const_vec; - 
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
-                     dst1, dst1, dst1, dst1);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                 filt2, filt3);
         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                    vec0, vec1, vec2, vec3);
-        dst2 = const_vec;
-        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
-                     dst2, dst2, dst2, dst2);
+        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                 filt2, filt3);
         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                    vec0, vec1, vec2, vec3);
-        dst3 = const_vec;
-        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
-                     dst3, dst3, dst3, dst3);
+        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                 filt2, filt3);

         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           out0, out1, out2, out3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst_tmp, 16);
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST_SH2(out0, out1, dst_tmp, 16);
         dst_tmp += 32;
     }

@@ -1309,22 +1420,22 @@ static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
     v16i8 src2110, src4332, src6554, src8776, src10998;
     v16i8 src12111110, src14131312;
     v8i16 dst10, dst32, dst54, dst76;
-    v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
-    v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
     v8i16 filt0, filt1, filt2, filt3;
-    v8i16 filter_vec, const_vec;
-    v4i32 weight_vec, offset_vec, rnd_vec;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

     src0_ptr -= (3 * src_stride);

     offset = (offset0 + offset1) << rnd_val;
     weight0 = weight0 & 0x0000FFFF;
     weight = weight0 | (weight1 << 16);

-    const_vec = __msa_ldi_h(128);
+    const_vec = __msa_ldi_w(128);
     const_vec <<= 6;
     offset_vec = __msa_fill_w(offset);
     weight_vec = __msa_fill_w(weight);
     rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;

     filter_vec = LD_SH(filter);
     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
@@ -1357,28 +1468,22 @@ static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                    src8776, src10998, src12111110, src14131312);
         XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

-        dst10 = const_vec;
-        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
-                     filt2, filt3, dst10, dst10, dst10, dst10);
-        dst32 = const_vec;
-        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
-                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
-        dst54 = const_vec;
-        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
-                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
-        dst76 = const_vec;
-        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
-                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
+                    filt0, dst10, dst32, dst54, dst76);
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
+                     filt1, dst10, dst32, dst54, dst76);
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
+                     filt2, filt2, dst10, dst32, dst54, dst76);
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
+                     filt3, filt3, dst10, dst32, dst54, dst76);

         HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst10_r, dst32_r, dst54_r, dst76_r,
-                           dst10_l, dst32_l, dst54_l, dst76_l);
+                           out0, out1, out2, out3);

-        HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
-                        dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
-        ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
         dst += (8 * dst_stride);

         src2110 = src10998;
@@ -1411,20 +1516,21 @@ static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
     v8i16 tmp0, tmp1, tmp2, tmp3;
     v8i16 filt0, filt1, filt2, filt3;
-    v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
-    v4i32 weight_vec, offset_vec, rnd_vec;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

     src0_ptr -= (3 * src_stride);

     offset = (offset0 + offset1) << rnd_val;
     weight0 = weight0 & 0x0000FFFF;
     weight = weight0 | (weight1 << 16);

-    const_vec = __msa_ldi_h(128);
+    const_vec = __msa_ldi_w(128);
     const_vec <<= 6;
     offset_vec = __msa_fill_w(offset);
     weight_vec = __msa_fill_w(weight);
     rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;

     filter_vec = LD_SH(filter);
     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
@@ -1447,28 +1553,22 @@ static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                    src76_r, src87_r, src98_r, src109_r);

-        tmp0 = const_vec;
-        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
-                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
-        tmp1 = const_vec;
-        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
-                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
-        tmp2 = const_vec;
-        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
-                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
-        tmp3 = const_vec;
-        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
-                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
+                    filt0, tmp0, tmp1, tmp2, tmp3);
+        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
+                     filt1, tmp0, tmp1, tmp2, tmp3);
+        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
+                     filt2, tmp0, tmp1, tmp2, tmp3);
+        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
+                     filt3, tmp0, tmp1, tmp2, tmp3);

         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           out0, out1, out2, out3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
         dst += (4 * dst_stride);

         src10_r = src54_r;
@@ -1506,20 +1606,22 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
     v16i8 src21_l, src43_l, src65_l, src87_l;
     v16i8 src2110, src4332, src6554, src8776;
     v8i16 filt0, filt1, filt2, filt3;
-    v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
-    v4i32 weight_vec, offset_vec, rnd_vec;
+    v8i16 out0, out1, out2, filter_vec;
+    v4i32 dst2_r, dst2_l;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

     src0_ptr -= (3 * src_stride);

     offset = (offset0 + offset1) << rnd_val;
     weight0 = weight0 & 0x0000FFFF;
     weight = weight0 | (weight1 << 16);

-    const_vec = __msa_ldi_h(128);
+    const_vec = __msa_ldi_w(128);
     const_vec <<= 6;
     offset_vec = __msa_fill_w(offset);
     weight_vec = __msa_fill_w(weight);
     rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;

     filter_vec = LD_SH(filter);
     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
@@ -1537,7 +1639,7 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
     ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
                src2110, src4332, src6554);

-    for (loop_cnt = (height >> 1); loop_cnt--;) {
+    for (loop_cnt = 8; loop_cnt--;) {
         LD_SB2(src0_ptr, src_stride, src7, src8);
         src0_ptr += (2 * src_stride);
         LD_SH2(src1_ptr, src2_stride, in0, in1);
@@ -1550,19 +1652,18 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
         ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
         src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);

-        tmp0 = const_vec;
-        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
-                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
-        tmp1 = const_vec;
-        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
-                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
-        tmp2 = const_vec;
-        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
-                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2);
+        DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
+        tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
+        DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
+        tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
+        DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
+        tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);

         HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst0_l, dst1_l);
+                           out0, out1);

         ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
         dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
@@ -1570,13 +1671,11 @@
         dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                  (v8i16) weight_vec);
         SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
-        dst2_r = CLIP_SW_0_255(dst2_r);
-        dst2_l = CLIP_SW_0_255(dst2_l);
-
-        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
-        HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
-        ST8x2_UB(dst0_r, dst, dst_stride);
-        ST4x2_UB(dst2_r, dst + 8, dst_stride);
+        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+        out2 = CLIP_SH_0_255(dst2_r);
+        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
+        ST8x2_UB(out0, dst, dst_stride);
+        ST4x2_UB(out2, dst + 8, dst_stride);
         dst += (2 * dst_stride);

         src10_r = src32_r;
@@ -1620,9 +1719,9 @@ static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
     v16i8 src21_l, src43_l, src65_l, src87_l;
     v8i16 tmp0, tmp1, tmp2, tmp3;
     v8i16 filt0, filt1, filt2, filt3;
-    v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
-    v4i32 weight_vec, offset_vec, rnd_vec;
+    v8i16 filter_vec;
+    v8i16 out0, out1, out2, out3;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

     src0_ptr -= (3 * src_stride);

@@ -1630,11 +1729,13 @@ static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
     weight0 = weight0 & 0x0000FFFF;
     weight = weight0 | (weight1 << 16);

-    const_vec = __msa_ldi_h(128);
+    const_vec = __msa_ldi_w(128);
     const_vec <<= 6;
     offset_vec = __msa_fill_w(offset);
     weight_vec = __msa_fill_w(weight);
     rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;

     filter_vec = LD_SH(filter);
     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
@@ -1667,28 +1768,22 @@ static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
             ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
             ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

-            tmp0 = const_vec;
-            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
-                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
-            tmp1 = const_vec;
-            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
-                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
-            tmp2 = const_vec;
-            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
-                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
-            tmp3 = const_vec;
-            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
-                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+            DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
+                        filt0, filt0, tmp0, tmp1, tmp2, tmp3);
+            DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
+                         filt1, filt1, tmp0, tmp1, tmp2, tmp3);
+            DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
+                         filt2, filt2, tmp0, tmp1, tmp2, tmp3);
+            DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
+                         filt3, filt3, tmp0, tmp1, tmp2, tmp3);

             HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                                in0, in1, in2, in3,
                                weight_vec, rnd_vec, offset_vec,
-                               dst0_r, dst1_r, dst2_r, dst3_r,
-                               dst0_l, dst1_l, dst2_l, dst3_l);
+                               out0, out1, out2, out3);

-            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
-                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
-            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
+            ST_SH2(out0, out1, dst_tmp, dst_stride);
             dst_tmp += (2 * dst_stride);

             src10_r = src32_r;
@@ -2615,7 +2710,6 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     v8i16 dst0, dst1;
     v16i8 vec0, vec1;
     v8i16 in0, in1, in2, in3;
-    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
@@ -2650,10 +2744,10 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
     HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst0_l, dst1_l);
+                       dst0, dst1);

-    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
-    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
 }

 static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
@@ -2678,7 +2772,6 @@ static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
     v16i8 mask1;
     v16i8 vec0, vec1;
     v8i16 dst0, dst1, dst2, dst3;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
@@ -2727,12 +2820,10 @@ static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -2791,7 +2882,6 @@ static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
     v16i8 vec0, vec1;
     v8i16 in0, in1, in2, in3;
     v8i16 dst0, dst1, dst2, dst3;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
@@ -2835,12 +2925,10 @@ static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST6x4_UB(dst0, dst1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -2867,7 +2955,6 @@ static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
     v16i8 mask1, vec0, vec1;
     v8i16 dst0, dst1;
     v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
     v4i32 weight_vec, offset_vec, rnd_vec;

     src0_ptr -= 1;
@@ -2898,10 +2985,10 @@ static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
     HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst0_l, dst1_l);
+                       dst0, dst1);

-    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
-    ST8x2_UB(dst0_r, dst, dst_stride);
+    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST8x2_UB(dst0, dst, dst_stride);
 }

 static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
@@ -2927,8 +3014,6 @@ static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     v16i8 vec0, vec1;
     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
     v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
-    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
     v4i32 weight_vec, offset_vec, rnd_vec;

     src0_ptr -= 1;
@@ -2975,18 +3060,16 @@ static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                        in0, in1, in2, in3,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst2_r, dst3_r,
-                       dst0_l, dst1_l, dst2_l, dst3_l);
+                       dst0, dst1, dst2, dst3);
     HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
                        weight_vec, rnd_vec, offset_vec,
-                       dst4_r, dst5_r, dst4_l, dst5_l);
+                       dst4, dst5);

-    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
-                     dst2_l, dst2_r, dst3_l, dst3_r,
-                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
-    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+    dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
     dst += (4 * dst_stride);
-    ST8x2_UB(dst2_r, dst, dst_stride);
+    ST8x2_UB(dst3, dst, dst_stride);
 }

 static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
@@ -3012,7 +3095,6 @@ static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
     v16i8 vec0, vec1;
     v8i16 in0, in1, in2, in3;
     v8i16 dst0, dst1, dst2, dst3;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
@@ -3055,12 +3137,10 @@ static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -3123,8 +3203,6 @@ static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
     v16i8 vec0, vec1;
     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
     v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
-    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
     v4i32 weight_vec, offset_vec, rnd_vec;

     src0_ptr -= 1;
@@ -3176,17 +3254,14 @@ static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);
         HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
                            weight_vec, rnd_vec, offset_vec,
-                           dst4_r, dst5_r, dst4_l, dst5_l);
+                           dst4, dst5);

-        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
-                         dst2_l, dst2_r, dst3_l, dst3_r,
-                         dst4_l, dst4_r, dst5_l, dst5_r,
-                         dst0_r, dst1_r, dst2_r);
-        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -3215,7 +3290,6 @@ static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
     v16i8 vec0, vec1;
     v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     v4i32 weight_vec, offset_vec, rnd_vec;

     src0_ptr -= 1;
@@ -3271,23 +3345,19 @@ static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
         dst += (2 * dst_stride);

         HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
                            in4, in5, in6, in7,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
         dst += (2 * dst_stride);
     }
 }
@@ -3317,7 +3387,6 @@ static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
     v8i16 dst0, dst1, dst2, dst3;
     v8i16 in0, in1, in2, in3, in4, in5;
     v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     v4i32 weight_vec, offset_vec, rnd_vec;

     src0_ptr -= 1;
@@ -3364,12 +3433,10 @@ static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
         dst += (2 * dst_stride);

         /* 8 width */
         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
@@ -3380,10 +3447,10 @@ static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
         HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst0_l, dst1_l);
+                           dst0, dst1);

-        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
-        ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST8x2_UB(dst0, dst_tmp, dst_stride);
         dst_tmp += (2 * dst_stride);
     }
 }
@@ -3412,7 +3479,6 @@ static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
     v16i8 vec0, vec1;
     v8i16 in0, in1, in2, in3;
     v8i16 filter_vec, const_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     v4i32 weight_vec, offset_vec, rnd_vec;

     src0_ptr -= 1;
@@ -3457,12 +3523,10 @@ static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           dst0, dst1, dst2, dst3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst, 16);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, 16);
         dst += dst_stride;
     }
 }
@@ -3554,7 +3618,6 @@ static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
     v16i8 src2110, src4332, src6554;
     v8i16 dst10, dst32;
-    v4i32 dst10_r, dst32_r, dst10_l, dst32_l;
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
@@ -3597,10 +3660,10 @@ static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
                        weight_vec, rnd_vec, offset_vec,
-                       dst10_r, dst32_r, dst10_l, dst32_l);
+                       dst10, dst32);

-    HEVC_PCK_SW_SB4(dst10_l, dst10_r, dst32_l, dst32_r, dst10_r);
-    ST4x4_UB(dst10_r, dst10_r, 0, 1, 2, 3, dst, dst_stride);
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
+    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
     dst += (4 * dst_stride);
 }
@@ -3626,8 +3689,6 @@ static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
     v16i8 src2110, src4332, src6554, src8776;
     v8i16 dst10, dst32, dst54, dst76;
-    v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
-    v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
@@ -3687,12 +3748,10 @@ static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst10_r, dst32_r, dst54_r, dst76_r,
-                           dst10_l, dst32_l, dst54_l, dst76_l);
+                           dst10, dst32, dst54, dst76);

-        HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
-                        dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
-        ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
+        ST4x8_UB(dst10, dst32, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -3751,7 +3810,6 @@ static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

     src0_ptr -= src_stride;
@@ -3798,12 +3856,10 @@ static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp2, tmp3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST6x4_UB(tmp0, tmp1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -3829,7 +3885,6 @@ static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;

     src0_ptr -= src_stride;
@@ -3862,10 +3917,10 @@ static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
     HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst0_l, dst1_l);
+                       tmp0, tmp1);

-    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
-    ST8x2_UB(dst0_r, dst, dst_stride);
+    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(tmp0, dst, dst_stride);
 }

 static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
@@ -3891,8 +3946,6 @@ static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
-    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;

     src0_ptr -= src_stride;
@@ -3936,18 +3989,16 @@ static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                        in0, in1, in2, in3,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst2_r, dst3_r,
-                       dst0_l, dst1_l, dst2_l, dst3_l);
+                       tmp0, tmp1, tmp2, tmp3);
     HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                        weight_vec, rnd_vec, offset_vec,
-                       dst4_r, dst5_r, dst4_l, dst5_l);
+                       tmp4, tmp5);

-    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
-                     dst2_l, dst2_r, dst3_l, dst3_r,
-                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
-    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+    tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
     dst += (4 * dst_stride);
-    ST8x2_UB(dst2_r, dst, dst_stride);
+    ST8x2_UB(tmp3, dst, dst_stride);
 }

 static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
@@ -3973,7 +4024,6 @@ static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

     src0_ptr -= src_stride;
@@ -4020,12 +4070,10 @@ static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp2, tmp3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -4086,8 +4134,6 @@ static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
-    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;

     src0_ptr -= (1 * src_stride);
@@ -4147,17 +4193,14 @@ static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp2, tmp3);
         HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                            weight_vec, rnd_vec, offset_vec,
-                           dst4_r, dst5_r, dst4_l, dst5_l);
+                           tmp4, tmp5);

-        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
-                         dst2_l, dst2_r, dst3_l, dst3_r,
-                         dst4_l, dst4_r, dst5_l, dst5_r,
-                         dst0_r, dst1_r, dst2_r);
-        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+        ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -4186,7 +4229,6 @@ static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

     src0_ptr -= src_stride;
@@ -4231,11 +4273,9 @@ static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
-                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+                           tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
         dst += (2 * dst_stride);

         LD_SB2(src0_ptr, src_stride, src5, src2);
         src0_ptr += (2 * src_stride);
@@ -4258,12 +4298,10 @@ static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp2, tmp3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
-                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
         dst += (2 * dst_stride);
     }
 }
@@ -4294,8 +4332,6 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
-    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;

     src0_ptr -= src_stride;
@@ -4357,19 +4393,17 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp4, tmp5);
         /* 8width */
         HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                            weight_vec, rnd_vec, offset_vec,
-                           dst4_r, dst5_r, dst4_l, dst5_l);
+                           tmp2, tmp3);

         /* 16width */
-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
-                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
         /* 8width */
-        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
-        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
+        ST8x2_UB(tmp2, dst + 16, dst_stride);
         dst += (2 * dst_stride);

         /* 16width */
@@ -4404,20 +4438,18 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp4, tmp5);
         /* 8width */
         HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                            weight_vec, rnd_vec, offset_vec,
-                           dst4_r, dst5_r, dst4_l, dst5_l);
+                           tmp2, tmp3);

         /* 16width */
-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
-                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
         /* 8width */
-        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
-        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
+        ST8x2_UB(tmp2, dst + 16, dst_stride);
         dst += (2 * dst_stride);
     }
 }
@@ -4449,8 +4481,6 @@ static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
     v8i16 filt0, filt1;
     v8i16 filter_vec, const_vec;
     v4i32 weight_vec, offset_vec, rnd_vec;
-    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
-    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;

     src0_ptr -= src_stride;
@@ -4501,12 +4531,10 @@ static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp4, tmp5);

         /* 16width */
-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
-                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
         dst += (2 * dst_stride);

         src10_r = src32_r;
@@ -4537,13 +4565,11 @@ static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
                            in4, in5, in6, in7,
                            weight_vec, rnd_vec, offset_vec,
-                           dst4_r, dst5_r, dst6_r, dst7_r,
-                           dst4_l, dst5_l, dst6_l, dst7_l);
+                           tmp2, tmp3, tmp6, tmp7);

         /* next 16width */
-        HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
-                        dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
-        ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
+        PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
+        ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
         dst_tmp += (2 * dst_stride);

         src76_r = src98_r;
@@ -4678,7 +4704,6 @@ static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 tmp0, tmp1;
-    v4i32 dst0_l, dst1_l;
     v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
     v4i32 weight_vec, offset_vec, rnd_vec;
@@ -4756,10 +4781,10 @@ static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
     PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
     HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst0_l, dst1_l);
+                       tmp0, tmp1);

-    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
-    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
 }

 static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
@@ -4789,7 +4814,6 @@ static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
-    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
     v8i16 tmp0, tmp1, tmp2, tmp3;
     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
@@ -4899,12 +4923,10 @@ static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp2, tmp3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST4x8_UB(tmp0, tmp1, dst, dst_stride);
         dst += (8 * dst_stride);
     }
 }
@@ -5065,12 +5087,10 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            in0, in1, in2, in3,
                            weight_vec, rnd_vec, offset_vec,
-                           dst0_r, dst1_r, dst2_r, dst3_r,
-                           dst0_l, dst1_l, dst2_l, dst3_l);
+                           tmp0, tmp1, tmp2, tmp3);

-        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST6x4_UB(tmp0, tmp1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }
@@ -5175,9 +5195,9 @@ static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
     HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst0_l, dst1_l);
-    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
-    ST8x2_UB(dst0_r, dst, dst_stride);
+                       tmp0, tmp1);
+    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(tmp0, dst, dst_stride);
 }

 static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
@@ -5309,12 +5329,10 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                        in0, in1, in2, in3,
                        weight_vec, rnd_vec, offset_vec,
-                       dst0_r, dst1_r, dst2_r, dst3_r,
-                       dst0_l, dst1_l, dst2_l, dst3_l);
+                       tmp0, tmp1, tmp2, tmp3);

-    HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
     dst += (4 * dst_stride);

     LD_SB2(src0_ptr, src_stride, src7, src8);
@@ -5344,10 +5362,10 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
     HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                        weight_vec, rnd_vec, offset_vec,
-                       dst4_r, dst5_r, dst4_l, dst5_l);
+                       tmp4, tmp5);

-    HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst2_r);
-    ST8x2_UB(dst2_r, dst, dst_stride);
+    tmp4 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST8x2_UB(tmp4, dst, dst_stride);
 }

 static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
@@ -5484,12 +5502,10 @@ static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
             HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                                in0, in1, in2, in3,
                                weight_vec, rnd_vec, offset_vec,
-                               dst0_r, dst1_r, dst2_r, dst3_r,
-                               dst0_l, dst1_l, dst2_l, dst3_l);
+                               tmp0, tmp1, tmp2, tmp3);

-            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
-                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
-            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
+            PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+            ST8x4_UB(tmp0, tmp1, dst_tmp, dst_stride);
             dst_tmp += (4 * dst_stride);
         }
--
cgit v1.1
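
[Illustrative note, not part of the original patch] The transformation repeated across the hunks above reorders the final clip-and-narrow step: instead of clipping four 32-bit lanes to [0, 255] in two vectors and only then packing (CLIP_SW_0_255 followed by HEVC_PCK_SW_SB*), the new code packs the 32-bit lanes down to one halfword vector first (PCKEV) and clips once afterwards; on MSA the clip can then be a signed max against zero plus an immediate unsigned saturate, which is presumably what the *_MAX_SATU clip macros expand to. The scalar C sketch below models why the two orders produce identical bytes. It assumes, as the vector code does, that the rounded weighted bi-prediction sum fits in the signed 16-bit range, and its function names are invented for the illustration:

#include <stdint.h>
#include <stdio.h>

/* Old order: clamp the 32-bit weighted sum to [0, 255], then narrow.
 * Models CLIP_SW_0_255 on each 32-bit lane followed by packing. */
static uint8_t clip_then_pack(int32_t v)
{
    if (v < 0)
        v = 0;
    if (v > 255)
        v = 255;
    return (uint8_t) v;
}

/* New order: narrow to a halfword first (PCKEV.H keeps the low 16 bits
 * of each 32-bit lane), then clamp.  The max-with-zero step removes
 * negatives and the saturate step caps at 255, so no register has to
 * hold a 255 constant. */
static uint8_t pack_then_clip(int32_t v)
{
    int16_t h = (int16_t) v;   /* assumption: v fits in int16 range */
    if (h < 0)                 /* signed max against 0              */
        h = 0;
    if (h > 255)               /* unsigned saturate to 8 bits       */
        h = 255;
    return (uint8_t) h;
}

int main(void)
{
    /* The two orders agree for every value in the assumed range. */
    for (int32_t v = -32768; v <= 32767; v++)
        if (clip_then_pack(v) != pack_then_clip(v))
            printf("mismatch at %d\n", v);
    return 0;
}

Under that range assumption the reordering halves the number of clip operations per block and frees a vector register, which is plausibly where the improvement in these bi-weighted copy, horizontal and vertical MC paths comes from.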