diff options
author | Shiyou Yin <yinshiyou-hf@loongson.cn> | 2019-07-09 20:43:37 +0800 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2019-07-10 12:54:57 +0200 |
commit | a45e8ade2d2d46fde48ee0567ab18e23dc8c71d1 (patch) | |
tree | 1ca939b0771080f7b9402acfd4c0cd79f5f52035 | |
parent | 24f7a8a1688f88af153de4587de50cbf3084ee7d (diff) | |
download | ffmpeg-streaming-a45e8ade2d2d46fde48ee0567ab18e23dc8c71d1.zip ffmpeg-streaming-a45e8ade2d2d46fde48ee0567ab18e23dc8c71d1.tar.gz |
avutil/mips: optimize UNPCK&SAD macros with MSA2.0 instruction.
Loongson 3A4000 and 2K1000 support MSA2.0.
This patch optimizes the SAD_UB2_UH, UNPCK_R_SH_SW, UNPCK_SB_SH and UNPCK_SH_SW macros with MSA2.0 instructions.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rwxr-xr-x | configure | 5 | ||||
-rw-r--r-- | libavutil/mips/generic_macros_msa.h | 42 |
2 files changed, 44 insertions, 3 deletions
@@ -441,6 +441,7 @@ Optimization options (experts only): --disable-mipsdsp disable MIPS DSP ASE R1 optimizations --disable-mipsdspr2 disable MIPS DSP ASE R2 optimizations --disable-msa disable MSA optimizations + --disable-msa2 disable MSA2 optimizations --disable-mipsfpu disable floating point MIPS optimizations --disable-mmi disable Loongson SIMD optimizations --disable-fast-unaligned consider unaligned accesses slow @@ -1999,6 +2000,7 @@ ARCH_EXT_LIST_MIPS=" mipsdsp mipsdspr2 msa + msa2 " ARCH_EXT_LIST_LOONGSON=" @@ -2527,6 +2529,7 @@ mipsdsp_deps="mips" mipsdspr2_deps="mips" mmi_deps="mips" msa_deps="mipsfpu" +msa2_deps="msa" cpunop_deps="i686" x86_64_select="i686" @@ -5753,6 +5756,7 @@ elif enabled mips; then enabled mipsfpu && enabled msa && check_inline_asm_flags msa '"addvi.b $w0, $w1, 1"' '-mmsa' && check_headers msa.h || disable msa enabled mipsdsp && check_inline_asm_flags mipsdsp '"addu.qb $t0, $t1, $t2"' '-mdsp' enabled mipsdspr2 && check_inline_asm_flags mipsdspr2 '"absq_s.qb $t0, $t1"' '-mdspr2' + enabled msa && enabled msa2 && check_inline_asm_flags msa2 '"nxbits.any.b $w0, $w0"' '-mmsa2' && check_headers msa2.h || disable msa2 if enabled bigendian && enabled msa; then disable msa @@ -7128,6 +7132,7 @@ if enabled mips; then echo "MIPS DSP R1 enabled ${mipsdsp-no}" echo "MIPS DSP R2 enabled ${mipsdspr2-no}" echo "MIPS MSA enabled ${msa-no}" + echo "MIPS MSA2 enabled ${msa2-no}" echo "LOONGSON MMI enabled ${mmi-no}" fi if enabled ppc; then diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h index 6a46704..a377428 100644 --- a/libavutil/mips/generic_macros_msa.h +++ b/libavutil/mips/generic_macros_msa.h @@ -23,6 +23,11 @@ #include <stdint.h> #include <msa.h> +#include <config.h> + +#if HAVE_MSA2 +#include <msa2.h> +#endif #define ALIGNMENT 16 #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1))) @@ -1234,6 +1239,15 @@ unsigned absolute diff values, even-odd pairs are added together to generate 8 
halfword results. */ +#if HAVE_MSA2 +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ +( { \ + v8u16 sad_m = { 0 }; \ + sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0); \ + sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1); \ + sad_m; \ +} ) +#else #define SAD_UB2_UH(in0, in1, ref0, ref1) \ ( { \ v16u8 diff0_m, diff1_m; \ @@ -1247,6 +1261,7 @@ \ sad_m; \ } ) +#endif // #if HAVE_MSA2 /* Description : Insert specified word elements from input vectors to 1 destination vector @@ -2287,6 +2302,12 @@ extracted and interleaved with same vector 'in0' to generate 4 word elements keeping sign intact */ +#if HAVE_MSA2 +#define UNPCK_R_SH_SW(in, out) \ +{ \ + out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \ +} +#else #define UNPCK_R_SH_SW(in, out) \ { \ v8i16 sign_m; \ @@ -2294,6 +2315,7 @@ sign_m = __msa_clti_s_h((v8i16) in, 0); \ out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \ } +#endif // #if HAVE_MSA2 /* Description : Sign extend byte elements from input vector and return halfword results in pair of vectors @@ -2306,6 +2328,13 @@ Then interleaved left with same vector 'in0' to generate 8 signed halfword elements in 'out1' */ +#if HAVE_MSA2 +#define UNPCK_SB_SH(in, out0, out1) \ +{ \ + out0 = (v4i32) __builtin_msa2_w2x_lo_s_b((v16i8) in); \ + out1 = (v4i32) __builtin_msa2_w2x_hi_s_b((v16i8) in); \ +} +#else #define UNPCK_SB_SH(in, out0, out1) \ { \ v16i8 tmp_m; \ @@ -2313,6 +2342,7 @@ tmp_m = __msa_clti_s_b((v16i8) in, 0); \ ILVRL_B2_SH(tmp_m, in, out0, out1); \ } +#endif // #if HAVE_MSA2 /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Inputs - in (1 input unsigned byte vector) @@ -2339,6 +2369,13 @@ Then interleaved left with same vector 'in0' to generate 4 signed word elements in 'out1' */ +#if HAVE_MSA2 +#define UNPCK_SH_SW(in, out0, out1) \ +{ \ + out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \ + out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \ +} +#else #define UNPCK_SH_SW(in, 
out0, out1) \ { \ v8i16 tmp_m; \ @@ -2346,6 +2383,7 @@ tmp_m = __msa_clti_s_h((v8i16) in, 0); \ ILVRL_H2_SW(tmp_m, in, out0, out1); \ } +#endif // #if HAVE_MSA2 /* Description : Swap two variables Arguments : Inputs - in0, in1 @@ -2850,13 +2888,11 @@ */ #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ ( { \ - v8i16 tmp1_m; \ v8i16 out0_m; \ \ out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \ out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \ - tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2); \ - out0_m = __msa_adds_s_h(out0_m, tmp1_m); \ + out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \ \ out0_m; \ } ) |