From 4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1 Mon Sep 17 00:00:00 2001 From: Martin Vignali Date: Sat, 2 Dec 2017 19:09:58 +0100 Subject: avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred --- libavcodec/x86/lossless_videodsp.asm | 63 +++++++++++++++++++++------------ libavcodec/x86/lossless_videodsp_init.c | 3 ++ 2 files changed, 44 insertions(+), 22 deletions(-) (limited to 'libavcodec') diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm index 663bf61..cfa0620 100644 --- a/libavcodec/x86/lossless_videodsp.asm +++ b/libavcodec/x86/lossless_videodsp.asm @@ -114,40 +114,54 @@ MEDIAN_PRED add dstq, wq neg wq %%.loop: + pshufb xm0, xm5 %if %2 mova m1, [srcq+wq] %else movu m1, [srcq+wq] %endif - mova m2, m1 - psllw m1, 8 + psllw m2, m1, 8 paddb m1, m2 - mova m2, m1 - pshufb m1, m3 + pshufb m2, m1, m3 paddb m1, m2 - pshufb m0, m5 - mova m2, m1 - pshufb m1, m4 + pshufb m2, m1, m4 paddb m1, m2 -%if mmsize == 16 - mova m2, m1 - pshufb m1, m6 +%if mmsize >= 16 + pshufb m2, m1, m6 paddb m1, m2 %endif - paddb m0, m1 + paddb xm0, xm1 %if %1 - mova [dstq+wq], m0 + mova [dstq+wq], xm0 %else - movq [dstq+wq], m0 - movhps [dstq+wq+8], m0 + movq [dstq+wq], xm0 + movhps [dstq+wq+8], xm0 +%endif + +%if mmsize == 32 + vextracti128 xm2, m1, 1 ; get second lane of the ymm + pshufb xm0, xm5 ; set alls val to last val of the first lane + paddb xm0, xm2 +;store val +%if %1 + mova [dstq+wq+16], xm0 +%else; + movq [dstq+wq+16], xm0 + movhps [dstq+wq+16+8], xm0 +%endif %endif add wq, mmsize jl %%.loop +%if mmsize == 32 + mov eax, [dstq -1] + and eax, 0xff +%else; mov eax, mmsize-1 sub eax, wd movd m1, eax pshufb m0, m1 movd eax, m0 +%endif RET %endmacro @@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left %macro ADD_LEFT_PRED_UNALIGNED 0 cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left - mova m5, [pb_15] - mova m6, [pb_zzzzzzzz77777777] - mova m4, [pb_zzzz3333zzzzbbbb] - mova m3, [pb_zz11zz55zz99zzdd] - movd m0, leftm - pslldq m0, 15 - test srcq, 15 + mova xm5, [pb_15] + VBROADCASTI128 m6, [pb_zzzzzzzz77777777] + VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb] + VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd] + movd xm0, leftm + pslldq xm0, 15 + test srcq, mmsize - 1 jnz .src_unaligned - test dstq, 15 + test dstq, mmsize - 1 jnz .dst_unaligned ADD_LEFT_LOOP 1, 1 .dst_unaligned: @@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left INIT_XMM ssse3 ADD_LEFT_PRED_UNALIGNED +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +ADD_LEFT_PRED_UNALIGNED +%endif + ;------------------------------------------------------------------------------ ; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w); ;------------------------------------------------------------------------------ diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c index 4f20c1c..beae317 100644 --- a/libavcodec/x86/lossless_videodsp_init.c +++ b/libavcodec/x86/lossless_videodsp_init.c @@ -38,6 +38,8 @@ int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src, ptrdiff_t w, int left); int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src, ptrdiff_t w, int left); +int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src, + ptrdiff_t w, int left); int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc); int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc); @@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c) } if (EXTERNAL_AVX2_FAST(cpu_flags)) { c->add_bytes = ff_add_bytes_avx2; + c->add_left_pred = ff_add_left_pred_unaligned_avx2; } } -- cgit v1.1