summaryrefslogtreecommitdiffstats
path: root/libavcodec
diff options
context:
space:
mode:
authorMartin Vignali <martin.vignali@gmail.com>2017-12-02 19:09:58 +0100
committerMartin Vignali <martin.vignali@gmail.com>2017-12-09 15:16:03 +0100
commit4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1 (patch)
tree5107ae8a627ffa2b6d7f496f5115c481c289f369 /libavcodec
parentcfbcea1cca7f4d5b92a17778f78427794057eb29 (diff)
downloadffmpeg-streaming-4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1.zip
ffmpeg-streaming-4353c3506742c9fecce4cf9f68cc6a7ab7ea05b1.tar.gz
avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred
Diffstat (limited to 'libavcodec')
-rw-r--r--libavcodec/x86/lossless_videodsp.asm63
-rw-r--r--libavcodec/x86/lossless_videodsp_init.c3
2 files changed, 44 insertions, 22 deletions
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 663bf61..cfa0620 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -114,40 +114,54 @@ MEDIAN_PRED
add dstq, wq
neg wq
%%.loop:
+ pshufb xm0, xm5
%if %2
mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
- mova m2, m1
- psllw m1, 8
+ psllw m2, m1, 8
paddb m1, m2
- mova m2, m1
- pshufb m1, m3
+ pshufb m2, m1, m3
paddb m1, m2
- pshufb m0, m5
- mova m2, m1
- pshufb m1, m4
+ pshufb m2, m1, m4
paddb m1, m2
-%if mmsize == 16
- mova m2, m1
- pshufb m1, m6
+%if mmsize >= 16
+ pshufb m2, m1, m6
paddb m1, m2
%endif
- paddb m0, m1
+ paddb xm0, xm1
%if %1
- mova [dstq+wq], m0
+ mova [dstq+wq], xm0
%else
- movq [dstq+wq], m0
- movhps [dstq+wq+8], m0
+ movq [dstq+wq], xm0
+ movhps [dstq+wq+8], xm0
+%endif
+
+%if mmsize == 32
+ vextracti128 xm2, m1, 1 ; get second lane of the ymm
+ pshufb xm0, xm5 ; set alls val to last val of the first lane
+ paddb xm0, xm2
+;store val
+%if %1
+ mova [dstq+wq+16], xm0
+%else;
+ movq [dstq+wq+16], xm0
+ movhps [dstq+wq+16+8], xm0
+%endif
%endif
add wq, mmsize
jl %%.loop
+%if mmsize == 32
+ mov eax, [dstq -1]
+ and eax, 0xff
+%else;
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
+%endif
RET
%endmacro
@@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left
%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
- mova m5, [pb_15]
- mova m6, [pb_zzzzzzzz77777777]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- pslldq m0, 15
- test srcq, 15
+ mova xm5, [pb_15]
+ VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
+ VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
+ VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
+ movd xm0, leftm
+ pslldq xm0, 15
+ test srcq, mmsize - 1
jnz .src_unaligned
- test dstq, 15
+ test dstq, mmsize - 1
jnz .dst_unaligned
ADD_LEFT_LOOP 1, 1
.dst_unaligned:
@@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_LEFT_PRED_UNALIGNED
+%endif
+
;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 4f20c1c..beae317 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -38,6 +38,8 @@ int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
+int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t w, int left);
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
@@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_bytes = ff_add_bytes_avx2;
+ c->add_left_pred = ff_add_left_pred_unaligned_avx2;
}
}
OpenPOWER on IntegriCloud