summary | refs | log | tree | commit | diff | stats
path: root/libavfilter/x86/vf_blend.asm
diff options
context:
space:
mode:
author: Martin Vignali <martin.vignali@gmail.com>, 2018-01-17 20:59:58 +0100
committer: Martin Vignali <martin.vignali@gmail.com>, 2018-01-28 20:21:32 +0100
commit: 3a230ce5fa10b21312236b362df9eeddd99e7ac2 (patch)
tree: c1ed49dc5130afd5d7748518cc862fc4ba292157 /libavfilter/x86/vf_blend.asm
parent: 4d95c6d5d7d8d79b5acafcf526a1b7c1797a1060 (diff)
download: ffmpeg-streaming-3a230ce5fa10b21312236b362df9eeddd99e7ac2.zip
ffmpeg-streaming-3a230ce5fa10b21312236b362df9eeddd99e7ac2.tar.gz
avfilter/x86/vf_blend : add AVX2 version for each func except divide
and optimize average, grainextract, multiply, screen, grain merge
Diffstat (limited to 'libavfilter/x86/vf_blend.asm')
-rw-r--r-- libavfilter/x86/vf_blend.asm | 229
1 files changed, 145 insertions, 84 deletions
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 4916aaf..680e266 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -2,6 +2,8 @@
;* x86-optimized functions for blend filter
;*
;* Copyright (C) 2015 Paul B Mahol
+;* Copyright (C) 2018 Henrik Gramner
+;* Copyright (C) 2018 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
@@ -74,39 +76,36 @@ BLEND_INIT %1, 2
BLEND_END
%endmacro
-INIT_XMM sse2
-BLEND_SIMPLE xor, xor
-BLEND_SIMPLE or, or
-BLEND_SIMPLE and, and
-BLEND_SIMPLE addition, addusb
-BLEND_SIMPLE subtract, subusb
-BLEND_SIMPLE darken, minub
-BLEND_SIMPLE lighten, maxub
-
-BLEND_INIT grainextract, 4
- pxor m2, m2
- mova m3, [pw_128]
+%macro GRAINEXTRACT 0 ; emits the grainextract blend function for whichever INIT_* instruction set is active
+BLEND_INIT grainextract, 6 ; BLEND_INIT is defined outside this hunk; 6 is presumably the vector register count (m0-m5 are used) - confirm
+ pxor m4, m4 ; m4 = 0, interleaved as the high byte when widening bytes to words
+ VBROADCASTI128 m5, [pw_128] ; m5 = pw_128 constant (defined elsewhere; presumably 128 in every word - confirm)
.nextrow:
mov xq, widthq ; restart the column offset for this row
-
.loop:
- movh m0, [topq + xq]
- movh m1, [bottomq + xq]
- punpcklbw m0, m2
- punpcklbw m1, m2
- paddw m0, m3
- psubw m0, m1
- packuswb m0, m0
- movh [dstq + xq], m0
- add xq, mmsize / 2
+ movu m1, [topq + xq] ; full vector of top bytes (unaligned load)
+ movu m3, [bottomq + xq] ; full vector of bottom bytes (unaligned load)
+ punpcklbw m0, m1, m4 ; m0 = low top bytes as words (per 128-bit lane on ymm)
+ punpckhbw m1, m4 ; m1 = high top bytes as words
+ punpcklbw m2, m3, m4 ; m2 = low bottom bytes as words
+ punpckhbw m3, m4 ; m3 = high bottom bytes as words
+
+ paddw m0, m5 ; top + pw_128
+ paddw m1, m5
+ psubw m0, m2 ; (top + pw_128) - bottom
+ psubw m1, m3
+
+ packuswb m0, m1 ; narrow both halves back to bytes with unsigned saturation
+ mova [dstq + xq], m0 ; aligned store; dst alignment is assumed, not checked here
+ add xq, mmsize ; one full vector per pass (the removed code handled mmsize / 2)
jl .loop ; repeat while xq is still negative after the add (widthq is presumably negated by BLEND_INIT - confirm)
BLEND_END
+%endmacro
%macro MULTIPLY 3 ; a, b, pw_1 -- result is left in a; b is overwritten as scratch
pmullw %1, %2 ; xxxxxxxx a * b
paddw %1, %3 ; t = a * b + pw_1
- mova %2, %1
- psrlw %2, 8
+ psrlw %2, %1, 8 ; b = t >> 8 (three-operand form replaces the removed mova + psrlw pair)
paddw %1, %2 ; t += t >> 8
psrlw %1, 8 ; 00xx00xx a * b / 255
%endmacro
@@ -118,92 +117,112 @@ BLEND_END
pxor %1, %4 ; 00xx00xx 255 - x / 255
%endmacro
-BLEND_INIT multiply, 4
- pxor m2, m2
- mova m3, [pw_1]
+%macro BLEND_MULTIPLY 0 ; emits the multiply blend function for whichever INIT_* instruction set is active
+BLEND_INIT multiply, 6 ; BLEND_INIT is defined outside this hunk; 6 is presumably the vector register count (m0-m5 are used) - confirm
+ pxor m4, m4 ; m4 = 0, interleaved as the high byte when widening bytes to words
+ VBROADCASTI128 m5, [pw_1] ; m5 = pw_1 constant, passed to MULTIPLY as its third argument
.nextrow:
mov xq, widthq ; restart the column offset for this row
.loop:
- ; word
- ; |--|
- movh m0, [topq + xq] ; 0000xxxx
- movh m1, [bottomq + xq]
- punpcklbw m0, m2 ; 00xx00xx
- punpcklbw m1, m2
-
- MULTIPLY m0, m1, m3
-
- packuswb m0, m0 ; 0000xxxx
- movh [dstq + xq], m0
- add xq, mmsize / 2
-
+ movu m1, [topq + xq] ; full vector of top bytes (unaligned load)
+ movu m3, [bottomq + xq] ; full vector of bottom bytes (unaligned load)
+ punpcklbw m0, m1, m4 ; m0 = low top bytes as words (per 128-bit lane on ymm)
+ punpckhbw m1, m4 ; m1 = high top bytes as words
+ punpcklbw m2, m3, m4 ; m2 = low bottom bytes as words
+ punpckhbw m3, m4 ; m3 = high bottom bytes as words
+
+ MULTIPLY m0, m2, m5 ; m0 = top * bottom / 255 on the low words; m2 is clobbered
+ MULTIPLY m1, m3, m5 ; m1 = top * bottom / 255 on the high words; m3 is clobbered
+
+ packuswb m0, m1 ; narrow both halves back to bytes with unsigned saturation
+ mova [dstq + xq], m0 ; aligned store; dst alignment is assumed, not checked here
+ add xq, mmsize ; one full vector per pass (the removed code handled mmsize / 2)
jl .loop ; repeat while xq is still negative after the add (widthq is presumably negated by BLEND_INIT - confirm)
BLEND_END
+%endmacro
-BLEND_INIT screen, 5
- pxor m2, m2
- mova m3, [pw_1]
- mova m4, [pw_255]
+%macro BLEND_SCREEN 0 ; emits the screen blend function for whichever INIT_* instruction set is active
+BLEND_INIT screen, 7 ; BLEND_INIT is defined outside this hunk; 7 is presumably the vector register count (m0-m6 are used) - confirm
+ pxor m4, m4 ; m4 = 0, interleaved as the high byte when widening bytes to words
+
+ VBROADCASTI128 m5, [pw_1] ; m5 = pw_1 constant, third argument of SCREEN
+ VBROADCASTI128 m6, [pw_255] ; m6 = pw_255 constant, fourth argument of SCREEN
.nextrow:
mov xq, widthq ; restart the column offset for this row
.loop:
- movh m0, [topq + xq] ; 0000xxxx
- movh m1, [bottomq + xq]
- punpcklbw m0, m2 ; 00xx00xx
- punpcklbw m1, m2
-
- SCREEN m0, m1, m3, m4
-
- packuswb m0, m0 ; 0000xxxx
- movh [dstq + xq], m0
- add xq, mmsize / 2
-
+ movu m1, [topq + xq] ; full vector of top bytes (unaligned load)
+ movu m3, [bottomq + xq] ; full vector of bottom bytes (unaligned load)
+ punpcklbw m0, m1, m4 ; m0 = low top bytes as words (per 128-bit lane on ymm)
+ punpckhbw m1, m4 ; m1 = high top bytes as words
+ punpcklbw m2, m3, m4 ; m2 = low bottom bytes as words
+ punpckhbw m3, m4 ; m3 = high bottom bytes as words
+
+ SCREEN m0, m2, m5, m6 ; screen blend of the low words into m0 (SCREEN body is mostly outside this hunk - see its definition)
+ SCREEN m1, m3, m5, m6 ; screen blend of the high words into m1
+
+ packuswb m0, m1 ; narrow both halves back to bytes with unsigned saturation
+ mova [dstq + xq], m0 ; aligned store; dst alignment is assumed, not checked here
+ add xq, mmsize ; one full vector per pass (the removed code handled mmsize / 2)
jl .loop ; repeat while xq is still negative after the add (widthq is presumably negated by BLEND_INIT - confirm)
BLEND_END
+%endmacro
+%macro AVERAGE 0 ; emits the average blend function for whichever INIT_* instruction set is active
BLEND_INIT average, 3
- pxor m2, m2
+ pcmpeqb m2, m2 ; m2 = all ones (every byte compares equal to itself)
+
.nextrow:
mov xq, widthq ; restart the column offset for this row
- .loop:
- movh m0, [topq + xq]
- movh m1, [bottomq + xq]
- punpcklbw m0, m2
- punpcklbw m1, m2
- paddw m0, m1
- psrlw m0, 1
- packuswb m0, m0
- movh [dstq + xq], m0
- add xq, mmsize / 2
+.loop:
+ movu m0, [topq + xq] ; full vector of top bytes (unaligned load)
+ movu m1, [bottomq + xq] ; full vector of bottom bytes (unaligned load)
+ pxor m0, m2 ; ~top, i.e. 255 - top per byte
+ pxor m1, m2 ; ~bottom
+ pavgb m0, m1 ; rounded-up byte average of the complements
+ pxor m0, m2 ; complement again: yields (top + bottom) >> 1, the truncating result of the removed paddw/psrlw code
+ mova [dstq + xq], m0 ; aligned store; dst alignment is assumed, not checked here
+ add xq, mmsize ; one full vector per pass (the removed code handled mmsize / 2)
jl .loop ; repeat while xq is still negative after the add (widthq is presumably negated by BLEND_INIT - confirm)
BLEND_END
+%endmacro
-BLEND_INIT grainmerge, 4
- pxor m2, m2
- mova m3, [pw_128]
+
+%macro GRAINMERGE 0 ; emits the grainmerge blend function for whichever INIT_* instruction set is active
+BLEND_INIT grainmerge, 6 ; BLEND_INIT is defined outside this hunk; 6 is presumably the vector register count (m0-m5 are used) - confirm
+ pxor m4, m4 ; m4 = 0, interleaved as the high byte when widening bytes to words
+
+ VBROADCASTI128 m5, [pw_128] ; m5 = pw_128 constant (defined elsewhere; presumably 128 in every word - confirm)
.nextrow:
mov xq, widthq ; restart the column offset for this row
.loop:
- movh m0, [topq + xq]
- movh m1, [bottomq + xq]
- punpcklbw m0, m2
- punpcklbw m1, m2
- paddw m0, m1
- psubw m0, m3
- packuswb m0, m0
- movh [dstq + xq], m0
- add xq, mmsize / 2
+ movu m1, [topq + xq] ; full vector of top bytes (unaligned load)
+ movu m3, [bottomq + xq] ; full vector of bottom bytes (unaligned load)
+ punpcklbw m0, m1, m4 ; m0 = low top bytes as words (per 128-bit lane on ymm)
+ punpckhbw m1, m4 ; m1 = high top bytes as words
+ punpcklbw m2, m3, m4 ; m2 = low bottom bytes as words
+ punpckhbw m3, m4 ; m3 = high bottom bytes as words
+
+ paddw m0, m2 ; top + bottom
+ paddw m1, m3
+ psubw m0, m5 ; (top + bottom) - pw_128
+ psubw m1, m5
+
+ packuswb m0, m1 ; narrow both halves back to bytes with unsigned saturation
+ mova [dstq + xq], m0 ; aligned store; dst alignment is assumed, not checked here
+ add xq, mmsize ; one full vector per pass (the removed code handled mmsize / 2)
jl .loop ; repeat while xq is still negative after the add (widthq is presumably negated by BLEND_INIT - confirm)
BLEND_END
+%endmacro
+%macro HARDMIX 0
BLEND_INIT hardmix, 5
- mova m2, [pb_255]
- mova m3, [pb_128]
- mova m4, [pb_127]
+ VBROADCASTI128 m2, [pb_255]
+ VBROADCASTI128 m3, [pb_128]
+ VBROADCASTI128 m4, [pb_127]
.nextrow:
mov xq, widthq
@@ -218,7 +237,9 @@ BLEND_INIT hardmix, 5
add xq, mmsize
jl .loop
BLEND_END
+%endmacro
+%macro DIVIDE 0
BLEND_INIT divide, 4
pxor m2, m2
mova m3, [ps_255]
@@ -247,9 +268,11 @@ BLEND_INIT divide, 4
jl .loop
BLEND_END
+%endmacro
+%macro PHOENIX 0
BLEND_INIT phoenix, 4
- mova m3, [pb_255]
+ VBROADCASTI128 m3, [pb_255]
.nextrow:
mov xq, widthq
@@ -266,6 +289,7 @@ BLEND_INIT phoenix, 4
add xq, mmsize
jl .loop
BLEND_END
+%endmacro
%macro BLEND_ABS 0
BLEND_INIT difference, 5
@@ -291,7 +315,7 @@ BLEND_END
BLEND_INIT extremity, 8
pxor m2, m2
- mova m4, [pw_255]
+ VBROADCASTI128 m4, [pw_255]
.nextrow:
mov xq, widthq
@@ -315,7 +339,7 @@ BLEND_END
BLEND_INIT negation, 8
pxor m2, m2
- mova m4, [pw_255]
+ VBROADCASTI128 m4, [pw_255]
.nextrow:
mov xq, widthq
@@ -341,6 +365,43 @@ BLEND_END
%endmacro
INIT_XMM sse2 ; SSE2 instantiation of every blend macro in this file
+BLEND_SIMPLE xor, xor
+BLEND_SIMPLE or, or
+BLEND_SIMPLE and, and
+BLEND_SIMPLE addition, addusb
+BLEND_SIMPLE subtract, subusb
+BLEND_SIMPLE darken, minub
+BLEND_SIMPLE lighten, maxub
+GRAINEXTRACT
+BLEND_MULTIPLY
+BLEND_SCREEN
+AVERAGE
+GRAINMERGE
+HARDMIX
+PHOENIX
+DIVIDE ; SSE2 only: DIVIDE is not instantiated in the AVX2 section below
+
BLEND_ABS
+
INIT_XMM ssse3 ; only BLEND_ABS is instantiated again for SSSE3
BLEND_ABS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2 ; AVX2 (ymm) instantiation of the same list, minus DIVIDE
+BLEND_SIMPLE xor, xor
+BLEND_SIMPLE or, or
+BLEND_SIMPLE and, and
+BLEND_SIMPLE addition, addusb
+BLEND_SIMPLE subtract, subusb
+BLEND_SIMPLE darken, minub
+BLEND_SIMPLE lighten, maxub
+GRAINEXTRACT
+BLEND_MULTIPLY
+BLEND_SCREEN
+AVERAGE
+GRAINMERGE
+HARDMIX
+PHOENIX
+
+BLEND_ABS
+%endif
OpenPOWER on IntegriCloud