summary | refs | log | tree | commit | diff | stats
path: root/libavcodec
diff options
context:
space:
mode:
author James Almer <jamrial@gmail.com> 2015-12-06 02:42:34 -0300
committer James Almer <jamrial@gmail.com> 2015-12-20 16:42:36 -0300
commit 34b2bd03cf94e26e0352b4edd5f9ca86c5e95b0b (patch)
tree 35ca715e4ff0feb80ea17b64cf4e244c677a9860 /libavcodec
parent 9e569abe995dc41a3e8fb30fb6d9c4d002b8ba4d (diff)
download ffmpeg-streaming-34b2bd03cf94e26e0352b4edd5f9ca86c5e95b0b.zip
ffmpeg-streaming-34b2bd03cf94e26e0352b4edd5f9ca86c5e95b0b.tar.gz
x86/hevc_sao: simplify sao_band_filter 10/12bit
Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec')
-rw-r--r-- libavcodec/x86/hevc_sao_10bit.asm 142
1 files changed, 57 insertions, 85 deletions
diff --git a/libavcodec/x86/hevc_sao_10bit.asm b/libavcodec/x86/hevc_sao_10bit.asm
index f45fc56..3a7048a 100644
--- a/libavcodec/x86/hevc_sao_10bit.asm
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@@ -83,7 +83,6 @@ SECTION .text
mova [rsp+mmsize*6], m6
mova m1, [pw_mask %+ %1]
pxor m0, m0
- %assign MMSIZE mmsize
%define m14 m0
%define m13 m1
%define m9 m2
@@ -93,37 +92,6 @@ DEFINE_ARGS dst, src, dststride, srcstride, offset, height
mov heightd, r7m
%endmacro
-%macro HEVC_SAO_BAND_FILTER_COMPUTE 3
- psraw %2, %3, %1-5
-%if ARCH_X86_64
- pcmpeqw m10, %2, m0
- pcmpeqw m11, %2, m1
- pcmpeqw m12, %2, m2
- pcmpeqw %2, m3
- pand m10, m4
- pand m11, m5
- pand m12, m6
- pand %2, m7
- por m10, m11
- por m12, %2
- por m10, m12
- paddw %3, m10
-%else ; ARCH_X86_32
- pcmpeqw m4, %2, [rsp+MMSIZE*0]
- pcmpeqw m5, %2, [rsp+MMSIZE*1]
- pcmpeqw m6, %2, [rsp+MMSIZE*2]
- pcmpeqw %2, [rsp+MMSIZE*3]
- pand m4, [rsp+MMSIZE*4]
- pand m5, [rsp+MMSIZE*5]
- pand m6, [rsp+MMSIZE*6]
- pand %2, m7
- por m4, m5
- por m6, %2
- por m4, m6
- paddw %3, m4
-%endif ; ARCH
-%endmacro
-
;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
; int16_t *sao_offset_val, int sao_left_class, int width, int height);
%macro HEVC_SAO_BAND_FILTER 3
@@ -132,43 +100,47 @@ cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, ds
align 16
.loop:
-%if %2 == 8
- movu m8, [srcq]
- HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
- CLIPW m8, m14, m13
- movu [dstq], m8
-%endif
%assign i 0
+%assign j 0
%rep %3
- mova m8, [srcq + i]
- HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
- CLIPW m8, m14, m13
- mova [dstq + i], m8
-
- mova m9, [srcq + i + mmsize]
- HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
- CLIPW m9, m14, m13
- mova [dstq + i + mmsize], m9
-%assign i i+mmsize*2
+%assign k 8+(j&1)
+%assign l 9-(j&1)
+ mova m %+ k, [srcq + i]
+ psraw m %+ l, m %+ k, %1-5
+%if ARCH_X86_64
+ pcmpeqw m10, m %+ l, m0
+ pcmpeqw m11, m %+ l, m1
+ pcmpeqw m12, m %+ l, m2
+ pcmpeqw m %+ l, m3
+ pand m10, m4
+ pand m11, m5
+ pand m12, m6
+ pand m %+ l, m7
+ por m10, m11
+ por m12, m %+ l
+ por m10, m12
+ paddw m %+ k, m10
+%else ; ARCH_X86_32
+ pcmpeqw m4, m %+ l, [rsp+mmsize*0]
+ pcmpeqw m5, m %+ l, [rsp+mmsize*1]
+ pcmpeqw m6, m %+ l, [rsp+mmsize*2]
+ pcmpeqw m %+ l, [rsp+mmsize*3]
+ pand m4, [rsp+mmsize*4]
+ pand m5, [rsp+mmsize*5]
+ pand m6, [rsp+mmsize*6]
+ pand m %+ l, m7
+ por m4, m5
+ por m6, m %+ l
+ por m4, m6
+ paddw m %+ k, m4
+%endif ; ARCH
+ CLIPW m %+ k, m14, m13
+ mova [dstq + i], m %+ k
+%assign i i+mmsize
+%assign j j+1
%endrep
-%if %2 == 48
-INIT_XMM cpuname
- mova m8, [srcq + i]
- HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
- CLIPW m8, m14, m13
- mova [dstq + i], m8
-
- mova m9, [srcq + i + mmsize]
- HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
- CLIPW m9, m14, m13
- mova [dstq + i + mmsize], m9
-%if cpuflag(avx2)
-INIT_YMM cpuname
-%endif
-%endif ; %1 == 48
-
add dstq, dststrideq
add srcq, srcstrideq
dec heightd
@@ -177,17 +149,17 @@ INIT_YMM cpuname
%endmacro
%macro HEVC_SAO_BAND_FILTER_FUNCS 0
-HEVC_SAO_BAND_FILTER 10, 8, 0
-HEVC_SAO_BAND_FILTER 10, 16, 1
-HEVC_SAO_BAND_FILTER 10, 32, 2
-HEVC_SAO_BAND_FILTER 10, 48, 2
-HEVC_SAO_BAND_FILTER 10, 64, 4
-
-HEVC_SAO_BAND_FILTER 12, 8, 0
-HEVC_SAO_BAND_FILTER 12, 16, 1
-HEVC_SAO_BAND_FILTER 12, 32, 2
-HEVC_SAO_BAND_FILTER 12, 48, 2
-HEVC_SAO_BAND_FILTER 12, 64, 4
+HEVC_SAO_BAND_FILTER 10, 8, 1
+HEVC_SAO_BAND_FILTER 10, 16, 2
+HEVC_SAO_BAND_FILTER 10, 32, 4
+HEVC_SAO_BAND_FILTER 10, 48, 6
+HEVC_SAO_BAND_FILTER 10, 64, 8
+
+HEVC_SAO_BAND_FILTER 12, 8, 1
+HEVC_SAO_BAND_FILTER 12, 16, 2
+HEVC_SAO_BAND_FILTER 12, 32, 4
+HEVC_SAO_BAND_FILTER 12, 48, 6
+HEVC_SAO_BAND_FILTER 12, 64, 8
%endmacro
INIT_XMM sse2
@@ -197,20 +169,20 @@ HEVC_SAO_BAND_FILTER_FUNCS
%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
-HEVC_SAO_BAND_FILTER 10, 8, 0
-HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 8, 1
INIT_YMM avx2
-HEVC_SAO_BAND_FILTER 10, 32, 1
-HEVC_SAO_BAND_FILTER 10, 48, 1
-HEVC_SAO_BAND_FILTER 10, 64, 2
+HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 32, 2
+HEVC_SAO_BAND_FILTER 10, 48, 3
+HEVC_SAO_BAND_FILTER 10, 64, 4
INIT_XMM avx2
-HEVC_SAO_BAND_FILTER 12, 8, 0
-HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 8, 1
INIT_YMM avx2
-HEVC_SAO_BAND_FILTER 12, 32, 1
-HEVC_SAO_BAND_FILTER 12, 48, 1
-HEVC_SAO_BAND_FILTER 12, 64, 2
+HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 32, 2
+HEVC_SAO_BAND_FILTER 12, 48, 3
+HEVC_SAO_BAND_FILTER 12, 64, 4
%endif
;******************************************************************************
OpenPOWER on IntegriCloud