summary | refs | log | tree | commit | diff | stats
path: root/libavcodec/x86/fft_mmx.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/x86/fft_mmx.asm')
-rw-r--r-- libavcodec/x86/fft_mmx.asm 108
1 file changed, 57 insertions, 51 deletions
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 225c666..b60d8b0 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -297,7 +297,7 @@ IF%1 mova Z(1), m5
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
-INIT_YMM
+INIT_YMM avx
%if HAVE_AVX
align 16
@@ -390,7 +390,7 @@ fft32_interleave_avx:
ret
%endif
-INIT_XMM
+INIT_XMM sse
%define movdqa movaps
align 16
@@ -439,11 +439,9 @@ fft16_sse:
ret
-INIT_MMX
-
-%macro FFT48_3DN 1
+%macro FFT48_3DN 0
align 16
-fft4%1:
+fft4 %+ SUFFIX:
T2_3DN m0, m1, Z(0), Z(1)
mova m2, Z(2)
mova m3, Z(3)
@@ -457,7 +455,7 @@ fft4%1:
ret
align 16
-fft8%1:
+fft8 %+ SUFFIX:
T2_3DN m0, m1, Z(0), Z(1)
mova m2, Z(2)
mova m3, Z(3)
@@ -495,7 +493,8 @@ fft8%1:
ret
%endmacro
-FFT48_3DN _3dn2
+INIT_MMX 3dnow2
+FFT48_3DN
%macro pswapd 2
%ifidn %1, %2
@@ -508,7 +507,8 @@ FFT48_3DN _3dn2
%endif
%endmacro
-FFT48_3DN _3dn
+INIT_MMX 3dnow
+FFT48_3DN
%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
@@ -532,7 +532,7 @@ DEFINE_ARGS z, w, n, o1, o3
rep ret
%endmacro
-INIT_YMM
+INIT_YMM avx
%if HAVE_AVX
%macro INTERL_AVX 5
@@ -550,7 +550,7 @@ DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
%endif
-INIT_XMM
+INIT_XMM sse
%macro INTERL_SSE 5
mova %3, %2
@@ -565,16 +565,16 @@ INIT_XMM
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0
-INIT_MMX
+INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
-DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
-DECL_PASS pass_interleave_3dn, PASS_BIG 0
-%define pass_3dn2 pass_3dn
-%define pass_interleave_3dn2 pass_interleave_3dn
+DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
+DECL_PASS pass_interleave_3dnow, PASS_BIG 0
+%define pass_3dnow2 pass_3dnow
+%define pass_interleave_3dnow2 pass_interleave_3dnow
%ifdef PIC
%define SECTION_REL - $$
@@ -592,67 +592,73 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
call r2
%endmacro ; FFT_DISPATCH
-%macro DECL_FFT 2-3 ; nbits, cpu, suffix
-%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
+%macro DECL_FFT 1-2 ; nbits, suffix
+%ifidn %0, 1
+%xdefine fullsuffix SUFFIX
+%else
+%xdefine fullsuffix %2 %+ SUFFIX
+%endif
+%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
-%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
+%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
-%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
+%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif
%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
-%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
+%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
align 16
-fft %+ n %+ %3%2:
- call fft %+ n2 %+ %2
+fft %+ n %+ fullsuffix:
+ call fft %+ n2 %+ SUFFIX
add r0, n*4 - (n&(-2<<%1))
- call fft %+ n4 %+ %2
+ call fft %+ n4 %+ SUFFIX
add r0, n*2 - (n2&(-2<<%1))
- call fft %+ n4 %+ %2
+ call fft %+ n4 %+ SUFFIX
sub r0, n*6 + (n2&(-2<<%1))
lea r1, [cos_ %+ n]
mov r2d, n4/2
- jmp pass%3%2
+ jmp pass %+ fullsuffix
%assign n n*2
%endrep
%undef n
align 8
-dispatch_tab%3%2: pointer list_of_fft
+dispatch_tab %+ fullsuffix: pointer list_of_fft
section .text
; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
-cglobal fft_dispatch%3%2, 2,5,8, z, nbits
- FFT_DISPATCH %3%2, nbits
-%ifidn %2, _avx
+cglobal fft_dispatch%2, 2,5,8, z, nbits
+ FFT_DISPATCH fullsuffix, nbits
+%if mmsize == 32
vzeroupper
%endif
RET
%endmacro ; DECL_FFT
%if HAVE_AVX
-INIT_YMM
-DECL_FFT 6, _avx
-DECL_FFT 6, _avx, _interleave
+INIT_YMM avx
+DECL_FFT 6
+DECL_FFT 6, _interleave
%endif
-INIT_XMM
-DECL_FFT 5, _sse
-DECL_FFT 5, _sse, _interleave
-INIT_MMX
-DECL_FFT 4, _3dn
-DECL_FFT 4, _3dn, _interleave
-DECL_FFT 4, _3dn2
-DECL_FFT 4, _3dn2, _interleave
-
-INIT_XMM
+INIT_XMM sse
+DECL_FFT 5
+DECL_FFT 5, _interleave
+INIT_MMX 3dnow
+DECL_FFT 4
+DECL_FFT 4, _interleave
+INIT_MMX 3dnow2
+DECL_FFT 4
+DECL_FFT 4, _interleave
+
+INIT_XMM sse
%undef mulps
%undef addps
%undef subps
@@ -748,8 +754,8 @@ INIT_XMM
jl .post
%endmacro
-%macro DECL_IMDCT 2
-cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
+%macro DECL_IMDCT 1
+cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos r8
@@ -821,7 +827,7 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
mov r0, r1
mov r1d, [r5+FFTContext.nbits]
- FFT_DISPATCH %1, r1
+ FFT_DISPATCH SUFFIX, r1
mov r0d, [r5+FFTContext.mdctsize]
add r6, r0
@@ -835,20 +841,20 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
neg r0
mov r1, -mmsize
sub r1, r0
- %2 r0, r1, r6, rtcos, rtsin
+ %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
add esp, 12
%endif
-%ifidn avx_enabled, 1
+%if mmsize == 32
vzeroupper
%endif
RET
%endmacro
-DECL_IMDCT _sse, POSROTATESHUF
+DECL_IMDCT POSROTATESHUF
-INIT_YMM
+INIT_YMM avx
%if HAVE_AVX
-DECL_IMDCT _avx, POSROTATESHUF_AVX
+DECL_IMDCT POSROTATESHUF_AVX
%endif
OpenPOWER on IntegriCloud