1 files changed, 82 insertions, 75 deletions
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index 63e92f7..a671e8f 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -68,11 +68,12 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
 ps_m1p1: dd 1<<31, 0
 
+cextern ps_neg
+
 %assign i 16
-%rep 13
+%rep 14
 cextern cos_ %+ i
 %assign i i<<1
 %endrep
@@ -198,7 +199,7 @@ SECTION .text
     vextractf128  %4 %+ H(%5), %3, 0
     vextractf128   %4(%5 + 1), %2, 1
     vextractf128  %4 %+ H(%5 + 1), %3, 1
-%elif cpuflag(sse)
+%elif cpuflag(sse) || cpuflag(3dnow)
     mova     %3, %2
     unpcklps %2, %1
     unpckhps %3, %1
@@ -321,6 +322,7 @@ IF%1 mova  Z(1), m5
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 align 16
 fft8_avx:
     mova      m0, Z(0)
@@ -410,6 +412,8 @@ fft32_interleave_avx:
     jg .deint_loop
     ret
 
+%endif
+
 INIT_XMM sse
 
 align 16
@@ -553,6 +557,7 @@ DEFINE_ARGS zc, w, n, o1, o3
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 DECL_PASS pass_avx, PASS_BIG 1
 DECL_PASS pass_interleave_avx, PASS_BIG 0
 
@@ -563,6 +568,7 @@ cglobal fft_calc, 2,5,8
     FFT_DISPATCH _interleave %+ SUFFIX, r1
     REP_RET
 
+%endif
 
 INIT_XMM sse
 
@@ -650,6 +656,68 @@ cglobal fft_permute, 2,7,1
     jl      .loopcopy
     REP_RET
 
+%macro IMDCT_CALC_FUNC 0
+cglobal imdct_calc, 3,5,3
+    mov     r3d, [r0 + FFTContext.mdctsize]
+    mov     r4,  [r0 + FFTContext.imdcthalf]
+    add     r1,  r3
+    PUSH    r3
+    PUSH    r1
+%if ARCH_X86_32
+    push    r2
+    push    r1
+    push    r0
+%else
+    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
+%endif
+    call    r4
+%if ARCH_X86_32
+    add     esp, 12
+%else
+    add     rsp, 8+32*WIN64
+%endif
+    POP     r1
+    POP     r3
+    lea     r0, [r1 + 2*r3]
+    mov     r2, r3
+    sub     r3, mmsize
+    neg     r2
+    mova    m2, [ps_neg]
+.loop:
+%if mmsize == 8
+    PSWAPD  m0, [r1 + r3]
+    PSWAPD  m1, [r0 + r2]
+    pxor    m0, m2
+%else
+    mova    m0, [r1 + r3]
+    mova    m1, [r0 + r2]
+    shufps  m0, m0, 0x1b
+    shufps  m1, m1, 0x1b
+    xorps   m0, m2
+%endif
+    mova [r0 + r3], m1
+    mova [r1 + r2], m0
+    sub     r3, mmsize
+    add     r2, mmsize
+    jl      .loop
+%if cpuflag(3dnow)
+    femms
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+IMDCT_CALC_FUNC
+INIT_MMX 3dnowext
+IMDCT_CALC_FUNC
+%endif
+
+INIT_XMM sse
+IMDCT_CALC_FUNC
+
 %if ARCH_X86_32
 INIT_MMX 3dnow
 %define mulps pfmul
@@ -684,7 +752,7 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
 %endif
 
 %assign n 1<<%1
-%rep 17-%1
+%rep 18-%1
 %assign n2 n/2
 %assign n4 n/4
 %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
@@ -709,9 +777,11 @@ align 8
 dispatch_tab %+ fullsuffix: pointer list_of_fft
 %endmacro ; DECL_FFT
 
+%if HAVE_AVX_EXTERNAL
 INIT_YMM avx
 DECL_FFT 6
 DECL_FFT 6, _interleave
+%endif
 INIT_XMM sse
 DECL_FFT 5
 DECL_FFT 5, _interleave
@@ -724,70 +794,6 @@ DECL_FFT 4
 DECL_FFT 4, _interleave
 %endif
 
-%if CONFIG_MDCT
-
-%macro IMDCT_CALC_FUNC 0
-cglobal imdct_calc, 3,5,3
-    mov     r3d, [r0 + FFTContext.mdctsize]
-    mov     r4,  [r0 + FFTContext.imdcthalf]
-    add     r1,  r3
-    PUSH    r3
-    PUSH    r1
-%if ARCH_X86_32
-    push    r2
-    push    r1
-    push    r0
-%else
-    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
-%endif
-    call    r4
-%if ARCH_X86_32
-    add     esp, 12
-%else
-    add     rsp, 8+32*WIN64
-%endif
-    POP     r1
-    POP     r3
-    lea     r0, [r1 + 2*r3]
-    mov     r2, r3
-    sub     r3, mmsize
-    neg     r2
-    mova    m2, [ps_m1m1m1m1]
-.loop:
-%if mmsize == 8
-    PSWAPD  m0, [r1 + r3]
-    PSWAPD  m1, [r0 + r2]
-    pxor    m0, m2
-%else
-    mova    m0, [r1 + r3]
-    mova    m1, [r0 + r2]
-    shufps  m0, m0, 0x1b
-    shufps  m1, m1, 0x1b
-    xorps   m0, m2
-%endif
-    mova [r0 + r3], m1
-    mova [r1 + r2], m0
-    sub     r3, mmsize
-    add     r2, mmsize
-    jl      .loop
-%if cpuflag(3dnow)
-    femms
-    RET
-%else
-    REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
 INIT_XMM sse
 %undef mulps
 %undef addps
@@ -985,7 +991,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     sub   r4, r3
 %endif
 %if notcpuflag(3dnowext) && mmsize == 8
-    movd  m7, [ps_m1m1m1m1]
+    movd  m7, [ps_neg]
 %endif
 .pre:
 %if ARCH_X86_64 == 0
@@ -1073,6 +1079,7 @@ DECL_IMDCT
 %endif
 
 INIT_YMM avx
-DECL_IMDCT
 
-%endif ; CONFIG_MDCT
+%if HAVE_AVX_EXTERNAL
+DECL_IMDCT
+%endif