Diffstat (limited to 'libavcodec/x86/dcadsp.asm')
-rw-r--r--  libavcodec/x86/dcadsp.asm  537
1 file changed, 251 insertions(+), 286 deletions(-)
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fa8d3cb..055361a 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -1,336 +1,301 @@
;******************************************************************************
-;* SSE-optimized functions for the DCA decoder
-;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pf_inv16: times 4 dd 0x3D800000 ; 1/16
-
SECTION .text
-; %1=v0/v1 %2=in1 %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va m1
-%define vb m2
-%if %1
-%define OFFSET 0
-%else
-%define OFFSET NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
- mova va, [cf0q + OFFSET]
- mova vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
- mova m4, [cf0q + OFFSET + mmsize]
- mova m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
- mulps va, %2
- mulps vb, %2
-%if %0 == 3
- mulps m4, %3
- mulps m0, %3
- addps va, m4
- addps vb, m0
-%endif
- ; va = va1 va2 va3 va4
- ; vb = vb1 vb2 vb3 vb4
-%if %1
- SWAP va, vb
-%endif
- mova m4, va
- unpcklps va, vb ; va3 vb3 va4 vb4
- unpckhps m4, vb ; va1 vb1 va2 vb2
- addps m4, va ; va1+3 vb1+3 va2+4 vb2+4
- movhlps vb, m4 ; va1+3 vb1+3
- addps vb, m4 ; va0..4 vb0..4
- movlps [outq + count], vb
-%if %1
- sub cf0q, 8*NUM_COEF
-%endif
- add count, 8
- jl .loop%1
-%endmacro
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3))
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1 m3
-%define IN2 m5
-%define count inq
-%define NUM_COEF 4*(2-%1)
-%define NUM_OUT 32*(%1+1)
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 1
+ sub lfeq, 7*sizeof_float
+ mov cnt1d, 32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ lea coeffq, [coeffq+cnt1q*8]
+ add samplesq, cnt1q
+ neg cnt1q
- movu IN1, [inq + 4 - 1*mmsize]
- shufps IN1, IN1, q0123
-%if %1 == 0
- movu IN2, [inq + 4 - 2*mmsize]
- shufps IN2, IN2, q0123
-%endif
-
- mov count, -4*NUM_OUT
- add cf0q, 4*NUM_COEF*NUM_OUT
- add outq, 4*NUM_OUT
- ; compute v0 first
-%if %1 == 0
- FIR_LOOP 0, IN1, IN2
-%else
- FIR_LOOP 0, IN1
-%endif
- shufps IN1, IN1, q0123
- mov count, -4*NUM_OUT
- ; cf1 already correctly positioned
- add outq, 4*NUM_OUT ; outq now at out2
- sub cf0q, 8*NUM_COEF
-%if %1 == 0
- shufps IN2, IN2, q0123
- FIR_LOOP 1, IN2, IN1
+.loop:
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq+16]
+ cvtdq2ps m5, [lfeq ]
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
+%elif cpuflag(sse2)
+ movu m4, [lfeq+16]
+ movu m5, [lfeq ]
+ cvtdq2ps m4, m4
+ cvtdq2ps m5, m5
+ pshufd m7, m4, q0123
+ pshufd m6, m5, q0123
%else
- FIR_LOOP 1, IN1
+ cvtpi2ps m4, [lfeq+16]
+ cvtpi2ps m0, [lfeq+24]
+ cvtpi2ps m5, [lfeq ]
+ cvtpi2ps m1, [lfeq+8 ]
+ shufps m4, m0, q1010
+ shufps m5, m1, q1010
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
%endif
- RET
-%endmacro
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
+.inner_loop:
+%if ARCH_X86_64
+ movaps m8, [coeffq+cnt1q*8 ]
+ movaps m9, [coeffq+cnt1q*8+16]
+ movaps m10, [coeffq+cnt1q*8+32]
+ movaps m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+ movaps m12, [coeffq+cnt1q*8+64]
+ movaps m13, [coeffq+cnt1q*8+80]
+ movaps m14, [coeffq+cnt1q*8+96]
+ movaps m15, [coeffq+cnt1q*8+112]
+ mulps m0, m7, m8
+ mulps m1, m7, m10
+ mulps m2, m7, m12
+ mulps m3, m7, m14
+ fmaddps m0, m6, m9, m0
+ fmaddps m1, m6, m11, m1
+ fmaddps m2, m6, m13, m2
+ fmaddps m3, m6, m15, m3
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
- pxor %1, %1
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
%else
- xorps %1, %1, %1
-%endif
-%endmacro
+ mulps m0, m7, m8
+ mulps m1, m6, m9
+ mulps m2, m7, m10
+ mulps m3, m6, m11
+ addps m0, m1
+ addps m2, m3
-%macro SHUF 3
-%if cpuflag(avx)
- mova %3, [%2 - 16]
- vperm2f128 %1, %3, %3, 1
- vshufps %1, %1, %1, q0123
-%elif cpuflag(sse2)
- pshufd %1, [%2], q0123
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m7, [coeffq+cnt1q*8+32 ]
+ mulps m2, m7, [coeffq+cnt1q*8+64 ]
+ mulps m3, m7, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m6, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m6, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m6, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m6, [coeffq+cnt1q*8+112], m3
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
%else
- mova %1, [%2]
- shufps %1, %1, q0123
-%endif
-%endmacro
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m6, [coeffq+cnt1q*8+16]
+ mulps m2, m7, [coeffq+cnt1q*8+32]
+ mulps m3, m6, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif
+%endif; ARCH
-%macro INNER_LOOP 1
- ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
- ;~ a += window[i + j] * (-synth_buf[15 - i + j])
- ;~ b += window[i + j + 16] * (synth_buf[i + j])
- SHUF m5, ptr2 + j + (15 - 3) * 4, m6
- mova m6, [ptr1 + j]
-%if ARCH_X86_64
- SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
- mova m12, [ptr1 + j + mmsize]
-%endif
-%if cpuflag(fma3)
- fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
- fnmaddps m1, m5, [win + %1 + j], m1
-%if ARCH_X86_64
- fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
- fnmaddps m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
- mulps m6, m6, [win + %1 + j + 16 * 4]
- mulps m5, m5, [win + %1 + j]
-%if ARCH_X86_64
- mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
- mulps m11, m11, [win + %1 + j + mmsize]
-%endif
- addps m2, m2, m6
- subps m1, m1, m5
-%if ARCH_X86_64
- addps m8, m8, m12
- subps m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
- ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
- ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
- SHUF m6, ptr2 + j + (31 - 3) * 4, m5
- mova m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
- SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
- mova m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
%if cpuflag(fma3)
- fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
- fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
- fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
- fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
- mulps m5, m5, [win + %1 + j + 32 * 4]
- mulps m6, m6, [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
- mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
- mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
- addps m3, m3, m5
- addps m4, m4, m6
-%if ARCH_X86_64
- addps m9, m9, m11
- addps m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
- sub j, 64 * 4
-%endmacro
+ mulps m8, m5
+ mulps m10, m5
+ mulps m12, m5
+ mulps m14, m5
+ fmaddps m8, m4, m9, m8
+ fmaddps m10, m4, m11, m10
+ fmaddps m12, m4, m13, m12
+ fmaddps m14, m4, m15, m14
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-; const float window[512], float out[32],
-; intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
- synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
- movd scale, scalem
- SPLATD m0
-%else
- VBROADCASTSS m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ r4q
+ haddps m10, m8
+ haddps m14, m12
+ haddps m14, m10
+ movaps [samplesq+cnt2q], m14
%else
- SPLATD xmm0
-%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
-%endif
-%define OFFQ offq
-%endif
- ; prepare inner counter limit 1
- mov r5q, 480
- sub r5q, offmp
- and r5q, -64
- shl r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
- mov OFFQ, r5q
-%define i r5q
- mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
+ mulps m8, m5
+ mulps m9, m4
+ mulps m10, m5
+ mulps m11, m4
+ addps m8, m9
+ addps m10, m11
+
+ unpckhps m11, m10, m8
+ unpcklps m10, m8
+ addps m11, m10
+ movhlps m8, m11
+ addps m8, m11
+ movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m5, [coeffq+cnt1q*8+32 ]
+ mulps m2, m5, [coeffq+cnt1q*8+64 ]
+ mulps m3, m5, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m4, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m4, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m4, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m4, [coeffq+cnt1q*8+112], m3
+
+ haddps m1, m0
+ haddps m3, m2
+ haddps m3, m1
+ movaps [samplesq+cnt2q], m3
%else
-%define i 0
-%define OFFQ r5q
-%endif
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m4, [coeffq+cnt1q*8+16]
+ mulps m2, m5, [coeffq+cnt1q*8+32]
+ mulps m3, m4, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+
+ unpckhps m3, m2, m0
+ unpcklps m2, m0
+ addps m3, m2
+ movhlps m0, m3
+ addps m0, m3
+ movlps [samplesq+cnt2q], m0
+%endif
+%endif; ARCH
+
+ sub cnt2d, 8 + FMA3_OFFSET
+ add cnt1q, 8 + FMA3_OFFSET
+ jl .inner_loop
+
+ add lfeq, 4
+ add samplesq, 64*sizeof_float
+ mov cnt1q, -32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ sub nblocksd, 1
+ jg .loop
+ RET
+%endmacro
-%define buf2 synth_buf2q
-%if ARCH_X86_32
- mov buf2, synth_buf2mp
-%endif
-.mainloop:
- ; m1 = a m2 = b m3 = c m4 = d
- SETZERO m3
- SETZERO m4
- mova m1, [buf2 + i]
- mova m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
-%define ptr1 r0q
-%define ptr2 r1q
-%define win r2q
-%define j r3q
- mov win, windowm
- mov ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
- add win, i
- add ptr1, i
+INIT_XMM sse
+LFE_FIR0_FLOAT
%endif
-%else ; ARCH_X86_64
-%define ptr1 r6q
-%define ptr2 r7q ; must be loaded
-%define win r8q
-%define j r9q
- SETZERO m9
- SETZERO m10
- mova m7, [buf2 + i + mmsize]
- mova m8, [buf2 + i + mmsize + 16 * 4]
- lea win, [windowq + i]
- lea ptr1, [synth_bufq + i]
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
%endif
- mov ptr2, synth_bufmp
- ; prepare the inner loop counter
- mov j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
- sub ptr2, i
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
%endif
-.loop1:
- INNER_LOOP 0
- jge .loop1
- mov j, 448 * 4
- sub j, OFFQ
- jz .end
- sub ptr1, j
- sub ptr2, j
- add win, OFFQ ; now at j-64, so define OFFSET
- sub j, 64 * 4
-.loop2:
- INNER_LOOP 64 * 4
- jge .loop2
+%macro LFE_FIR1_FLOAT 0
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 2
+ sub lfeq, 3*sizeof_float
+ mov cnt1d, 64*sizeof_float
+ mov cnt2d, 64*sizeof_float-16
+ lea coeffq, [coeffq+cnt1q*4]
+ add samplesq, cnt1q
+ neg cnt1q
-.end:
-%if ARCH_X86_32
- mov buf2, synth_buf2m ; needed for next iteration anyway
- mov outq, outmp ; j, which will be set again during it
-%endif
- ;~ out[i] = a * scale;
- ;~ out[i + 16] = b * scale;
- mulps m1, m1, scale
- mulps m2, m2, scale
-%if ARCH_X86_64
- mulps m7, m7, scale
- mulps m8, m8, scale
+.loop:
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq]
+ shufps m5, m4, m4, q0123
+%elif cpuflag(sse2)
+ movu m4, [lfeq]
+ cvtdq2ps m4, m4
+ pshufd m5, m4, q0123
%endif
- ;~ synth_buf2[i] = c;
- ;~ synth_buf2[i + 16] = d;
- mova [buf2 + i + 0 * 4], m3
- mova [buf2 + i + 16 * 4], m4
+
+.inner_loop:
+ movaps m6, [coeffq+cnt1q*4 ]
+ movaps m7, [coeffq+cnt1q*4+16]
+ mulps m0, m5, m6
+ mulps m1, m5, m7
%if ARCH_X86_64
- mova [buf2 + i + 0 * 4 + mmsize], m9
- mova [buf2 + i + 16 * 4 + mmsize], m10
+ movaps m8, [coeffq+cnt1q*4+32]
+ movaps m9, [coeffq+cnt1q*4+48]
+ mulps m2, m5, m8
+ mulps m3, m5, m9
+%else
+ mulps m2, m5, [coeffq+cnt1q*4+32]
+ mulps m3, m5, [coeffq+cnt1q*4+48]
%endif
- ;~ out[i] = a;
- ;~ out[i + 16] = a;
- mova [outq + i + 0 * 4], m1
- mova [outq + i + 16 * 4], m2
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
+
+ mulps m6, m4
+ mulps m7, m4
%if ARCH_X86_64
- mova [outq + i + 0 * 4 + mmsize], m7
- mova [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
- sub i, (ARCH_X86_64 + 1) * mmsize
- jge .mainloop
+ mulps m8, m4
+ mulps m9, m4
+
+ haddps m6, m7
+ haddps m8, m9
+ haddps m6, m8
+%else
+ mulps m2, m4, [coeffq+cnt1q*4+32]
+ mulps m3, m4, [coeffq+cnt1q*4+48]
+
+ haddps m6, m7
+ haddps m2, m3
+ haddps m6, m2
%endif
+ movaps [samplesq+cnt2q], m6
+
+ sub cnt2d, 16
+ add cnt1q, 16
+ jl .inner_loop
+
+ add lfeq, sizeof_float
+ add samplesq, 128*sizeof_float
+ mov cnt1q, -64*sizeof_float
+ mov cnt2d, 64*sizeof_float-16
+ sub nblocksd, 1
+ jg .loop
RET
%endmacro
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
%endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
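
For orientation, below is a plain-C scalar sketch of the kind of LFE FIR interpolation that the new lfe_fir0_float kernel above vectorizes. It is an illustrative reference only, not the FFmpeg C implementation: the function name, argument names and exact coefficient indexing are assumptions inferred from the asm (8 history samples read starting at lfeq - 7*sizeof_float, 32 inner iterations per LFE sample, and a mirrored second half of the output written through the cnt2 counter).

/* Illustrative scalar reference; names and indexing are assumptions, not the
 * FFmpeg source.  Each decimated 32-bit LFE sample expands into 64 float PCM
 * samples: 32 forward dot products of an 8-sample history against the
 * coefficient table, plus 32 against the mirrored coefficients (the second
 * store the asm performs via cnt2). */
#include <stdint.h>
#include <stddef.h>

static void lfe_fir0_float_ref(float *pcm, const int32_t *lfe,
                               const float *coeff, ptrdiff_t npcmblocks)
{
    ptrdiff_t nlfesamples = npcmblocks >> 1;  /* one LFE sample -> 64 PCM samples */

    for (ptrdiff_t i = 0; i < nlfesamples; i++) {
        for (int j = 0; j < 32; j++) {
            float a = 0.0f, b = 0.0f;
            for (int k = 0; k < 8; k++) {
                /* 8-tap history: the current sample and the 7 preceding ones,
                 * corresponding to the cvtdq2ps loads from [lfeq] and [lfeq+16] */
                a += coeff[      j * 8 + k] * (float)lfe[-k];
                b += coeff[255 - j * 8 - k] * (float)lfe[-k];  /* mirrored half */
            }
            pcm[j]      = a;
            pcm[32 + j] = b;
        }
        lfe++;       /* consume one decimated LFE sample */
        pcm += 64;   /* one interpolated 64-sample block produced */
    }
}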