Diffstat (limited to 'libavcodec/x86/dcadsp.asm')
-rw-r--r-- | libavcodec/x86/dcadsp.asm | 518
1 file changed, 232 insertions, 286 deletions
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 89d4ac4..c5bf21a 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -1,336 +1,282 @@
 ;******************************************************************************
-;* SSE-optimized functions for the DCA decoder
-;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-pf_inv16:  times 4 dd 0x3D800000 ; 1/16
-
 SECTION .text
 
-; %1=v0/v1  %2=in1  %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va   m1
-%define vb   m2
-%if %1
-%define OFFSET  0
-%else
-%define OFFSET  NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
-    mova     va, [cf0q + OFFSET]
-    mova     vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
-    mova     m4, [cf0q + OFFSET + mmsize]
-    mova     m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
-    mulps    va, %2
-    mulps    vb, %2
-%if %0 == 3
-    mulps    m4, %3
-    mulps    m0, %3
-    addps    va, m4
-    addps    vb, m0
-%endif
-    ; va = va1 va2 va3 va4
-    ; vb = vb1 vb2 vb3 vb4
-%if %1
-    SWAP     va, vb
-%endif
-    mova     m4, va
-    unpcklps va, vb ; va3 vb3 va4 vb4
-    unpckhps m4, vb ; va1 vb1 va2 vb2
-    addps    m4, va ; va1+3 vb1+3 va2+4 vb2+4
-    movhlps  vb, m4 ; va1+3 vb1+3
-    addps    vb, m4 ; va0..4 vb0..4
-    movlps   [outq + count], vb
-%if %1
-    sub      cf0q, 8*NUM_COEF
-%endif
-    add      count, 8
-    jl .loop%1
-%endmacro
-
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1       m3
-%define IN2       m5
-%define count     inq
-%define NUM_COEF  4*(2-%1)
-%define NUM_OUT   32*(%1+1)
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
 
-    movu     IN1, [inq + 4 - 1*mmsize]
-    shufps   IN1, IN1, q0123
-%if %1 == 0
-    movu     IN2, [inq + 4 - 2*mmsize]
-    shufps   IN2, IN2, q0123
-%endif
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr      nblocksd, 1
+    sub      lfeq, 7*sizeof_float
+    mov      cnt1d, 32*sizeof_float
+    mov      cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    lea      coeffq, [coeffq+cnt1q*8]
+    add      samplesq, cnt1q
+    neg      cnt1q
 
-    mov      count, -4*NUM_OUT
-    add      cf0q, 4*NUM_COEF*NUM_OUT
-    add      outq, 4*NUM_OUT
-    ; compute v0 first
-%if %1 == 0
-    FIR_LOOP 0, IN1, IN2
-%else
-    FIR_LOOP 0, IN1
-%endif
-    shufps   IN1, IN1, q0123
-    mov      count, -4*NUM_OUT
-    ; cf1 already correctly positioned
-    add      outq, 4*NUM_OUT          ; outq now at out2
-    sub      cf0q, 8*NUM_COEF
-%if %1 == 0
-    shufps   IN2, IN2, q0123
-    FIR_LOOP 1, IN2, IN1
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps m4, [lfeq+16]
+    cvtdq2ps m5, [lfeq   ]
+    shufps   m7, m4, m4, q0123
+    shufps   m6, m5, m5, q0123
+%elif cpuflag(sse2)
+    movu     m4, [lfeq+16]
+    movu     m5, [lfeq   ]
+    cvtdq2ps m4, m4
+    cvtdq2ps m5, m5
+    pshufd   m7, m4, q0123
+    pshufd   m6, m5, q0123
 %else
-    FIR_LOOP 1, IN1
+    cvtpi2ps m4, [lfeq+16]
+    cvtpi2ps m0, [lfeq+24]
+    cvtpi2ps m5, [lfeq   ]
+    cvtpi2ps m1, [lfeq+8 ]
+    shufps   m4, m0, q1010
+    shufps   m5, m1, q1010
+    shufps   m7, m4, m4, q0123
+    shufps   m6, m5, m5, q0123
 %endif
-    RET
-%endmacro
 
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
+.inner_loop:
+%if ARCH_X86_64
+    movaps   m8, [coeffq+cnt1q*8    ]
+    movaps   m9, [coeffq+cnt1q*8+16 ]
+    movaps  m10, [coeffq+cnt1q*8+32 ]
+    movaps  m11, [coeffq+cnt1q*8+48 ]
+%if cpuflag(fma3)
+    movaps  m12, [coeffq+cnt1q*8+64 ]
+    movaps  m13, [coeffq+cnt1q*8+80 ]
+    movaps  m14, [coeffq+cnt1q*8+96 ]
+    movaps  m15, [coeffq+cnt1q*8+112]
+    mulps    m0, m7, m8
+    mulps    m1, m7, m10
+    mulps    m2, m7, m12
+    mulps    m3, m7, m14
+    fmaddps  m0, m6, m9, m0
+    fmaddps  m1, m6, m11, m1
+    fmaddps  m2, m6, m13, m2
+    fmaddps  m3, m6, m15, m3
 
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
-    pxor   %1, %1
+    haddps   m0, m1
+    haddps   m2, m3
+    haddps   m0, m2
+    movaps   [samplesq+cnt1q], m0
 %else
-    xorps  %1, %1, %1
-%endif
-%endmacro
+    mulps    m0, m7, m8
+    mulps    m1, m6, m9
+    mulps    m2, m7, m10
+    mulps    m3, m6, m11
+    addps    m0, m1
+    addps    m2, m3
 
-%macro SHUF 3
-%if cpuflag(avx)
-    mova       %3, [%2 - 16]
-    vperm2f128 %1, %3, %3, 1
-    vshufps    %1, %1, %1, q0123
-%elif cpuflag(sse2)
-    pshufd     %1, [%2], q0123
+    unpckhps m3, m0, m2
+    unpcklps m0, m2
+    addps    m3, m0
+    movhlps  m2, m3
+    addps    m2, m3
+    movlps   [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps    m0, m7, [coeffq+cnt1q*8   ]
+    movaps   m1, [coeffq+cnt1q*8+16]
+    mulps    m2, m7, [coeffq+cnt1q*8+32]
+    fmaddps  m0, m6, m1, m0
+    fmaddps  m2, m6, [coeffq+cnt1q*8+48], m2
 %else
-    mova       %1, [%2]
-    shufps     %1, %1, q0123
-%endif
-%endmacro
+    mulps    m0, m7, [coeffq+cnt1q*8   ]
+    mulps    m1, m6, [coeffq+cnt1q*8+16]
+    mulps    m2, m7, [coeffq+cnt1q*8+32]
+    mulps    m3, m6, [coeffq+cnt1q*8+48]
+    addps    m0, m1
+    addps    m2, m3
+%endif
+    unpckhps m3, m0, m2
+    unpcklps m0, m2
+    addps    m3, m0
+    movhlps  m2, m3
+    addps    m2, m3
+    movlps   [samplesq+cnt1q], m2
+%endif; ARCH
 
-%macro INNER_LOOP   1
-    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
-    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
-    ;~ b += window[i + j + 16] * (synth_buf[i + j])
-    SHUF     m5,  ptr2 + j + (15 - 3) * 4, m6
-    mova     m6,  [ptr1 + j]
 %if ARCH_X86_64
-    SHUF     m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
-    mova     m12, [ptr1 + j + mmsize]
-%endif
 %if cpuflag(fma3)
-    fmaddps  m2, m6,  [win + %1 + j + 16 * 4], m2
-    fnmaddps m1, m5,  [win + %1 + j], m1
-%if ARCH_X86_64
-    fmaddps  m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
-    fnmaddps m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
-    mulps    m6,  m6,  [win + %1 + j + 16 * 4]
-    mulps    m5,  m5,  [win + %1 + j]
-%if ARCH_X86_64
-    mulps    m12, m12, [win + %1 + j + mmsize + 16 * 4]
-    mulps    m11, m11, [win + %1 + j + mmsize]
-%endif
-    addps    m2, m2, m6
-    subps    m1, m1, m5
-%if ARCH_X86_64
-    addps    m8, m8, m12
-    subps    m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
-    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
-    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
-    SHUF     m6,  ptr2 + j + (31 - 3) * 4, m5
-    mova     m5,  [ptr1 + j + 16 * 4]
-%if ARCH_X86_64
-    SHUF     m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
-    mova     m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
-%if cpuflag(fma3)
-    fmaddps  m3, m5, [win + %1 + j + 32 * 4], m3
-    fmaddps  m4, m6, [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
-    fmaddps  m9,  m11, [win + %1 + j + mmsize + 32 * 4], m9
-    fmaddps  m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
-    mulps    m5,  m5,  [win + %1 + j + 32 * 4]
-    mulps    m6,  m6,  [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
-    mulps    m11, m11, [win + %1 + j + mmsize + 32 * 4]
-    mulps    m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
-    addps    m3, m3, m5
-    addps    m4, m4, m6
-%if ARCH_X86_64
-    addps    m9,  m9,  m11
-    addps    m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
-    sub      j, 64 * 4
-%endmacro
+    mulps    m8, m5
+    mulps   m10, m5
+    mulps   m12, m5
+    mulps   m14, m5
+    fmaddps  m8, m4, m9, m8
+    fmaddps m10, m4, m11, m10
+    fmaddps m12, m4, m13, m12
+    fmaddps m14, m4, m15, m14
 
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-;                                  const float window[512], float out[32],
-;                                  intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
-                            synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
-    movd     scale, scalem
-    SPLATD   m0
-%else
-    VBROADCASTSS m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ  r4q
+    haddps  m10, m8
+    haddps  m14, m12
+    haddps  m14, m10
+    movaps  [samplesq+cnt2q], m14
 %else
-    SPLATD   xmm0
-%if cpuflag(avx)
-    vinsertf128 m0, m0, xmm0, 1
-%endif
-%define OFFQ  offq
-%endif
-    ; prepare inner counter limit 1
-    mov      r5q, 480
-    sub      r5q, offmp
-    and      r5q, -64
-    shl      r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
-    mov      OFFQ, r5q
-%define i        r5q
-    mov      i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
+    mulps    m8, m5
+    mulps    m9, m4
+    mulps   m10, m5
+    mulps   m11, m4
+    addps    m8, m9
+    addps   m10, m11
+
+    unpckhps m11, m10, m8
+    unpcklps m10, m8
+    addps   m11, m10
+    movhlps  m8, m11
+    addps    m8, m11
+    movlps  [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps    m0, m5, [coeffq+cnt1q*8   ]
+    mulps    m2, m5, [coeffq+cnt1q*8+32]
+    fmaddps  m0, m4, m1, m0
+    fmaddps  m2, m4, [coeffq+cnt1q*8+48], m2
 %else
-%define i 0
-%define OFFQ  r5q
-%endif
+    mulps    m0, m5, [coeffq+cnt1q*8   ]
+    mulps    m1, m4, [coeffq+cnt1q*8+16]
+    mulps    m2, m5, [coeffq+cnt1q*8+32]
+    mulps    m3, m4, [coeffq+cnt1q*8+48]
+    addps    m0, m1
+    addps    m2, m3
+%endif
+    unpckhps m3, m2, m0
+    unpcklps m2, m0
+    addps    m3, m2
+    movhlps  m0, m3
+    addps    m0, m3
+    movlps   [samplesq+cnt2q], m0
+%endif; ARCH
+
+    sub      cnt2d, 8 + FMA3_OFFSET
+    add      cnt1q, 8 + FMA3_OFFSET
+    jl .inner_loop
+
+    add      lfeq, 4
+    add      samplesq, 64*sizeof_float
+    mov      cnt1q, -32*sizeof_float
+    mov      cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    sub      nblocksd, 1
+    jg .loop
+    RET
+%endmacro
 
-%define buf2     synth_buf2q
-%if ARCH_X86_32
-    mov      buf2, synth_buf2mp
-%endif
-.mainloop
-    ; m1 = a  m2 = b  m3 = c  m4 = d
-    SETZERO  m3
-    SETZERO  m4
-    mova     m1, [buf2 + i]
-    mova     m2, [buf2 + i + 16 * 4]
 %if ARCH_X86_32
-%define ptr1     r0q
-%define ptr2     r1q
-%define win      r2q
-%define j        r3q
-    mov      win, windowm
-    mov      ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
-    add      win, i
-    add      ptr1, i
+INIT_XMM sse
+LFE_FIR0_FLOAT
 %endif
-%else ; ARCH_X86_64
-%define ptr1     r6q
-%define ptr2     r7q ; must be loaded
-%define win      r8q
-%define j        r9q
-    SETZERO  m9
-    SETZERO  m10
-    mova     m7, [buf2 + i + mmsize]
-    mova     m8, [buf2 + i + mmsize + 16 * 4]
-    lea      win, [windowq + i]
-    lea      ptr1, [synth_bufq + i]
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
 %endif
-    mov      ptr2, synth_bufmp
-    ; prepare the inner loop counter
-    mov      j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub      ptr2, i
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
 %endif
-.loop1:
-    INNER_LOOP 0
-    jge .loop1
 
-    mov      j, 448 * 4
-    sub      j, OFFQ
-    jz .end
-    sub      ptr1, j
-    sub      ptr2, j
-    add      win, OFFQ ; now at j-64, so define OFFSET
-    sub      j, 64 * 4
-.loop2:
-    INNER_LOOP 64 * 4
-    jge .loop2
+%macro LFE_FIR1_FLOAT 0
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr      nblocksd, 2
+    sub      lfeq, 3*sizeof_float
+    mov      cnt1d, 64*sizeof_float
+    mov      cnt2d, 64*sizeof_float-16
+    lea      coeffq, [coeffq+cnt1q*4]
+    add      samplesq, cnt1q
+    neg      cnt1q
 
-.end:
-%if ARCH_X86_32
-    mov      buf2, synth_buf2m ; needed for next iteration anyway
-    mov      outq, outmp       ; j, which will be set again during it
-%endif
-    ;~ out[i]      = a * scale;
-    ;~ out[i + 16] = b * scale;
-    mulps    m1, m1, scale
-    mulps    m2, m2, scale
-%if ARCH_X86_64
-    mulps    m7, m7, scale
-    mulps    m8, m8, scale
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps m4, [lfeq]
+    shufps   m5, m4, m4, q0123
+%elif cpuflag(sse2)
+    movu     m4, [lfeq]
+    cvtdq2ps m4, m4
+    pshufd   m5, m4, q0123
 %endif
-    ;~ synth_buf2[i]      = c;
-    ;~ synth_buf2[i + 16] = d;
-    mova     [buf2 + i +  0 * 4], m3
-    mova     [buf2 + i + 16 * 4], m4
+
+.inner_loop:
+    movaps   m6, [coeffq+cnt1q*4   ]
+    movaps   m7, [coeffq+cnt1q*4+16]
+    mulps    m0, m5, m6
+    mulps    m1, m5, m7
 %if ARCH_X86_64
-    mova     [buf2 + i +  0 * 4 + mmsize], m9
-    mova     [buf2 + i + 16 * 4 + mmsize], m10
+    movaps   m8, [coeffq+cnt1q*4+32]
+    movaps   m9, [coeffq+cnt1q*4+48]
+    mulps    m2, m5, m8
+    mulps    m3, m5, m9
+%else
+    mulps    m2, m5, [coeffq+cnt1q*4+32]
+    mulps    m3, m5, [coeffq+cnt1q*4+48]
 %endif
-    ;~ out[i]      = a;
-    ;~ out[i + 16] = a;
-    mova     [outq + i +  0 * 4], m1
-    mova     [outq + i + 16 * 4], m2
+
+    haddps   m0, m1
+    haddps   m2, m3
+    haddps   m0, m2
+    movaps   [samplesq+cnt1q], m0
+
+    mulps    m6, m4
+    mulps    m7, m4
 %if ARCH_X86_64
-    mova     [outq + i +  0 * 4 + mmsize], m7
-    mova     [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub      i, (ARCH_X86_64 + 1) * mmsize
-    jge .mainloop
+    mulps    m8, m4
+    mulps    m9, m4
+
+    haddps   m6, m7
+    haddps   m8, m9
+    haddps   m6, m8
+%else
+    mulps    m2, m4, [coeffq+cnt1q*4+32]
+    mulps    m3, m4, [coeffq+cnt1q*4+48]
+
+    haddps   m6, m7
+    haddps   m2, m3
+    haddps   m6, m2
 %endif
+    movaps   [samplesq+cnt2q], m6
+
+    sub      cnt2d, 16
+    add      cnt1q, 16
+    jl .inner_loop
+
+    add      lfeq, sizeof_float
+    add      samplesq, 128*sizeof_float
+    mov      cnt1q, -64*sizeof_float
+    mov      cnt2d, 64*sizeof_float-16
+    sub      nblocksd, 1
+    jg .loop
     RET
 %endmacro
 
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
 %endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
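
For readers who want to check the arithmetic against something runnable, below is a scalar C sketch of the interpolation FIR that LFE_FIR0_FLOAT (8 taps, 64 PCM samples per decimated LFE sample) and LFE_FIR1_FLOAT (4 taps, 128 PCM samples per decimated LFE sample) vectorize. It is modeled on the decoder's C fallback, but the function name, the mirrored 256-entry coefficient layout, and the history convention are assumptions for illustration, not a verbatim copy of libavcodec/dcadsp.c.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the DCA LFE interpolation filter.
 * dec_select = 0 -> 8 taps, 64 outputs per LFE sample (lfe_fir0)
 * dec_select = 1 -> 4 taps, 128 outputs per LFE sample (lfe_fir1)
 * 'lfe' must be preceded by at least ncoeffs - 1 history samples;
 * the asm rewinds lfeq by 7 or 3 floats for the same reason.
 * 'coeff' is assumed to hold 256 floats shared by both output halves. */
static void lfe_fir_ref(float *samples, const int32_t *lfe,
                        const float *coeff, ptrdiff_t npcmblocks,
                        int dec_select)
{
    ptrdiff_t nlfe    = npcmblocks >> (dec_select + 1); /* shr nblocksd, 1 or 2 */
    ptrdiff_t ncoeffs = 8 >> dec_select;
    ptrdiff_t nout    = 32 << dec_select;  /* output pairs per LFE sample */

    for (ptrdiff_t i = 0; i < nlfe; i++) {
        for (ptrdiff_t j = 0; j < nout; j++) {
            float a = 0.0f, b = 0.0f;
            for (ptrdiff_t k = 0; k < ncoeffs; k++) {
                /* forward half reads the table upward (cnt1 in the asm),
                 * backward half reads it mirrored (cnt2 in the asm) */
                a += coeff[      j * ncoeffs + k] * (float)lfe[i - k];
                b += coeff[255 - j * ncoeffs - k] * (float)lfe[i - k];
            }
            samples[       j] = a;
            samples[nout + j] = b;
        }
        samples += 2 * nout;  /* add samplesq, 64 or 128 * sizeof_float */
    }
}

The reversed copies of the LFE samples built with shufps/pshufd ... q0123 turn the lfe[i - k] access pattern into a straight vector dot product, and cnt1/cnt2 walk the forward and mirrored output halves the same way the j and 255 - j indices do here.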
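
The pre-SSE3 paths cannot use haddps, so every store of two results goes through the same five-instruction reduction (unpckhps, unpcklps, addps, movhlps, addps). Here is a self-contained intrinsics sketch of that idiom, with a hypothetical helper name:

#include <xmmintrin.h>

/* Sum the four lanes of 'a' and of 'b' without SSE3's haddps,
 * mirroring the unpck/movhlps sequence in the asm above.
 * Lane 0 of the result is sum(a), lane 1 is sum(b); the asm stores
 * exactly those two lanes with movlps. */
static inline __m128 hsum2_ps_sse1(__m128 a, __m128 b)
{
    __m128 hi = _mm_unpackhi_ps(a, b);  /* a2 b2 a3 b3   (unpckhps) */
    __m128 lo = _mm_unpacklo_ps(a, b);  /* a0 b0 a1 b1   (unpcklps) */
    __m128 s  = _mm_add_ps(hi, lo);     /* a0+a2 b0+b2 a1+a3 b1+b3 */
    __m128 sh = _mm_movehl_ps(s, s);    /* a1+a3 b1+b3 ...  (movhlps) */
    return _mm_add_ps(s, sh);           /* lane0 = sum(a), lane1 = sum(b) */
}

On SSE3-capable builds the reduction collapses to the haddps chains seen in the fma3 branches of lfe_fir0 and throughout lfe_fir1, which is why lfe_fir1 is only instantiated from INIT_XMM sse3 up.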