diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2013-06-18 21:30:43 +0000 |
---|---|---|
committer | Luca Barbato <lu_zero@gentoo.org> | 2013-06-29 13:23:57 +0200 |
commit | b545179fdff1ccfbbb9d422e4e9720cb6c6d9191 (patch) | |
tree | 0476bc87fd03fd512c49103a36ff05681a000e00 | |
parent | 502ab21af0ca68f76d6112722c46d2f35c004053 (diff) | |
download | ffmpeg-streaming-b545179fdff1ccfbbb9d422e4e9720cb6c6d9191.zip ffmpeg-streaming-b545179fdff1ccfbbb9d422e4e9720cb6c6d9191.tar.gz |
x86: lpc: simd av_evaluate_lls
1.5x-1.8x faster on sandybridge
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
-rw-r--r-- | libavutil/x86/lls.asm | 38 | ||||
-rw-r--r-- | libavutil/x86/lls_init.c | 3 |
2 files changed, 41 insertions, 0 deletions
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index 92c00fc..92b7f95 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -194,3 +194,41 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     jle .loop2x1
 .ret:
     REP_RET
+
+
+INIT_XMM sse2
+cglobal evaluate_lls, 3,4,2, ctx, var, order, i ; double f(LLSModel *ctx, double *var, int order); 3 args so x86_32 loads `order` from the stack
+    ; This function is often called on the same buffer as update_lls, but with
+    ; an offset. They can't both be aligned.
+    ; Load halves rather than movu to avoid store-forwarding stalls, since the
+    ; input was initialized immediately prior to this function using scalar math.
+    %define coefsq ctxq                 ; ctx register is reused as the coefficient pointer
+    mov     id, orderd                  ; i = order (unscaled copy, used as the loop bound)
+    imul    orderd, MAX_VARS            ; order *= MAX_VARS (presumably the row stride of coeff, in doubles)
+    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8] ; coefs = address of the selected coefficient row
+    movsd   m0, [varq]                  ; m0 = { var[0], var[1] }, loaded as two halves
+    movhpd  m0, [varq + 8]
+    mulpd   m0, [coefsq]                ; m0 = { var[0]*coefs[0], var[1]*coefs[1] }
+    lea     coefsq, [coefsq + iq*8]     ; move both pointers `order` elements forward so that
+    lea     varq, [varq + iq*8]         ; a negative i counting up to 0 indexes them
+    neg     iq
+    add     iq, 2                       ; i = 2 - order: elements 0 and 1 are already done
+.loop:
+    movsd   m1, [varq + iq*8]           ; m1 = next two var elements
+    movhpd  m1, [varq + iq*8 + 8]
+    mulpd   m1, [coefsq + iq*8]
+    addpd   m0, m1                      ; accumulate two partial sums, one per lane
+    add     iq, 2                       ; flags from this add drive both branches below
+    jl .loop                            ; i < 0: at least two elements remain
+    jg .skip1                           ; i > 0: nothing left; i == 0 falls through to the last element
+    movsd   m1, [varq + iq*8]           ; load of the final element also zeroes the high lane of m1
+    mulsd   m1, [coefsq + iq*8]
+    addpd   m0, m1                      ; high lane of m1 is 0, so only the low sum changes
+.skip1:
+    movhlps m1, m0                      ; low lane of m1 = high lane of m0
+    addsd   m0, m1                      ; m0 low lane = total sum
+%if ARCH_X86_32
+    movsd  r0m, m0                      ; bounce through the arg-0 stack slot:
+    fld    qword r0m                    ; the double result is returned on the x87 stack
+%endif
+    RET
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 8a80f83..888bc54 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,12 +25,15 @@
 void ff_update_lls_sse2(LLSModel *m, double *var);
 void ff_update_lls_avx(LLSModel *m, double *var);
+double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);

 av_cold void ff_init_lls_x86(LLSModel *m)
 {
     int cpu_flags = av_get_cpu_flags();
     if (EXTERNAL_SSE2(cpu_flags)) {
         m->update_lls = ff_update_lls_sse2;
+        if (m->indep_count >= 4)
+            m->evaluate_lls = ff_evaluate_lls_sse2;
     }
     if (EXTERNAL_AVX(cpu_flags)) {
         m->update_lls = ff_update_lls_avx; |