summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author James Almer <jamrial@gmail.com> 2017-05-23 15:19:39 -0300
committer James Almer <jamrial@gmail.com> 2017-06-02 11:06:24 -0300
commit b5a0971ff041badbdd1482e4ae2a0a16700a748f (patch)
tree 13a08aa486992682b02fce7dc7ebcc8d1a372093
parent 3385989b98be7940044e4f0a6b431a0a00abf2fa (diff)
download ffmpeg-streaming-b5a0971ff041badbdd1482e4ae2a0a16700a748f.zip
ffmpeg-streaming-b5a0971ff041badbdd1482e4ae2a0a16700a748f.tar.gz
x86/aacps: add ff_ps_stereo_interpolate_ipdopd_sse3()
About 2x faster than the c version. Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- libavcodec/x86/aacpsdsp.asm 51
-rw-r--r-- libavcodec/x86/aacpsdsp_init.c 4
2 files changed, 55 insertions, 0 deletions
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index e92cbbc..bb8a7f5 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -117,6 +117,57 @@ align 16
.ret:
REP_RET
+;***************************************************************************
+;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+; float h[2][4], float h_step[2][4],
+; int len);
+;***************************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
+ cmp nd, 0 ; len <= 0: nothing to process
+ jle .ret
+ movaps m0, [hq] ; m0 = h00, h01, h02, h03 (first row of h)
+ movaps m1, [hq+mmsize] ; m1 = h10, h11, h12, h13 (second row of h)
+%if ARCH_X86_64
+ movaps m8, [h_stepq] ; x86-64 only: cache both h_step rows in m8/m9
+ movaps m9, [h_stepq+mmsize]
+ %define H_STEP0 m8
+ %define H_STEP1 m9
+%else
+ %define H_STEP0 [h_stepq]
+ %define H_STEP1 [h_stepq+mmsize]
+%endif
+ shl nd, 3 ; n = len * 8: byte size of len (re, im) float pairs
+ add lq, nq ; move l and r to the end of their arrays ...
+ add rq, nq
+ neg nq ; ... and index them with a negative offset that counts up to 0
+
+align 16
+.loop:
+ addps m0, H_STEP0 ; m0 += h_step row 0 (register copy only, h is not written back)
+ addps m1, H_STEP1 ; m1 += h_step row 1
+ movddup m2, [lq+nq] ; m2 = l_re, l_im, l_re, l_im
+ movddup m3, [rq+nq] ; m3 = r_re, r_im, r_re, r_im
+ shufps m4, m2, m2, q2301 ; m4 = l_im, l_re, l_im, l_re
+ shufps m5, m3, m3, q2301 ; m5 = r_im, r_re, r_im, r_re
+ unpcklps m6, m0, m0 ; m6 = h00, h00, h01, h01
+ unpckhps m7, m0, m0 ; m7 = h02, h02, h03, h03
+ mulps m2, m6 ; h00*l_re, h00*l_im, h01*l_re, h01*l_im
+ mulps m3, m7 ; h02*r_re, h02*r_im, h03*r_re, h03*r_im
+ unpcklps m6, m1, m1 ; m6 = h10, h10, h11, h11
+ unpckhps m7, m1, m1 ; m7 = h12, h12, h13, h13
+ mulps m4, m6 ; h10*l_im, h10*l_re, h11*l_im, h11*l_re
+ mulps m5, m7 ; h12*r_im, h12*r_re, h13*r_im, h13*r_re
+ addps m2, m3 ; sum of the two same-component products per lane
+ addsubps m2, m4 ; even (re) lanes -= l cross term, odd (im) lanes += l cross term
+ addsubps m2, m5 ; same for the r cross term; must hit m2 directly, nesting it via m4 flips its sign in re
+ movsd [lq+nq], m2 ; low two floats -> l[n]
+ movhps [rq+nq], m2 ; high two floats -> r[n]
+ add nq, 8 ; next (re, im) pair
+ jl .loop
+.ret:
+ REP_RET
+
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index f6d6c03..767ae65 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -37,6 +37,9 @@ void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
+void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len);
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
@@ -50,6 +53,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
if (EXTERNAL_SSE3(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse3;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
+ s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
}
}
OpenPOWER on IntegriCloud