diff options
author | Ben Avison <bavison@riscosopen.org> | 2014-07-11 00:12:31 +0100 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2014-07-18 01:34:08 +0300 |
commit | 5c22e8e4ad0852d61d5c4ba8d67d33fd72339497 (patch) | |
tree | 0203e0320cec765670ccd3388987963262c6272d | |
parent | 2d60444331fca1910510038dd3817bea885c2367 (diff) | |
download | ffmpeg-streaming-5c22e8e4ad0852d61d5c4ba8d67d33fd72339497.zip ffmpeg-streaming-5c22e8e4ad0852d61d5c4ba8d67d33fd72339497.tar.gz |
armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)
The previous implementation targeted DTS Coherent Acoustics, which only
requires mdct_bits == 6. This relatively small size lent itself to
unrolling the loops a small number of times, and encoding offsets
calculated at assembly time within the load/store instructions of each
iteration.
In the more general case (codecs such as AAC and AC3) much larger arrays
are used - mdct_bits is one of 8, 9 or 11. The old method does not scale to
these cases, so the new code path uses more integer registers with non-unrolled
versions of the loops (and with some stack spillage). The postrotation filter loop
is still unrolled by a factor of 2 to permit the double-buffering of some
VFP registers to facilitate overlap of neighbouring iterations.
I benchmarked the result by measuring the number of gperftools samples
that hit anywhere in the AAC decoder (starting from aac_decode_frame())
or specifically in ff_imdct_half_c / ff_imdct_half_vfp, for the same
example AAC stream:
                    |      Before      |      After       |            |
                    |  Mean    StdDev  |  Mean    StdDev  | Confidence | Change
aac_decode_frame    | 2368.1    35.8   | 2117.2    35.3   |   100.0%   | +11.8%
ff_imdct_half_*     |  457.5    22.4   |  251.2    16.2   |   100.0%   | +82.1%
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- | libavcodec/arm/mdct_vfp.S | 146 |
1 files changed, 144 insertions, 2 deletions
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S index 94db24f..f3fe668 100644 --- a/libavcodec/arm/mdct_vfp.S +++ b/libavcodec/arm/mdct_vfp.S @@ -33,6 +33,11 @@ J0 .req a2 J1 .req a4 J2 .req ip J3 .req lr +REVTAB_HI .req v5 +IN_HI .req v6 +OUT_HI .req v6 +TCOS_HI .req sl +TSIN_HI .req fp .macro prerotation_innerloop .set trig_lo, k @@ -76,6 +81,43 @@ J3 .req lr .set k, k + 2 .endm +.macro prerotation_innerloop_rolled + vldmia TCOS!, {s16,s17} + vldmdb TCOS_HI!, {s18,s19} + vldr s0, [IN_HI, #-4] + vldr s1, [IN_HI, #-12] + vldr s2, [IN, #12] + vldr s3, [IN, #4] + vmul.f s8, s0, s16 @ vector operation + vldmia TSIN!, {s20,s21} + vldmdb TSIN_HI!, {s22,s23} + vldr s4, [IN] + vldr s5, [IN, #8] + vldr s6, [IN_HI, #-16] + vldr s7, [IN_HI, #-8] + vmul.f s12, s0, s20 @ vector operation + add IN, IN, #16 + sub IN_HI, IN_HI, #16 + ldrh J0, [REVTAB], #2 + ldrh J1, [REVTAB], #2 + vmls.f s8, s4, s20 @ vector operation + ldrh J3, [REVTAB_HI, #-2]! + ldrh J2, [REVTAB_HI, #-2]! + add J0, OUT, J0, lsl #3 + vmla.f s12, s4, s16 @ vector operation + add J1, OUT, J1, lsl #3 + add J2, OUT, J2, lsl #3 + add J3, OUT, J3, lsl #3 + vstr s8, [J0] + vstr s9, [J1] + vstr s10, [J2] + vstr s11, [J3] + vstr s12, [J0, #4] + vstr s13, [J1, #4] + vstr s14, [J2, #4] + vstr s15, [J3, #4] +.endm + .macro postrotation_innerloop tail, head .set trig_lo_head, n8 - k - 2 .set trig_hi_head, n8 + k @@ -142,6 +184,49 @@ J3 .req lr .endif .endm +.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail + .ifnc "\tail","" + vmls.f s8, s0, \tcos_s0_tail @ vector operation + .endif + .ifnc "\head","" + vldmia TSIN!, {s16,s17} + vldmdb TSIN_HI!, {s18,s19} + vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head} + .endif + .ifnc "\tail","" + vmla.f s12, s4, \tcos_s0_tail @ vector operation + .endif + .ifnc "\head","" + vldr s0, [OUT, #+\out_offset_head+0] + vldr s1, [OUT, #+\out_offset_head+8] + vldr s2, [OUT_HI, 
#-\out_offset_head-16] + vldr s3, [OUT_HI, #-\out_offset_head-8] + vldr s4, [OUT, #+\out_offset_head+4] + vldr s5, [OUT, #+\out_offset_head+12] + vldr s6, [OUT_HI, #-\out_offset_head-12] + vldr s7, [OUT_HI, #-\out_offset_head-4] + .endif + .ifnc "\tail","" + vstr s8, [OUT, #+\out_offset_tail+0] + vstr s9, [OUT, #+\out_offset_tail+8] + vstr s10, [OUT_HI, #-\out_offset_tail-16] + vstr s11, [OUT_HI, #-\out_offset_tail-8] + .endif + .ifnc "\head","" + vmul.f s8, s4, s16 @ vector operation + .endif + .ifnc "\tail","" + vstr s12, [OUT_HI, #-\out_offset_tail-4] + vstr s13, [OUT_HI, #-\out_offset_tail-12] + vstr s14, [OUT, #+\out_offset_tail+12] + vstr s15, [OUT, #+\out_offset_tail+4] + .endif + .ifnc "\head","" + vmul.f s12, s0, s16 @ vector operation + vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head} + .endif +.endm + /* void ff_imdct_half_vfp(FFTContext *s, * FFTSample *output, @@ -150,8 +235,7 @@ J3 .req lr function ff_imdct_half_vfp, export=1 ldr ip, [CONTEXT, #5*4] @ mdct_bits teq ip, #6 - it ne - bne X(ff_imdct_half_c) @ only case currently accelerated is the one used by DCA + bne 10f .set n, 1<<6 .set n2, n/2 @@ -189,6 +273,59 @@ function ff_imdct_half_vfp, export=1 fmxr FPSCR, OLDFPSCR vpop {s16-s27} pop {v1-v5,pc} + +10: + push {v1-v6,sl,fp,lr} + vpush {s16-s27} + fmrx OLDFPSCR, FPSCR + ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 + fmxr FPSCR, lr + mov lr, #1 + mov OUT, ORIGOUT + ldr REVTAB, [CONTEXT, #2*4] + ldr TCOS, [CONTEXT, #6*4] + ldr TSIN, [CONTEXT, #7*4] + mov lr, lr, lsl ip + + push {CONTEXT,OLDFPSCR} + add IN_HI, IN, lr, lsl #1 + add REVTAB_HI, REVTAB, lr, lsr #1 + add TCOS_HI, TCOS, lr + add TSIN_HI, TSIN, lr +0: prerotation_innerloop_rolled + teq IN, IN_HI + bne 0b + ldmia sp, {CONTEXT,OLDFPSCR} + + mov ORIGOUT, OUT + fmxr FPSCR, OLDFPSCR + ldr ip, [CONTEXT, #9*4] + blx ip @ s->fft_calc(s, output) + + pop {CONTEXT,OLDFPSCR} + ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 + ldr ip, [CONTEXT, #5*4] @ 
mdct_bits + fmxr FPSCR, lr + mov lr, #1 + mov lr, lr, lsl ip + sub TCOS, TCOS, lr, lsr #1 + sub TSIN, TSIN, lr, lsr #1 + add OUT_HI, OUT, lr, lsl #1 + add TCOS_HI, TCOS, lr + add TSIN_HI, TSIN, lr + postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0 + b 1f +0: add OUT, OUT, #32 + sub OUT_HI, OUT_HI, #32 + postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16 +1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0 + teq TSIN, TSIN_HI + bne 0b + postrotation_innerloop_rolled tail,,,,,, s24,, 16 + + fmxr FPSCR, OLDFPSCR + vpop {s16-s27} + pop {v1-v6,sl,fp,pc} endfunc .unreq CONTEXT @@ -203,3 +340,8 @@ endfunc .unreq J1 .unreq J2 .unreq J3 + .unreq REVTAB_HI + .unreq IN_HI + .unreq OUT_HI + .unreq TCOS_HI + .unreq TSIN_HI |