summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBen Avison <bavison@riscosopen.org>2014-07-11 00:12:31 +0100
committerMartin Storsjö <martin@martin.st>2014-07-18 01:34:08 +0300
commit5c22e8e4ad0852d61d5c4ba8d67d33fd72339497 (patch)
tree0203e0320cec765670ccd3388987963262c6272d
parent2d60444331fca1910510038dd3817bea885c2367 (diff)
downloadffmpeg-streaming-5c22e8e4ad0852d61d5c4ba8d67d33fd72339497.zip
ffmpeg-streaming-5c22e8e4ad0852d61d5c4ba8d67d33fd72339497.tar.gz
armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)
The previous implementation targeted DTS Coherent Acoustics, which only requires mdct_bits == 6. This relatively small size lent itself to unrolling the loops a small number of times, and encoding offsets calculated at assembly time within the load/store instructions of each iteration.

In the more general case (codecs such as AAC and AC3) much larger arrays are used - mdct_bits == [8, 9, 11]. The old method does not scale for these cases, so more integer registers are used with non-unrolled versions of the loops (and with some stack spillage). The postrotation filter loop is still unrolled by a factor of 2 to permit the double-buffering of some VFP registers to facilitate overlap of neighbouring iterations.

I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in ff_imdct_half_c / ff_imdct_half_vfp, for the same example AAC stream:

                     Before            After
                     Mean    StdDev    Mean    StdDev   Confidence  Change
aac_decode_frame     2368.1  35.8      2117.2  35.3     100.0%      +11.8%
ff_imdct_half_*      457.5   22.4      251.2   16.2     100.0%      +82.1%

Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r--libavcodec/arm/mdct_vfp.S146
1 files changed, 144 insertions, 2 deletions
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index 94db24f..f3fe668 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -33,6 +33,11 @@ J0 .req a2
J1 .req a4
J2 .req ip
J3 .req lr
+REVTAB_HI .req v5
+IN_HI .req v6
+OUT_HI .req v6
+TCOS_HI .req sl
+TSIN_HI .req fp
.macro prerotation_innerloop
.set trig_lo, k
@@ -76,6 +81,43 @@ J3 .req lr
.set k, k + 2
.endm
+.macro prerotation_innerloop_rolled @ one non-unrolled pre-rotation step: reads 4 floats upwards from IN and 4 downwards from IN_HI, rotates them by the matching TCOS and TSIN entries, and scatters four two-float results into OUT at positions taken from REVTAB
+ vldmia TCOS!, {s16,s17} @ s16,s17 = two cos values at TCOS, pointer advances by 8 bytes
+ vldmdb TCOS_HI!, {s18,s19} @ s18,s19 = two cos values just below TCOS_HI, pointer retreats by 8 bytes
+ vldr s0, [IN_HI, #-4] @ s0..s3 = first factor of each of the 4 lanes: lanes 0,1 come from below IN_HI, lanes 2,3 from above IN
+ vldr s1, [IN_HI, #-12]
+ vldr s2, [IN, #12]
+ vldr s3, [IN, #4]
+ vmul.f s8, s0, s16 @ vector operation: s8..s11 = s0..s3 * s16..s19 (relies on the length-4, stride-1 short-vector FPSCR mode set up by the caller)
+ vldmia TSIN!, {s20,s21} @ s20,s21 = two sin values at TSIN, pointer advances by 8 bytes
+ vldmdb TSIN_HI!, {s22,s23} @ s22,s23 = two sin values just below TSIN_HI, pointer retreats by 8 bytes
+ vldr s4, [IN] @ s4..s7 = second factor of each lane: lanes 0,1 come from above IN, lanes 2,3 from below IN_HI
+ vldr s5, [IN, #8]
+ vldr s6, [IN_HI, #-16]
+ vldr s7, [IN_HI, #-8]
+ vmul.f s12, s0, s20 @ vector operation: s12..s15 = s0..s3 * s20..s23
+ add IN, IN, #16 @ all 8 inputs are loaded, so move both input pointers 4 floats towards each other
+ sub IN_HI, IN_HI, #16
+ ldrh J0, [REVTAB], #2 @ J0,J1 = next two 16-bit REVTAB entries, reading upwards
+ ldrh J1, [REVTAB], #2
+ vmls.f s8, s4, s20 @ vector operation: s8..s11 -= s4..s7 * s20..s23
+ ldrh J3, [REVTAB_HI, #-2]! @ J3 is read before J2 while walking downwards, so J2 gets the lower-addressed entry, in the same order as s18,s19 and s22,s23
+ ldrh J2, [REVTAB_HI, #-2]!
+ add J0, OUT, J0, lsl #3 @ turn each table entry into a store address: OUT + 8 * entry
+ vmla.f s12, s4, s16 @ vector operation: s12..s15 += s4..s7 * s16..s19
+ add J1, OUT, J1, lsl #3
+ add J2, OUT, J2, lsl #3
+ add J3, OUT, J3, lsl #3
+ vstr s8, [J0] @ first float of each result: (first factor * cos) - (second factor * sin)
+ vstr s9, [J1]
+ vstr s10, [J2]
+ vstr s11, [J3]
+ vstr s12, [J0, #4] @ second float of each result: (first factor * sin) + (second factor * cos)
+ vstr s13, [J1, #4]
+ vstr s14, [J2, #4]
+ vstr s15, [J3, #4]
+.endm
+
.macro postrotation_innerloop tail, head
.set trig_lo_head, n8 - k - 2
.set trig_hi_head, n8 + k
@@ -142,6 +184,49 @@ J3 .req lr
.endif
.endm
+.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail @ software-pipelined post-rotation step: the tail part finishes and stores the group of 4 results started by the previous invocation, the head part loads and starts the next group. A part is assembled only when its tail or head argument is non-empty
+ .ifnc "\tail","" @ tail: subtract the cos products from the sin products left in s8..s11 by the previous head
+ vmls.f s8, s0, \tcos_s0_tail @ vector operation: s8..s11 -= s0..s3 * four cos registers starting at tcos_s0_tail (length-4 short-vector FPSCR mode is set up by the caller)
+ .endif
+ .ifnc "\head","" @ head: fetch the trig values for the next group
+ vldmia TSIN!, {s16,s17} @ s16,s17 = two sin values at TSIN, pointer advances by 8 bytes
+ vldmdb TSIN_HI!, {s18,s19} @ s18,s19 = two sin values just below TSIN_HI, pointer retreats by 8 bytes
+ vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head} @ two cos values at TCOS go into the registers named by the caller. Call sites alternate between s20..s23 and s24..s27 so the pending tail still has its own cos values
+ .endif
+ .ifnc "\tail","" @ tail: must run before the head reloads s4..s7 below
+ vmla.f s12, s4, \tcos_s0_tail @ vector operation: s12..s15 += s4..s7 * four cos registers starting at tcos_s0_tail
+ .endif
+ .ifnc "\head","" @ head: load 8 floats, in place, from the low end (OUT + out_offset_head) and the high end (OUT_HI - out_offset_head)
+ vldr s0, [OUT, #+\out_offset_head+0] @ s0..s3 = floats at even word positions: two from the low end, two from the high end
+ vldr s1, [OUT, #+\out_offset_head+8]
+ vldr s2, [OUT_HI, #-\out_offset_head-16]
+ vldr s3, [OUT_HI, #-\out_offset_head-8]
+ vldr s4, [OUT, #+\out_offset_head+4] @ s4..s7 = floats at odd word positions, the neighbours of s0..s3
+ vldr s5, [OUT, #+\out_offset_head+12]
+ vldr s6, [OUT_HI, #-\out_offset_head-12]
+ vldr s7, [OUT_HI, #-\out_offset_head-4]
+ .endif
+ .ifnc "\tail","" @ tail: s8..s11 are final, store them back to the same slots that s0..s3 of that group were loaded from
+ vstr s8, [OUT, #+\out_offset_tail+0]
+ vstr s9, [OUT, #+\out_offset_tail+8]
+ vstr s10, [OUT_HI, #-\out_offset_tail-16]
+ vstr s11, [OUT_HI, #-\out_offset_tail-8]
+ .endif
+ .ifnc "\head","" @ head: s8..s11 may be overwritten only now that the tail has stored them
+ vmul.f s8, s4, s16 @ vector operation: s8..s11 = s4..s7 * s16..s19
+ .endif
+ .ifnc "\tail","" @ tail: s12..s15 are final, stored to the mirrored slots (low-end lanes go to the high end and vice versa)
+ vstr s12, [OUT_HI, #-\out_offset_tail-4]
+ vstr s13, [OUT_HI, #-\out_offset_tail-12]
+ vstr s14, [OUT, #+\out_offset_tail+12]
+ vstr s15, [OUT, #+\out_offset_tail+4]
+ .endif
+ .ifnc "\head","" @ head: s12..s15 may be overwritten only now that the tail has stored them
+ vmul.f s12, s0, s16 @ vector operation: s12..s15 = s0..s3 * s16..s19
+ vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head} @ two cos values just below TCOS_HI, consumed by the tail part of the next invocation
+ .endif
+.endm
+
/* void ff_imdct_half_vfp(FFTContext *s,
* FFTSample *output,
@@ -150,8 +235,7 @@ J3 .req lr
function ff_imdct_half_vfp, export=1
ldr ip, [CONTEXT, #5*4] @ mdct_bits
teq ip, #6
- it ne
- bne X(ff_imdct_half_c) @ only case currently accelerated is the one used by DCA
+ bne 10f
.set n, 1<<6
.set n2, n/2
@@ -189,6 +273,59 @@ function ff_imdct_half_vfp, export=1
fmxr FPSCR, OLDFPSCR
vpop {s16-s27}
pop {v1-v5,pc}
+
+10:
+ push {v1-v6,sl,fp,lr}
+ vpush {s16-s27}
+ fmrx OLDFPSCR, FPSCR
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, lr
+ mov lr, #1
+ mov OUT, ORIGOUT
+ ldr REVTAB, [CONTEXT, #2*4]
+ ldr TCOS, [CONTEXT, #6*4]
+ ldr TSIN, [CONTEXT, #7*4]
+ mov lr, lr, lsl ip
+
+ push {CONTEXT,OLDFPSCR}
+ add IN_HI, IN, lr, lsl #1
+ add REVTAB_HI, REVTAB, lr, lsr #1
+ add TCOS_HI, TCOS, lr
+ add TSIN_HI, TSIN, lr
+0: prerotation_innerloop_rolled
+ teq IN, IN_HI
+ bne 0b
+ ldmia sp, {CONTEXT,OLDFPSCR}
+
+ mov ORIGOUT, OUT
+ fmxr FPSCR, OLDFPSCR
+ ldr ip, [CONTEXT, #9*4]
+ blx ip @ s->fft_calc(s, output)
+
+ pop {CONTEXT,OLDFPSCR}
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ ldr ip, [CONTEXT, #5*4] @ mdct_bits
+ fmxr FPSCR, lr
+ mov lr, #1
+ mov lr, lr, lsl ip
+ sub TCOS, TCOS, lr, lsr #1
+ sub TSIN, TSIN, lr, lsr #1
+ add OUT_HI, OUT, lr, lsl #1
+ add TCOS_HI, TCOS, lr
+ add TSIN_HI, TSIN, lr
+ postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
+ b 1f
+0: add OUT, OUT, #32
+ sub OUT_HI, OUT_HI, #32
+ postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
+1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
+ teq TSIN, TSIN_HI
+ bne 0b
+ postrotation_innerloop_rolled tail,,,,,, s24,, 16
+
+ fmxr FPSCR, OLDFPSCR
+ vpop {s16-s27}
+ pop {v1-v6,sl,fp,pc}
endfunc
.unreq CONTEXT
@@ -203,3 +340,8 @@ endfunc
.unreq J1
.unreq J2
.unreq J3
+ .unreq REVTAB_HI
+ .unreq IN_HI
+ .unreq OUT_HI
+ .unreq TCOS_HI
+ .unreq TSIN_HI
OpenPOWER on IntegriCloud