diff options
author | Clément Bœsch <cboesch@gopro.com> | 2017-06-22 11:04:26 +0200 |
---|---|---|
committer | Clément Bœsch <u@pkh.me> | 2017-06-28 11:59:34 +0200 |
commit | e4a27e2f2dea60fb0cce6e555a6a8296e50edc54 (patch) | |
tree | 627422a43dd9181b9af4fdc15461e0de093c66a2 /libavcodec | |
parent | d2ef9e6e7f9ef71aae15e9493189515a857928b1 (diff) | |
download | ffmpeg-streaming-e4a27e2f2dea60fb0cce6e555a6a8296e50edc54.zip ffmpeg-streaming-e4a27e2f2dea60fb0cce6e555a6a8296e50edc54.tar.gz |
lavc/arm: fix lack of precision in ff_ps_stereo_interpolate_neon
The code originally pre-multiplied the steps by 2, causing the running sum
of the h factors to drift away due to the lack of precision. This quickly
causes an inaccuracy > 0.01.
I tried various approaches, such as multiplying by 2.0 (instead of adding
the value to itself), without success.
I'm unable to bench the impact of this change, feel free to compare.
This commit fixes the incoming aacpsdsp tests.
Following is an alternative simplified function (matching the incoming
AArch64 code) that may be used:
function ff_ps_stereo_interpolate_neon, export=1
        @ In:   r0   = first buffer of 2-float samples, rewritten in place
        @       r1   = second buffer of 2-float samples, rewritten in place
        @       r2   = 4 floats c0..c3, the running coefficients
        @       r3   = 4 floats s0..s3, added to c0..c3 before every sample
        @       [sp] = number of samples to process
        @ Both sample buffers are accessed with a 64-bit alignment hint.
        @
        @ For each sample pair (a from r0, b from r1):
        @       c  += s
        @       a'  = c0 * a + c2 * b           (stored back through r0)
        @       b'  = c1 * a + c3 * b           (stored back through r1)
        @
        @ Clobbers: r0, r1, r12, q0-q3, q8, q9, flags.
        ldr             r12, [sp]
        cmp             r12, #0
        ble             2f                      @ count <= 0: touch no memory
        vld1.32         {q0},  [r2]             @ q0 = c0 c1 c2 c3
        vld1.32         {q1},  [r3]             @ q1 = s0 s1 s2 s3
        vmov.f32        q8,  q0
        vmov.f32        q9,  q1
        vzip.32         q8,  q0                 @ q8 = c0 c0 c1 c1, q0 = c2 c2 c3 c3
        vzip.32         q9,  q1                 @ q9 = s0 s0 s1 s1, q1 = s2 s2 s3 s3
1:
        vld1.32         {d4},  [r0,:64]         @ d4 = a (two floats)
        vld1.32         {d6},  [r1,:64]         @ d6 = b (two floats)
        vadd.f32        q8,  q8,  q9            @ step the coefficients once per
        vadd.f32        q0,  q0,  q1            @ sample, without pre-scaling
        vmov.f32        d5,  d4                 @ q2 = a a
        vmov.f32        d7,  d6                 @ q3 = b b
        vmul.f32        q2,  q2,  q8            @ q2  = c0*a | c1*a
        vmla.f32        q2,  q3,  q0            @ q2 += c2*b | c3*b
        vst1.32         {d4},  [r0,:64]!        @ a' -> first buffer, advance
        vst1.32         {d5},  [r1,:64]!        @ b' -> second buffer, advance
        subs            r12, r12, #1
        bgt             1b
2:
        bx              lr
endfunc
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/arm/aacpsdsp_neon.S | 9 |
1 files changed, 5 insertions, 4 deletions
diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S index a93bbfe..3b1bed2 100644 --- a/libavcodec/arm/aacpsdsp_neon.S +++ b/libavcodec/arm/aacpsdsp_neon.S @@ -232,12 +232,11 @@ endfunc function ff_ps_stereo_interpolate_neon, export=1 vld1.32 {q0}, [r2] vld1.32 {q14}, [r3] - vadd.f32 q15, q14, q14 mov r2, r0 mov r3, r1 ldr r12, [sp] vadd.f32 q1, q0, q14 - vadd.f32 q0, q0, q15 + vadd.f32 q0, q1, q14 vld1.32 {q2}, [r0,:64]! vld1.32 {q3}, [r1,:64]! subs r12, r12, #1 @@ -251,8 +250,10 @@ function ff_ps_stereo_interpolate_neon, export=1 vmla.f32 d17, d7, d1[0] vmla.f32 d18, d6, d3[1] vmla.f32 d19, d7, d1[1] - vadd.f32 q1, q1, q15 - vadd.f32 q0, q0, q15 + vadd.f32 q1, q1, q14 + vadd.f32 q0, q0, q14 + vadd.f32 q1, q1, q14 + vadd.f32 q0, q0, q14 vld1.32 {q2}, [r0,:64]! vld1.32 {q3}, [r1,:64]! vst1.32 {q8}, [r2,:64]! |