diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2013-01-20 13:20:30 -0800 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2013-01-22 11:55:42 -0800 |
commit | 42d324694883cdf1fff1612ac70fa403692a1ad4 (patch) | |
tree | 99b6a8629388614ec2395c847b79a2c448583341 /libavcodec/arm/dsputil_vfp.S | |
parent | 55aa03b9f8f11ebb7535424cc0e5635558590f49 (diff) | |
download | ffmpeg-streaming-42d324694883cdf1fff1612ac70fa403692a1ad4.zip ffmpeg-streaming-42d324694883cdf1fff1612ac70fa403692a1ad4.tar.gz |
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp.
Now, nellymoserenc and aacenc no longer depends on dsputil. Independent
of this patch, wmaprodec also does not depend on dsputil, so I removed
it from there also.
Diffstat (limited to 'libavcodec/arm/dsputil_vfp.S')
-rw-r--r-- | libavcodec/arm/dsputil_vfp.S | 106 |
1 files changed, 0 insertions, 106 deletions
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S deleted file mode 100644 index 9df955d..0000000 --- a/libavcodec/arm/dsputil_vfp.S +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/arm/asm.S" - -/* - * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle - * throughput for almost all the instructions (except for double precision - * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles - * for arithmetic operations. Scheduling code to avoid pipeline stalls is very - * important for performance. One more interesting feature is that VFP has - * independent load/store and arithmetics pipelines, so it is possible to make - * them work simultaneously and get more than 1 operation per cycle. Load/store - * pipeline can process 2 single precision floating point values per cycle and - * supports bulk loads and stores for large sets of registers. Arithmetic operations - * can be done on vectors, which allows to keep the arithmetics pipeline busy, - * while the processor may issue and execute other instructions. Detailed - * optimization manuals can be found at http://www.arm.com - */ - -/** - * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. - * Assume that len is a positive number and is multiple of 8 - */ -@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, -@ const float *src1, int len) -function ff_vector_fmul_reverse_vfp, export=1 - vpush {d8-d15} - add r2, r2, r3, lsl #2 - vldmdb r2!, {s0-s3} - vldmia r1!, {s8-s11} - vldmdb r2!, {s4-s7} - vldmia r1!, {s12-s15} - vmul.f32 s8, s3, s8 - vmul.f32 s9, s2, s9 - vmul.f32 s10, s1, s10 - vmul.f32 s11, s0, s11 -1: - subs r3, r3, #16 - it ge - vldmdbge r2!, {s16-s19} - vmul.f32 s12, s7, s12 - it ge - vldmiage r1!, {s24-s27} - vmul.f32 s13, s6, s13 - it ge - vldmdbge r2!, {s20-s23} - vmul.f32 s14, s5, s14 - it ge - vldmiage r1!, {s28-s31} - vmul.f32 s15, s4, s15 - it ge - vmulge.f32 s24, s19, s24 - it gt - vldmdbgt r2!, {s0-s3} - it ge - vmulge.f32 s25, s18, s25 - vstmia r0!, {s8-s13} - it ge - vmulge.f32 s26, s17, s26 - it gt - vldmiagt r1!, {s8-s11} - itt ge - vmulge.f32 s27, s16, s27 - vmulge.f32 s28, s23, s28 - it gt - vldmdbgt r2!, {s4-s7} - it ge - vmulge.f32 s29, s22, s29 - vstmia r0!, {s14-s15} - ittt ge - vmulge.f32 s30, s21, s30 - vmulge.f32 s31, s20, s31 - vmulge.f32 s8, s3, s8 - it gt - vldmiagt r1!, {s12-s15} - itttt ge - vmulge.f32 s9, s2, s9 - vmulge.f32 s10, s1, s10 - vstmiage r0!, {s24-s27} - vmulge.f32 s11, s0, s11 - it ge - vstmiage r0!, {s28-s31} - bgt 1b - - vpop {d8-d15} - bx lr -endfunc |