From 1a094af638281295bf087945923d258b5acd1ab1 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sat, 30 Jan 2016 14:45:28 +0100 Subject: fft: Split MDCT bits off from FFT --- libavcodec/aarch64/Makefile | 1 + libavcodec/aarch64/fft_init_aarch64.c | 12 --- libavcodec/aarch64/mdct_init.c | 39 +++++++++ libavcodec/arm/Makefile | 2 + libavcodec/arm/fft_fixed_init_arm.c | 10 --- libavcodec/arm/fft_init_arm.c | 15 ---- libavcodec/arm/mdct_fixed_init_arm.c | 40 +++++++++ libavcodec/arm/mdct_init_arm.c | 47 +++++++++++ libavcodec/fft.h | 7 ++ libavcodec/fft_template.c | 7 -- libavcodec/mdct_template.c | 20 +++++ libavcodec/ppc/Makefile | 1 + libavcodec/ppc/fft_init.c | 124 +-------------------------- libavcodec/ppc/mdct_init.c | 154 ++++++++++++++++++++++++++++++++++ libavcodec/x86/Makefile | 1 + libavcodec/x86/fft.asm | 128 ++++++++++++++-------------- libavcodec/x86/fft.h | 8 -- libavcodec/x86/fft_init.c | 7 -- libavcodec/x86/mdct.h | 32 +++++++ libavcodec/x86/mdct_init.c | 51 +++++++++++ 20 files changed, 463 insertions(+), 243 deletions(-) create mode 100644 libavcodec/aarch64/mdct_init.c create mode 100644 libavcodec/arm/mdct_fixed_init_arm.c create mode 100644 libavcodec/arm/mdct_init_arm.c create mode 100644 libavcodec/ppc/mdct_init.c create mode 100644 libavcodec/x86/mdct.h create mode 100644 libavcodec/x86/mdct_init.c (limited to 'libavcodec') diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 022ed84..311cd2c 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o +OBJS-$(CONFIG_MDCT) += aarch64/mdct_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c index f85091e..9cc57d3 100644 --- a/libavcodec/aarch64/fft_init_aarch64.c +++ b/libavcodec/aarch64/fft_init_aarch64.c @@ -18,8 +18,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "config.h" - #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/aarch64/cpu.h" @@ -29,10 +27,6 @@ void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - av_cold void ff_fft_init_aarch64(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -40,11 +34,5 @@ av_cold void ff_fft_init_aarch64(FFTContext *s) if (have_neon(cpu_flags)) { s->fft_permute = ff_fft_permute_neon; s->fft_calc = ff_fft_calc_neon; -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; -#endif } } diff --git a/libavcodec/aarch64/mdct_init.c b/libavcodec/aarch64/mdct_init.c new file mode 100644 index 0000000..816111a --- /dev/null +++ b/libavcodec/aarch64/mdct_init.c @@ -0,0 +1,39 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" + +#include "libavcodec/fft.h" + +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); + +av_cold void ff_mdct_init_aarch64(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; + } +} diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 89ec237..a684c8b 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -21,6 +21,8 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ OBJS-$(CONFIG_FLACDSP) += arm/flacdsp_init_arm.o \ arm/flacdsp_arm.o OBJS-$(CONFIG_G722DSP) += arm/g722dsp_init_arm.o +OBJS-$(CONFIG_MDCT) += arm/mdct_init_arm.o \ + arm/mdct_fixed_init_arm.o OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o diff --git a/libavcodec/arm/fft_fixed_init_arm.c b/libavcodec/arm/fft_fixed_init_arm.c index 1f941a1..5132b09 100644 --- a/libavcodec/arm/fft_fixed_init_arm.c +++ b/libavcodec/arm/fft_fixed_init_arm.c @@ -26,8 +26,6 @@ #include "libavcodec/fft.h" void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z); -void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i); -void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i); av_cold void ff_fft_fixed_init_arm(FFTContext *s) { @@ -36,13 +34,5 @@ av_cold void ff_fft_fixed_init_arm(FFTContext *s) if (have_neon(cpu_flags)) { s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; s->fft_calc = ff_fft_fixed_calc_neon; - -#if CONFIG_MDCT - if (!s->inverse && s->nbits >= 3) { - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; - s->mdct_calc = ff_mdct_fixed_calc_neon; - s->mdct_calcw = ff_mdct_fixed_calcw_neon; - } -#endif } } diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c index b6c2fd9..4d047ea 100644 --- a/libavcodec/arm/fft_init_arm.c +++ b/libavcodec/arm/fft_init_arm.c @@ -29,31 +29,16 @@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); -void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); - -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - av_cold void ff_fft_init_arm(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); if (have_vfp_vm(cpu_flags)) { s->fft_calc = ff_fft_calc_vfp; -#if CONFIG_MDCT - s->imdct_half = ff_imdct_half_vfp; -#endif } if (have_neon(cpu_flags)) { s->fft_permute = ff_fft_permute_neon; s->fft_calc = ff_fft_calc_neon; -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; -#endif } } diff --git a/libavcodec/arm/mdct_fixed_init_arm.c b/libavcodec/arm/mdct_fixed_init_arm.c new file mode 100644 index 0000000..606c80c --- /dev/null +++ b/libavcodec/arm/mdct_fixed_init_arm.c @@ -0,0 +1,40 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" + +#define FFT_FLOAT 0 +#include "libavcodec/fft.h" + +void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i); +void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i); + +av_cold void ff_mdct_fixed_init_arm(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + if (!s->inverse && s->nbits >= 3) { + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; + s->mdct_calc = ff_mdct_fixed_calc_neon; + s->mdct_calcw = ff_mdct_fixed_calcw_neon; + } + } +} diff --git a/libavcodec/arm/mdct_init_arm.c b/libavcodec/arm/mdct_init_arm.c new file mode 100644 index 0000000..24678dd --- /dev/null +++ b/libavcodec/arm/mdct_init_arm.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" + +#include "libavcodec/fft.h" + +void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); + +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); + +av_cold void ff_mdct_init_arm(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_vfp_vm(cpu_flags)) { + s->imdct_half = ff_imdct_half_vfp; + } + + if (have_neon(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; + } +} diff --git a/libavcodec/fft.h b/libavcodec/fft.h index 7daae24..57dc17f 100644 --- a/libavcodec/fft.h +++ b/libavcodec/fft.h @@ -154,4 +154,11 @@ void ff_fft_end(FFTContext *s); int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale); void ff_mdct_end(FFTContext *s); +void ff_mdct_init_aarch64(FFTContext *s); +void ff_mdct_init_arm(FFTContext *s); +void ff_mdct_init_ppc(FFTContext *s); +void ff_mdct_init_x86(FFTContext *s); + +void ff_mdct_fixed_init_arm(FFTContext *s); + #endif /* AVCODEC_FFT_H */ diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c index 808f317..3642b43 100644 --- a/libavcodec/fft_template.c +++ b/libavcodec/fft_template.c @@ -151,20 +151,13 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) s->fft_permute = fft_permute_c; s->fft_calc = fft_calc_c; -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_c; - s->imdct_half = ff_imdct_half_c; - s->mdct_calc = ff_mdct_calc_c; -#endif #if FFT_FLOAT if (ARCH_AARCH64) ff_fft_init_aarch64(s); if (ARCH_ARM) ff_fft_init_arm(s); if (ARCH_PPC) ff_fft_init_ppc(s); if (ARCH_X86) ff_fft_init_x86(s); - if (CONFIG_MDCT) s->mdct_calcw = s->mdct_calc; #else - if (CONFIG_MDCT) s->mdct_calcw = ff_mdct_calcw_c; if (ARCH_ARM) ff_fft_fixed_init_arm(s); #endif diff --git a/libavcodec/mdct_template.c b/libavcodec/mdct_template.c index bad890e..5b3a6ff 100644 --- a/libavcodec/mdct_template.c +++ b/libavcodec/mdct_template.c @@ -56,6 +56,26 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0) goto fail; + s->imdct_calc = ff_imdct_calc_c; + s->imdct_half = ff_imdct_half_c; + s->mdct_calc = ff_mdct_calc_c; + +#if FFT_FLOAT + if (ARCH_AARCH64) + ff_mdct_init_aarch64(s); + if (ARCH_ARM) + ff_mdct_init_arm(s); + if (ARCH_PPC) + ff_mdct_init_ppc(s); + if (ARCH_X86) + ff_mdct_init_x86(s); + s->mdct_calcw = s->mdct_calc; +#else + s->mdct_calcw = ff_mdct_calcw_c; + if (ARCH_ARM) + ff_mdct_fixed_init_arm(s); +#endif + s->tcos = av_malloc(n/2 * sizeof(FFTSample)); if (!s->tcos) goto fail; diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index 2b6c81b..759888b 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o OBJS-$(CONFIG_HUFFYUVDSP) += ppc/huffyuvdsp_altivec.o OBJS-$(CONFIG_FDCTDSP) += ppc/fdctdsp.o OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o +OBJS-$(CONFIG_MDCT) += ppc/mdct_init.o OBJS-$(CONFIG_ME_CMP) += ppc/me_cmp.o OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ diff --git a/libavcodec/ppc/fft_init.c b/libavcodec/ppc/fft_init.c index a749900..56eafb9 100644 --- a/libavcodec/ppc/fft_init.c +++ b/libavcodec/ppc/fft_init.c @@ -1,8 +1,4 @@ /* - * FFT/IFFT transforms - * AltiVec-enabled - * Copyright (c) 2009 Loren Merritt - * * This file is part of Libav. * * Libav is free software; you can redistribute it and/or @@ -21,126 +17,14 @@ */ #include "config.h" + #include "libavutil/cpu.h" #include "libavutil/ppc/cpu.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/fft.h" -/** - * Do a complex FFT with the parameters defined in ff_fft_init(). - * The input data must be permuted before with s->revtab table. - * No 1.0 / sqrt(n) normalization is done. - * AltiVec-enabled: - * This code assumes that the 'z' pointer is 16 bytes-aligned. - * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats. - */ +#include "libavcodec/fft.h" -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); -#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN -static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int j, k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n8 = n >> 3; - int n32 = n >> 5; - const uint16_t *revtabj = s->revtab; - const uint16_t *revtabk = s->revtab+n4; - const vec_f *tcos = (const vec_f*)(s->tcos+n8); - const vec_f *tsin = (const vec_f*)(s->tsin+n8); - const vec_f *pin = (const vec_f*)(input+n4); - vec_f *pout = (vec_f*)(output+n4); - - /* pre rotation */ - k = n32-1; - do { - vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; -#define CMULA(p,o0,o1,o2,o3)\ - a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ - b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ - re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ - im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ - cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ - sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ - r##p = im*cos - re*sin;\ - i##p = re*cos + im*sin; -#define STORE2(v,dst)\ - j = dst;\ - vec_ste(v, 0, output+j*2);\ - vec_ste(v, 4, output+j*2); -#define STORE8(p)\ - a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ - b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ - c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ - d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ - STORE2(a, revtabk[ p*2-4]);\ - STORE2(b, revtabk[ p*2-3]);\ - STORE2(c, revtabj[-p*2+2]);\ - STORE2(d, revtabj[-p*2+3]); - - cos0 = tcos[k]; - sin0 = tsin[k]; - cos1 = tcos[-k-1]; - sin1 = tsin[-k-1]; - CMULA(0, 0,1,2,3); - CMULA(1, 2,3,0,1); - STORE8(0); - STORE8(1); - revtabj += 4; - revtabk -= 4; - k--; - } while(k >= 0); - - ff_fft_calc_altivec(s, (FFTComplex*)output); - - /* post rotation + reordering */ - j = -n32; - k = n32-1; - do { - vec_f cos,sin,re,im,a,b,c,d; -#define CMULB(d0,d1,o)\ - re = pout[o*2];\ - im = pout[o*2+1];\ - cos = tcos[o];\ - sin = tsin[o];\ - d0 = im*sin - re*cos;\ - d1 = re*sin + im*cos; - - CMULB(a,b,j); - CMULB(c,d,k); - pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); - pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); - pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); - pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); - j++; - k--; - } while(k >= 0); -} - -static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n16 = n >> 4; - vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; - vec_u32 *p0 = (vec_u32*)(output+n4); - vec_u32 *p1 = (vec_u32*)(output+n4*3); - - imdct_half_altivec(s, output + n4, input); - - for (k = 0; k < n16; k++) { - vec_u32 a = p0[k] ^ sign; - vec_u32 b = p1[-k-1]; - p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); - p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); - } -} -#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ - av_cold void ff_fft_init_ppc(FFTContext *s) { #if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN @@ -148,9 +32,5 @@ av_cold void ff_fft_init_ppc(FFTContext *s) return; s->fft_calc = ff_fft_calc_interleave_altivec; - if (s->mdct_bits >= 5) { - s->imdct_calc = imdct_calc_altivec; - s->imdct_half = imdct_half_altivec; - } #endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ } diff --git a/libavcodec/ppc/mdct_init.c b/libavcodec/ppc/mdct_init.c new file mode 100644 index 0000000..d3582bc --- /dev/null +++ b/libavcodec/ppc/mdct_init.c @@ -0,0 +1,154 @@ +/* + * FFT/IFFT transforms + * AltiVec-enabled + * Copyright (c) 2009 Loren Merritt + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/types_altivec.h" +#include "libavutil/ppc/util_altivec.h" +#include "libavcodec/fft.h" + +/** + * Do a complex FFT with the parameters defined in ff_fft_init(). + * The input data must be permuted before with s->revtab table. + * No 1.0 / sqrt(n) normalization is done. + * AltiVec-enabled: + * This code assumes that the 'z' pointer is 16 bytes-aligned. + * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats. + */ + +void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); + +#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN +static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + int j, k; + int n = 1 << s->mdct_bits; + int n4 = n >> 2; + int n8 = n >> 3; + int n32 = n >> 5; + const uint16_t *revtabj = s->revtab; + const uint16_t *revtabk = s->revtab+n4; + const vec_f *tcos = (const vec_f*)(s->tcos+n8); + const vec_f *tsin = (const vec_f*)(s->tsin+n8); + const vec_f *pin = (const vec_f*)(input+n4); + vec_f *pout = (vec_f*)(output+n4); + + /* pre rotation */ + k = n32-1; + do { + vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; +#define CMULA(p,o0,o1,o2,o3)\ + a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ + b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ + re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ + im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ + cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ + sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ + r##p = im*cos - re*sin;\ + i##p = re*cos + im*sin; +#define STORE2(v,dst)\ + j = dst;\ + vec_ste(v, 0, output+j*2);\ + vec_ste(v, 4, output+j*2); +#define STORE8(p)\ + a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ + b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ + c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ + d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ + STORE2(a, revtabk[ p*2-4]);\ + STORE2(b, revtabk[ p*2-3]);\ + STORE2(c, revtabj[-p*2+2]);\ + STORE2(d, revtabj[-p*2+3]); + + cos0 = tcos[k]; + sin0 = tsin[k]; + cos1 = tcos[-k-1]; + sin1 = tsin[-k-1]; + CMULA(0, 0,1,2,3); + CMULA(1, 2,3,0,1); + STORE8(0); + STORE8(1); + revtabj += 4; + revtabk -= 4; + k--; + } while(k >= 0); + + ff_fft_calc_altivec(s, (FFTComplex*)output); + + /* post rotation + reordering */ + j = -n32; + k = n32-1; + do { + vec_f cos,sin,re,im,a,b,c,d; +#define CMULB(d0,d1,o)\ + re = pout[o*2];\ + im = pout[o*2+1];\ + cos = tcos[o];\ + sin = tsin[o];\ + d0 = im*sin - re*cos;\ + d1 = re*sin + im*cos; + + CMULB(a,b,j); + CMULB(c,d,k); + pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); + pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); + pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); + pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); + j++; + k--; + } while(k >= 0); +} + +static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + int k; + int n = 1 << s->mdct_bits; + int n4 = n >> 2; + int n16 = n >> 4; + vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; + vec_u32 *p0 = (vec_u32*)(output+n4); + vec_u32 *p1 = (vec_u32*)(output+n4*3); + + imdct_half_altivec(s, output + n4, input); + + for (k = 0; k < n16; k++) { + vec_u32 a = p0[k] ^ sign; + vec_u32 b = p1[-k-1]; + p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); + p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); + } +} +#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ + +av_cold void ff_mdct_init_ppc(FFTContext *s) +{ +#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + if (s->mdct_bits >= 5) { + s->imdct_calc = imdct_calc_altivec; + s->imdct_half = imdct_half_altivec; + } +#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ +} diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 8ae1283..1a2cfb1 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -19,6 +19,7 @@ OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o OBJS-$(CONFIG_LPC) += x86/lpc.o +OBJS-$(CONFIG_MDCT) += x86/mdct_init.o OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \ diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm index d3be72e..ef007f4 100644 --- a/libavcodec/x86/fft.asm +++ b/libavcodec/x86/fft.asm @@ -655,68 +655,6 @@ cglobal fft_permute, 2,7,1 jl .loopcopy REP_RET -%macro IMDCT_CALC_FUNC 0 -cglobal imdct_calc, 3,5,3 - mov r3d, [r0 + FFTContext.mdctsize] - mov r4, [r0 + FFTContext.imdcthalf] - add r1, r3 - PUSH r3 - PUSH r1 -%if ARCH_X86_32 - push r2 - push r1 - push r0 -%else - sub rsp, 8+32*WIN64 ; allocate win64 shadow space -%endif - call r4 -%if ARCH_X86_32 - add esp, 12 -%else - add rsp, 8+32*WIN64 -%endif - POP r1 - POP r3 - lea r0, [r1 + 2*r3] - mov r2, r3 - sub r3, mmsize - neg r2 - mova m2, [ps_m1m1m1m1] -.loop: -%if mmsize == 8 - PSWAPD m0, [r1 + r3] - PSWAPD m1, [r0 + r2] - pxor m0, m2 -%else - mova m0, [r1 + r3] - mova m1, [r0 + r2] - shufps m0, m0, 0x1b - shufps m1, m1, 0x1b - xorps m0, m2 -%endif - mova [r0 + r3], m1 - mova [r1 + r2], m0 - sub r3, mmsize - add r2, mmsize - jl .loop -%if cpuflag(3dnow) - femms - RET -%else - REP_RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX 3dnow -IMDCT_CALC_FUNC -INIT_MMX 3dnowext -IMDCT_CALC_FUNC -%endif - -INIT_XMM sse -IMDCT_CALC_FUNC - %if ARCH_X86_32 INIT_MMX 3dnow %define mulps pfmul @@ -791,6 +729,70 @@ DECL_FFT 4 DECL_FFT 4, _interleave %endif +%if CONFIG_MDCT + +%macro IMDCT_CALC_FUNC 0 +cglobal imdct_calc, 3,5,3 + mov r3d, [r0 + FFTContext.mdctsize] + mov r4, [r0 + FFTContext.imdcthalf] + add r1, r3 + PUSH r3 + PUSH r1 +%if ARCH_X86_32 + push r2 + push r1 + push r0 +%else + sub rsp, 8+32*WIN64 ; allocate win64 shadow space +%endif + call r4 +%if ARCH_X86_32 + add esp, 12 +%else + add rsp, 8+32*WIN64 +%endif + POP r1 + POP r3 + lea r0, [r1 + 2*r3] + mov r2, r3 + sub r3, mmsize + neg r2 + mova m2, [ps_m1m1m1m1] +.loop: +%if mmsize == 8 + PSWAPD m0, [r1 + r3] + PSWAPD m1, [r0 + r2] + pxor m0, m2 +%else + mova m0, [r1 + r3] + mova m1, [r0 + r2] + shufps m0, m0, 0x1b + shufps m1, m1, 0x1b + xorps m0, m2 +%endif + mova [r0 + r3], m1 + mova [r1 + r2], m0 + sub r3, mmsize + add r2, mmsize + jl .loop +%if cpuflag(3dnow) + femms + RET +%else + REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +IMDCT_CALC_FUNC +INIT_MMX 3dnowext +IMDCT_CALC_FUNC +%endif + +INIT_XMM sse +IMDCT_CALC_FUNC + INIT_XMM sse %undef mulps %undef addps @@ -1081,3 +1083,5 @@ DECL_IMDCT POSROTATESHUF_3DNOW INIT_YMM avx DECL_IMDCT POSROTATESHUF_AVX + +%endif ; CONFIG_MDCT diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index a604956..94405d0 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -27,12 +27,4 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z); -void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); - #endif /* AVCODEC_X86_FFT_H */ diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c index 2234d76..ed12909 100644 --- a/libavcodec/x86/fft_init.c +++ b/libavcodec/x86/fft_init.c @@ -30,28 +30,21 @@ av_cold void ff_fft_init_x86(FFTContext *s) #if ARCH_X86_32 if (EXTERNAL_AMD3DNOW(cpu_flags)) { - s->imdct_calc = ff_imdct_calc_3dnow; - s->imdct_half = ff_imdct_half_3dnow; s->fft_calc = ff_fft_calc_3dnow; } if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { - s->imdct_calc = ff_imdct_calc_3dnowext; - s->imdct_half = ff_imdct_half_3dnowext; s->fft_calc = ff_fft_calc_3dnowext; } #endif /* ARCH_X86_32 */ if (EXTERNAL_SSE(cpu_flags)) { - s->imdct_calc = ff_imdct_calc_sse; - s->imdct_half = ff_imdct_half_sse; s->fft_permute = ff_fft_permute_sse; s->fft_calc = ff_fft_calc_sse; s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; } if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) { - s->imdct_half = ff_imdct_half_avx; s->fft_calc = ff_fft_calc_avx; s->fft_permutation = FF_FFT_PERM_AVX; } diff --git a/libavcodec/x86/mdct.h b/libavcodec/x86/mdct.h new file mode 100644 index 0000000..cc107cb --- /dev/null +++ b/libavcodec/x86/mdct.h @@ -0,0 +1,32 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_MDCT_H +#define AVCODEC_X86_MDCT_H + +#include "libavcodec/fft.h" + +void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); + +#endif /* AVCODEC_X86_MDCT_H */ diff --git a/libavcodec/x86/mdct_init.c b/libavcodec/x86/mdct_init.c new file mode 100644 index 0000000..db642d8 --- /dev/null +++ b/libavcodec/x86/mdct_init.c @@ -0,0 +1,51 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" + +#include "mdct.h" + +av_cold void ff_mdct_init_x86(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + +#if ARCH_X86_32 + if (EXTERNAL_AMD3DNOW(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_3dnow; + s->imdct_half = ff_imdct_half_3dnow; + } + + if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_3dnowext; + s->imdct_half = ff_imdct_half_3dnowext; + } +#endif /* ARCH_X86_32 */ + + if (EXTERNAL_SSE(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_sse; + s->imdct_half = ff_imdct_half_sse; + } + + if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) { + s->imdct_half = ff_imdct_half_avx; + } +} -- cgit v1.1