diff options
Diffstat (limited to 'libavcodec/arm')
142 files changed, 6450 insertions, 1379 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index b48745a..1eeac54 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -21,8 +21,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ arm/idctdsp_arm.o \ arm/jrevdct_arm.o \ arm/simple_idct_arm.o -OBJS-$(CONFIG_MDCT) += arm/mdct_init_arm.o \ - arm/mdct_fixed_init_arm.o +OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o @@ -39,14 +38,15 @@ OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o # decoders/encoders OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ arm/sbrdsp_init_arm.o -OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o -OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o +OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o -OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_arm.o +OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \ + arm/vp9dsp_init_12bpp_arm.o \ + arm/vp9dsp_init_arm.o # ARMv5 optimizations @@ -91,8 +91,7 @@ VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o # decoders/encoders -VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \ - arm/synth_filter_vfp.o +VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o # NEON optimizations @@ -132,15 +131,20 @@ NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \ # decoders/encoders NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ arm/sbrdsp_neon.o -NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o -NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ - arm/synth_filter_neon.o -NEON-OBJS-$(CONFIG_HEVC_DECODER) 
+= arm/hevc_idct.o +NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o +NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o +NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ + arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_idct_neon.o \ + arm/hevcdsp_qpel_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o -NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_neon.o \ +NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \ + arm/vp9itxfm_neon.o \ + arm/vp9lpf_16bpp_neon.o \ arm/vp9lpf_neon.o \ + arm/vp9mc_16bpp_neon.o \ arm/vp9mc_neon.o diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h index 4f143cb..cafa881 100644 --- a/libavcodec/arm/aac.h +++ b/libavcodec/arm/aac.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/aacpsdsp_init_arm.c b/libavcodec/arm/aacpsdsp_init_arm.c index 6326376..6eb979e 100644 --- a/libavcodec/arm/aacpsdsp_init_arm.c +++ b/libavcodec/arm/aacpsdsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2012 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -29,7 +29,7 @@ void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2], float *src1, int n); void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2], const float (*filter)[8][2], - int stride, int n); + ptrdiff_t stride, int n); void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64], int i, int len); void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2], diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S index fb00900..3b1bed2 100644 --- a/libavcodec/arm/aacpsdsp_neon.S +++ b/libavcodec/arm/aacpsdsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2012 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -232,12 +232,11 @@ endfunc function ff_ps_stereo_interpolate_neon, export=1 vld1.32 {q0}, [r2] vld1.32 {q14}, [r3] - vadd.f32 q15, q14, q14 mov r2, r0 mov r3, r1 ldr r12, [sp] vadd.f32 q1, q0, q14 - vadd.f32 q0, q0, q15 + vadd.f32 q0, q1, q14 vld1.32 {q2}, [r0,:64]! vld1.32 {q3}, [r1,:64]! subs r12, r12, #1 @@ -251,8 +250,10 @@ function ff_ps_stereo_interpolate_neon, export=1 vmla.f32 d17, d7, d1[0] vmla.f32 d18, d6, d3[1] vmla.f32 d19, d7, d1[1] - vadd.f32 q1, q1, q15 - vadd.f32 q0, q0, q15 + vadd.f32 q1, q1, q14 + vadd.f32 q0, q0, q14 + vadd.f32 q1, q1, q14 + vadd.f32 q0, q0, q14 vld1.32 {q2}, [r0,:64]! vld1.32 {q3}, [r1,:64]! vst1.32 {q8}, [r2,:64]! diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S index ed8eb37..1aea190 100644 --- a/libavcodec/arm/ac3dsp_arm.S +++ b/libavcodec/arm/ac3dsp_arm.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S index 2028d0b..1d2563d 100644 --- a/libavcodec/arm/ac3dsp_armv6.S +++ b/libavcodec/arm/ac3dsp_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/ac3dsp_init_arm.c b/libavcodec/arm/ac3dsp_init_arm.c index a48353a..a3c32ff 100644 --- a/libavcodec/arm/ac3dsp_init_arm.c +++ b/libavcodec/arm/ac3dsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -33,6 +33,14 @@ void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len); void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs); void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src, const int16_t *window, unsigned n); +void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4], + const int32_t *coef0, + const int32_t *coef1, + int len); +void ff_ac3_sum_square_butterfly_float_neon(float sum[4], + const float *coef0, + const float *coef1, + int len); void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd, int start, int end, @@ -59,5 +67,7 @@ av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact) c->float_to_fixed24 = ff_float_to_fixed24_neon; c->extract_exponents = ff_ac3_extract_exponents_neon; c->apply_window_int16 = ff_apply_window_int16_neon; + c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon; + c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon; } } diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S index f97b190..89d0ae8 100644 
--- a/libavcodec/arm/ac3dsp_neon.S +++ b/libavcodec/arm/ac3dsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -131,3 +131,47 @@ function ff_apply_window_int16_neon, export=1 pop {r4,pc} endfunc + +function ff_ac3_sum_square_butterfly_int32_neon, export=1 + vmov.i64 q0, #0 + vmov.i64 q1, #0 + vmov.i64 q2, #0 + vmov.i64 q3, #0 +1: + vld1.32 {d16}, [r1]! + vld1.32 {d17}, [r2]! + vadd.s32 d18, d16, d17 + vsub.s32 d19, d16, d17 + vmlal.s32 q0, d16, d16 + vmlal.s32 q1, d17, d17 + vmlal.s32 q2, d18, d18 + vmlal.s32 q3, d19, d19 + subs r3, r3, #2 + bgt 1b + vadd.s64 d0, d0, d1 + vadd.s64 d1, d2, d3 + vadd.s64 d2, d4, d5 + vadd.s64 d3, d6, d7 + vst1.64 {q0-q1}, [r0] + bx lr +endfunc + +function ff_ac3_sum_square_butterfly_float_neon, export=1 + vmov.f32 q0, #0.0 + vmov.f32 q1, #0.0 +1: + vld1.32 {d16}, [r1]! + vld1.32 {d17}, [r2]! 
+ vadd.f32 d18, d16, d17 + vsub.f32 d19, d16, d17 + vmla.f32 d0, d16, d16 + vmla.f32 d1, d17, d17 + vmla.f32 d2, d18, d18 + vmla.f32 d3, d19, d19 + subs r3, r3, #2 + bgt 1b + vpadd.f32 d0, d0, d1 + vpadd.f32 d1, d2, d3 + vst1.32 {q0}, [r0] + bx lr +endfunc diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h index 0ea2f04..a2174b0 100644 --- a/libavcodec/arm/asm-offsets.h +++ b/libavcodec/arm/asm-offsets.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/audiodsp_arm.h b/libavcodec/arm/audiodsp_arm.h index e97e804..213660d 100644 --- a/libavcodec/arm/audiodsp_arm.h +++ b/libavcodec/arm/audiodsp_arm.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/audiodsp_init_arm.c b/libavcodec/arm/audiodsp_init_arm.c index ea9ec3c..74aa52a 100644 --- a/libavcodec/arm/audiodsp_init_arm.c +++ b/libavcodec/arm/audiodsp_init_arm.c @@ -1,20 +1,20 @@ /* * ARM optimized audio functions * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/audiodsp_init_neon.c b/libavcodec/arm/audiodsp_init_neon.c index 08405cb..6902db8 100644 --- a/libavcodec/arm/audiodsp_init_neon.c +++ b/libavcodec/arm/audiodsp_init_neon.c @@ -2,20 +2,20 @@ * ARM NEON optimised audio functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/audiodsp_neon.S b/libavcodec/arm/audiodsp_neon.S index 5871b82..cea700c 100644 --- a/libavcodec/arm/audiodsp_neon.S +++ b/libavcodec/arm/audiodsp_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised audio functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/blockdsp_arm.h b/libavcodec/arm/blockdsp_arm.h index d26630e..59ebeb8 100644 --- a/libavcodec/arm/blockdsp_arm.h +++ b/libavcodec/arm/blockdsp_arm.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/blockdsp_init_arm.c b/libavcodec/arm/blockdsp_init_arm.c index a5db201..2080d52 100644 --- a/libavcodec/arm/blockdsp_init_arm.c +++ b/libavcodec/arm/blockdsp_init_arm.c @@ -1,20 +1,20 @@ /* * ARM optimized block operations * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/blockdsp_init_neon.c b/libavcodec/arm/blockdsp_init_neon.c index e285750..0600bc6 100644 --- a/libavcodec/arm/blockdsp_init_neon.c +++ b/libavcodec/arm/blockdsp_init_neon.c @@ -2,20 +2,20 @@ * ARM NEON optimised block operations * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/blockdsp_neon.S b/libavcodec/arm/blockdsp_neon.S index 98df2c6..9fc63cb 100644 --- a/libavcodec/arm/blockdsp_neon.S +++ b/libavcodec/arm/blockdsp_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised block functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h index 6ff5f1a..fdbf86b 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -59,12 +59,18 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c, "tst %[r_c] , %[r_c] \n\t" "bne 2f \n\t" "ldr %[r_c] , [%[c], %[byte]] \n\t" +#if UNCHECKED_BITSTREAM_READER + "ldrh %[tmp] , [%[r_c]] \n\t" + "add %[r_c] , %[r_c] , #2 \n\t" + "str %[r_c] , [%[c], %[byte]] \n\t" +#else "ldr %[r_b] , [%[c], %[end]] \n\t" "ldrh %[tmp] , [%[r_c]] \n\t" "cmp %[r_c] , %[r_b] \n\t" "itt lt \n\t" "addlt %[r_c] , %[r_c] , #2 \n\t" "strlt %[r_c] , [%[c], %[byte]] \n\t" +#endif "sub %[r_c] , %[low] , #1 \n\t" "add %[r_b] , %[tables] , %[norm_off] \n\t" "eor %[r_c] , %[low] , %[r_c] \n\t" diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h index 4aed576..ae4b730 100644 --- a/libavcodec/arm/dca.h +++ b/libavcodec/arm/dca.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -24,10 +24,9 @@ #include <stdint.h> #include "config.h" -#include "libavcodec/dcadsp.h" #include "libavcodec/mathops.h" -#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) +#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB #define decode_blockcodes decode_blockcodes static inline int decode_blockcodes(int code1, int code2, int levels, @@ -35,46 +34,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels, { int32_t v0, v1, v2, v3, v4, v5; - __asm__ ("smmul %8, %14, %18 \n" - "smmul %11, %15, %18 \n" - "smlabb %14, %8, %17, %14 \n" - "smlabb %15, %11, %17, %15 \n" - "smmul %9, %8, %18 \n" - "smmul %12, %11, %18 \n" - "sub %14, %14, %16, lsr #1 \n" - "sub %15, %15, %16, lsr #1 \n" - "smlabb %8, %9, %17, %8 \n" - "smlabb %11, %12, %17, %11 \n" - "smmul %10, %9, %18 \n" - "smmul %13, %12, %18 \n" - "str %14, %0 \n" - "str %15, %4 \n" - "sub %8, %8, %16, lsr #1 \n" - "sub %11, %11, %16, lsr #1 \n" - "smlabb %9, %10, %17, %9 \n" - "smlabb %12, %13, %17, %12 \n" - "smmul %14, %10, %18 \n" - "smmul %15, %13, %18 \n" - "str %8, %1 \n" - "str %11, %5 \n" - "sub %9, %9, %16, lsr #1 \n" - "sub %12, %12, %16, lsr #1 \n" - "smlabb %10, %14, %17, %10 \n" - "smlabb %13, %15, %17, %13 \n" - "str %9, %2 \n" - "str %12, %6 \n" - "sub %10, %10, %16, lsr #1 \n" - "sub %13, %13, %16, lsr #1 \n" - "str %10, %3 \n" - "str %13, %7 \n" - : "=m"(values[0]), "=m"(values[1]), - "=m"(values[2]), "=m"(values[3]), - "=m"(values[4]), "=m"(values[5]), - "=m"(values[6]), "=m"(values[7]), - "=&r"(v0), "=&r"(v1), "=&r"(v2), + __asm__ ("smmul %0, %6, %10 \n" + "smmul %3, %7, %10 \n" + "smlabb %6, %0, %9, %6 \n" + "smlabb %7, %3, %9, %7 \n" + "smmul %1, %0, %10 \n" + "smmul %4, %3, 
%10 \n" + "sub %6, %6, %8, lsr #1 \n" + "sub %7, %7, %8, lsr #1 \n" + "smlabb %0, %1, %9, %0 \n" + "smlabb %3, %4, %9, %3 \n" + "smmul %2, %1, %10 \n" + "smmul %5, %4, %10 \n" + "str %6, [%11, #0] \n" + "str %7, [%11, #16] \n" + "sub %0, %0, %8, lsr #1 \n" + "sub %3, %3, %8, lsr #1 \n" + "smlabb %1, %2, %9, %1 \n" + "smlabb %4, %5, %9, %4 \n" + "smmul %6, %2, %10 \n" + "smmul %7, %5, %10 \n" + "str %0, [%11, #4] \n" + "str %3, [%11, #20] \n" + "sub %1, %1, %8, lsr #1 \n" + "sub %4, %4, %8, lsr #1 \n" + "smlabb %2, %6, %9, %2 \n" + "smlabb %5, %7, %9, %5 \n" + "str %1, [%11, #8] \n" + "str %4, [%11, #24] \n" + "sub %2, %2, %8, lsr #1 \n" + "sub %5, %5, %8, lsr #1 \n" + "str %2, [%11, #12] \n" + "str %5, [%11, #28] \n" + : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "=&r"(v4), "=&r"(v5), "+&r"(code1), "+&r"(code2) - : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels])); + : "r"(levels - 1), "r"(-levels), + "r"(ff_inverse[levels]), "r"(values) + : "memory"); return code1 | code2; } diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S deleted file mode 100644 index 735c4c2..0000000 --- a/libavcodec/arm/dcadsp_neon.S +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_dca_lfe_fir0_neon, export=1 - push {r4-r6,lr} - mov r3, #32 @ decifactor - mov r6, #256/32 - b dca_lfe_fir -endfunc - -function ff_dca_lfe_fir1_neon, export=1 - push {r4-r6,lr} - mov r3, #64 @ decifactor - mov r6, #256/64 -dca_lfe_fir: - add r4, r0, r3, lsl #2 @ out2 - add r5, r2, #256*4-16 @ cf1 - sub r1, r1, #12 - mov lr, #-16 -1: - vmov.f32 q2, #0.0 @ v0 - vmov.f32 q3, #0.0 @ v1 - mov r12, r6 -2: - vld1.32 {q8}, [r2,:128]! @ cf0 - vld1.32 {q9}, [r5,:128], lr @ cf1 - vld1.32 {q1}, [r1], lr @ in - subs r12, r12, #4 - vrev64.32 q10, q8 - vmla.f32 q3, q1, q9 - vmla.f32 d4, d2, d21 - vmla.f32 d5, d3, d20 - bne 2b - - add r1, r1, r6, lsl #2 - subs r3, r3, #1 - vadd.f32 d4, d4, d5 - vadd.f32 d6, d6, d7 - vpadd.f32 d5, d4, d6 - vst1.32 {d5[0]}, [r0,:32]! - vst1.32 {d5[1]}, [r4,:32]! - bne 1b - - pop {r4-r6,pc} -endfunc diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S deleted file mode 100644 index c9114d4..0000000 --- a/libavcodec/arm/dcadsp_vfp.S +++ /dev/null @@ -1,476 +0,0 @@ -/* - * Copyright (c) 2013 RISC OS Open Ltd - * Author: Ben Avison <bavison@riscosopen.org> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -POUT .req a1 -PIN .req a2 -PCOEF .req a3 -OLDFPSCR .req a4 -COUNTER .req ip - -IN0 .req s4 -IN1 .req s5 -IN2 .req s6 -IN3 .req s7 -IN4 .req s0 -IN5 .req s1 -IN6 .req s2 -IN7 .req s3 -COEF0 .req s8 @ coefficient elements -COEF1 .req s9 -COEF2 .req s10 -COEF3 .req s11 -COEF4 .req s12 -COEF5 .req s13 -COEF6 .req s14 -COEF7 .req s15 -ACCUM0 .req s16 @ double-buffered multiply-accumulate results -ACCUM4 .req s20 -POST0 .req s24 @ do long-latency post-multiply in this vector in parallel -POST1 .req s25 -POST2 .req s26 -POST3 .req s27 - - -.macro inner_loop decifactor, dir, tail, head - .ifc "\dir","up" - .set X, 0 - .set Y, 4 - .else - .set X, 4*JMAX*4 - 4 - .set Y, -4 - .endif - .ifnc "\head","" - vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y] - vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y] - vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y] - vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y] - .endif - .ifnc "\tail","" - vadd.f POST0, ACCUM0, ACCUM4 @ vector operation - .endif - .ifnc "\head","" - vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar - vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y] - vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y] - vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y] - .endif - .ifnc "\head","" - vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y] - .ifc "\tail","" - vmul.f ACCUM4, COEF4, IN1 @ vector operation - .endif - vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y] - vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y] - .ifnc "\tail","" - vmul.f ACCUM4, COEF4, IN1 @ vector operation - .endif - vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y] - vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y] - .endif - .ifnc "\tail","" - vstmia POUT!, {POST0-POST3} - .endif - .ifnc "\head","" - vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar - vldr COEF4, 
[PCOEF, #X + (0*JMAX + 3) * Y] - vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y] - vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y] - vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y] - vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar - .if \decifactor == 32 - vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y] - vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y] - vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y] - vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y] - vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar - vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y] - vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y] - vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y] - vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y] - vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar - vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y] - vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y] - vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y] - vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y] - vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar - vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y] - vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y] - vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y] - vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y] - vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar - .endif - .endif -.endm - -.macro dca_lfe_fir decifactor -function ff_dca_lfe_fir\decifactor\()_vfp, export=1 - fmrx OLDFPSCR, FPSCR - ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 - fmxr FPSCR, ip - vldr IN0, [PIN, #-0*4] - vldr IN1, [PIN, #-1*4] - vldr IN2, [PIN, #-2*4] - vldr IN3, [PIN, #-3*4] - .if \decifactor == 32 - .set JMAX, 8 - vpush {s16-s31} - vldr IN4, [PIN, #-4*4] - vldr IN5, [PIN, #-5*4] - vldr IN6, [PIN, #-6*4] - vldr IN7, [PIN, #-7*4] - .else - .set JMAX, 4 - vpush {s16-s27} - .endif - - mov COUNTER, #\decifactor/4 - 1 - inner_loop \decifactor, up,, head -1: add PCOEF, PCOEF, #4*JMAX*4 - subs COUNTER, COUNTER, #1 - inner_loop \decifactor, up, tail, head - bne 1b - inner_loop \decifactor, up, tail - - mov COUNTER, #\decifactor/4 - 1 - inner_loop \decifactor, 
down,, head -1: sub PCOEF, PCOEF, #4*JMAX*4 - subs COUNTER, COUNTER, #1 - inner_loop \decifactor, down, tail, head - bne 1b - inner_loop \decifactor, down, tail - - .if \decifactor == 32 - vpop {s16-s31} - .else - vpop {s16-s27} - .endif - fmxr FPSCR, OLDFPSCR - bx lr -endfunc -.endm - - dca_lfe_fir 64 - .ltorg - dca_lfe_fir 32 - - .unreq POUT - .unreq PIN - .unreq PCOEF - .unreq OLDFPSCR - .unreq COUNTER - - .unreq IN0 - .unreq IN1 - .unreq IN2 - .unreq IN3 - .unreq IN4 - .unreq IN5 - .unreq IN6 - .unreq IN7 - .unreq COEF0 - .unreq COEF1 - .unreq COEF2 - .unreq COEF3 - .unreq COEF4 - .unreq COEF5 - .unreq COEF6 - .unreq COEF7 - .unreq ACCUM0 - .unreq ACCUM4 - .unreq POST0 - .unreq POST1 - .unreq POST2 - .unreq POST3 - - -IN .req a1 -SBACT .req a2 -OLDFPSCR .req a3 -IMDCT .req a4 -WINDOW .req v1 -OUT .req v2 -BUF .req v3 -SCALEINT .req v4 @ only used in softfp case -COUNT .req v5 - -SCALE .req s0 - -/* Stack layout differs in softfp and hardfp cases: - * - * hardfp - * fp -> 6 arg words saved by caller - * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes) - * s16-s23 on entry - * align 16 - * buf -> 8*32*4 bytes buffer - * s0 on entry - * sp -> 3 arg words for callee - * - * softfp - * fp -> 7 arg words saved by caller - * a4,v1-v5,fp,lr on entry - * s16-s23 on entry - * align 16 - * buf -> 8*32*4 bytes buffer - * sp -> 4 arg words for callee - */ - -/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, - * SynthFilterContext *synth, FFTContext *imdct, - * float (*synth_buf_ptr)[512], - * int *synth_buf_offset, float (*synth_buf2)[32], - * const float (*window)[512], float *samples_out, - * float (*raXin)[32], float scale); - */ -function ff_dca_qmf_32_subbands_vfp, export=1 -VFP push {a3-a4,v1-v3,v5,fp,lr} -NOVFP push {a4,v1-v5,fp,lr} - add fp, sp, #8*4 - vpush {s16-s23} - @ The buffer pointed at by raXin isn't big enough for us to do a - @ complete matrix transposition as we want to, so allocate an - @ alternative buffer from the stack. 
Align to 4 words for speed. - sub BUF, sp, #8*32*4 - bic BUF, BUF, #15 - mov sp, BUF - ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2 - fmrx OLDFPSCR, FPSCR - fmxr FPSCR, lr - @ COUNT is used to count down 2 things at once: - @ bits 0-4 are the number of word pairs remaining in the output row - @ bits 5-31 are the number of words to copy (with possible negation) - @ from the source matrix before we start zeroing the remainder - mov COUNT, #(-4 << 5) + 16 - adds COUNT, COUNT, SBACT, lsl #5 - bmi 2f -1: - vldr s8, [IN, #(0*8+0)*4] - vldr s10, [IN, #(0*8+1)*4] - vldr s12, [IN, #(0*8+2)*4] - vldr s14, [IN, #(0*8+3)*4] - vldr s16, [IN, #(0*8+4)*4] - vldr s18, [IN, #(0*8+5)*4] - vldr s20, [IN, #(0*8+6)*4] - vldr s22, [IN, #(0*8+7)*4] - vneg.f s8, s8 - vldr s9, [IN, #(1*8+0)*4] - vldr s11, [IN, #(1*8+1)*4] - vldr s13, [IN, #(1*8+2)*4] - vldr s15, [IN, #(1*8+3)*4] - vneg.f s16, s16 - vldr s17, [IN, #(1*8+4)*4] - vldr s19, [IN, #(1*8+5)*4] - vldr s21, [IN, #(1*8+6)*4] - vldr s23, [IN, #(1*8+7)*4] - vstr d4, [BUF, #(0*32+0)*4] - vstr d5, [BUF, #(1*32+0)*4] - vstr d6, [BUF, #(2*32+0)*4] - vstr d7, [BUF, #(3*32+0)*4] - vstr d8, [BUF, #(4*32+0)*4] - vstr d9, [BUF, #(5*32+0)*4] - vstr d10, [BUF, #(6*32+0)*4] - vstr d11, [BUF, #(7*32+0)*4] - vldr s9, [IN, #(3*8+0)*4] - vldr s11, [IN, #(3*8+1)*4] - vldr s13, [IN, #(3*8+2)*4] - vldr s15, [IN, #(3*8+3)*4] - vldr s17, [IN, #(3*8+4)*4] - vldr s19, [IN, #(3*8+5)*4] - vldr s21, [IN, #(3*8+6)*4] - vldr s23, [IN, #(3*8+7)*4] - vneg.f s9, s9 - vldr s8, [IN, #(2*8+0)*4] - vldr s10, [IN, #(2*8+1)*4] - vldr s12, [IN, #(2*8+2)*4] - vldr s14, [IN, #(2*8+3)*4] - vneg.f s17, s17 - vldr s16, [IN, #(2*8+4)*4] - vldr s18, [IN, #(2*8+5)*4] - vldr s20, [IN, #(2*8+6)*4] - vldr s22, [IN, #(2*8+7)*4] - vstr d4, [BUF, #(0*32+2)*4] - vstr d5, [BUF, #(1*32+2)*4] - vstr d6, [BUF, #(2*32+2)*4] - vstr d7, [BUF, #(3*32+2)*4] - vstr d8, [BUF, #(4*32+2)*4] - vstr d9, [BUF, #(5*32+2)*4] - vstr d10, [BUF, #(6*32+2)*4] - vstr d11, [BUF, 
#(7*32+2)*4] - add IN, IN, #4*8*4 - add BUF, BUF, #4*4 - subs COUNT, COUNT, #(4 << 5) + 2 - bpl 1b -2: @ Now deal with trailing < 4 samples - adds COUNT, COUNT, #3 << 5 - bmi 4f @ sb_act was a multiple of 4 - bics lr, COUNT, #0x1F - bne 3f - @ sb_act was n*4+1 - vldr s8, [IN, #(0*8+0)*4] - vldr s10, [IN, #(0*8+1)*4] - vldr s12, [IN, #(0*8+2)*4] - vldr s14, [IN, #(0*8+3)*4] - vldr s16, [IN, #(0*8+4)*4] - vldr s18, [IN, #(0*8+5)*4] - vldr s20, [IN, #(0*8+6)*4] - vldr s22, [IN, #(0*8+7)*4] - vneg.f s8, s8 - vldr s9, zero - vldr s11, zero - vldr s13, zero - vldr s15, zero - vneg.f s16, s16 - vldr s17, zero - vldr s19, zero - vldr s21, zero - vldr s23, zero - vstr d4, [BUF, #(0*32+0)*4] - vstr d5, [BUF, #(1*32+0)*4] - vstr d6, [BUF, #(2*32+0)*4] - vstr d7, [BUF, #(3*32+0)*4] - vstr d8, [BUF, #(4*32+0)*4] - vstr d9, [BUF, #(5*32+0)*4] - vstr d10, [BUF, #(6*32+0)*4] - vstr d11, [BUF, #(7*32+0)*4] - add BUF, BUF, #2*4 - sub COUNT, COUNT, #1 - b 4f -3: @ sb_act was n*4+2 or n*4+3, so do the first 2 - vldr s8, [IN, #(0*8+0)*4] - vldr s10, [IN, #(0*8+1)*4] - vldr s12, [IN, #(0*8+2)*4] - vldr s14, [IN, #(0*8+3)*4] - vldr s16, [IN, #(0*8+4)*4] - vldr s18, [IN, #(0*8+5)*4] - vldr s20, [IN, #(0*8+6)*4] - vldr s22, [IN, #(0*8+7)*4] - vneg.f s8, s8 - vldr s9, [IN, #(1*8+0)*4] - vldr s11, [IN, #(1*8+1)*4] - vldr s13, [IN, #(1*8+2)*4] - vldr s15, [IN, #(1*8+3)*4] - vneg.f s16, s16 - vldr s17, [IN, #(1*8+4)*4] - vldr s19, [IN, #(1*8+5)*4] - vldr s21, [IN, #(1*8+6)*4] - vldr s23, [IN, #(1*8+7)*4] - vstr d4, [BUF, #(0*32+0)*4] - vstr d5, [BUF, #(1*32+0)*4] - vstr d6, [BUF, #(2*32+0)*4] - vstr d7, [BUF, #(3*32+0)*4] - vstr d8, [BUF, #(4*32+0)*4] - vstr d9, [BUF, #(5*32+0)*4] - vstr d10, [BUF, #(6*32+0)*4] - vstr d11, [BUF, #(7*32+0)*4] - add BUF, BUF, #2*4 - sub COUNT, COUNT, #(2 << 5) + 1 - bics lr, COUNT, #0x1F - bne 4f - @ sb_act was n*4+3 - vldr s8, [IN, #(2*8+0)*4] - vldr s10, [IN, #(2*8+1)*4] - vldr s12, [IN, #(2*8+2)*4] - vldr s14, [IN, #(2*8+3)*4] - vldr s16, [IN, #(2*8+4)*4] - 
vldr s18, [IN, #(2*8+5)*4] - vldr s20, [IN, #(2*8+6)*4] - vldr s22, [IN, #(2*8+7)*4] - vldr s9, zero - vldr s11, zero - vldr s13, zero - vldr s15, zero - vldr s17, zero - vldr s19, zero - vldr s21, zero - vldr s23, zero - vstr d4, [BUF, #(0*32+0)*4] - vstr d5, [BUF, #(1*32+0)*4] - vstr d6, [BUF, #(2*32+0)*4] - vstr d7, [BUF, #(3*32+0)*4] - vstr d8, [BUF, #(4*32+0)*4] - vstr d9, [BUF, #(5*32+0)*4] - vstr d10, [BUF, #(6*32+0)*4] - vstr d11, [BUF, #(7*32+0)*4] - add BUF, BUF, #2*4 - sub COUNT, COUNT, #1 -4: @ Now fill the remainder with 0 - vldr s8, zero - vldr s9, zero - ands COUNT, COUNT, #0x1F - beq 6f -5: vstr d4, [BUF, #(0*32+0)*4] - vstr d4, [BUF, #(1*32+0)*4] - vstr d4, [BUF, #(2*32+0)*4] - vstr d4, [BUF, #(3*32+0)*4] - vstr d4, [BUF, #(4*32+0)*4] - vstr d4, [BUF, #(5*32+0)*4] - vstr d4, [BUF, #(6*32+0)*4] - vstr d4, [BUF, #(7*32+0)*4] - add BUF, BUF, #2*4 - subs COUNT, COUNT, #1 - bne 5b -6: - fmxr FPSCR, OLDFPSCR - ldr WINDOW, [fp, #3*4] - ldr OUT, [fp, #4*4] - sub BUF, BUF, #32*4 -NOVFP ldr SCALEINT, [fp, #6*4] - mov COUNT, #8 -VFP vpush {SCALE} -VFP sub sp, sp, #3*4 -NOVFP sub sp, sp, #4*4 -7: -VFP ldr a1, [fp, #-7*4] @ imdct -NOVFP ldr a1, [fp, #-8*4] - ldmia fp, {a2-a4} -VFP stmia sp, {WINDOW, OUT, BUF} -NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT} -VFP vldr SCALE, [sp, #3*4] - bl X(ff_synth_filter_float_vfp) - add OUT, OUT, #32*4 - add BUF, BUF, #32*4 - subs COUNT, COUNT, #1 - bne 7b - -A sub sp, fp, #(8+8)*4 -T sub fp, fp, #(8+8)*4 -T mov sp, fp - vpop {s16-s23} -VFP pop {a3-a4,v1-v3,v5,fp,pc} -NOVFP pop {a4,v1-v5,fp,pc} -endfunc - - .unreq IN - .unreq SBACT - .unreq OLDFPSCR - .unreq IMDCT - .unreq WINDOW - .unreq OUT - .unreq BUF - .unreq SCALEINT - .unreq COUNT - - .unreq SCALE - - .align 2 -zero: .word 0 diff --git a/libavcodec/arm/fft_fixed_init_arm.c b/libavcodec/arm/fft_fixed_init_arm.c index 5132b09..11226d6 100644 --- a/libavcodec/arm/fft_fixed_init_arm.c +++ b/libavcodec/arm/fft_fixed_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans 
Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -26,6 +26,8 @@ #include "libavcodec/fft.h" void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z); +void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i); +void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i); av_cold void ff_fft_fixed_init_arm(FFTContext *s) { @@ -33,6 +35,16 @@ av_cold void ff_fft_fixed_init_arm(FFTContext *s) if (have_neon(cpu_flags)) { s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; +#if CONFIG_FFT s->fft_calc = ff_fft_fixed_calc_neon; +#endif + +#if CONFIG_MDCT + if (!s->inverse && s->nbits >= 3) { + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; + s->mdct_calc = ff_mdct_fixed_calc_neon; + s->mdct_calcw = ff_mdct_fixed_calcw_neon; + } +#endif } } diff --git a/libavcodec/arm/fft_fixed_neon.S b/libavcodec/arm/fft_fixed_neon.S index c70a189..2651607 100644 --- a/libavcodec/arm/fft_fixed_neon.S +++ b/libavcodec/arm/fft_fixed_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file 
is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c index 4d047ea..331bd65 100644 --- a/libavcodec/arm/fft_init_arm.c +++ b/libavcodec/arm/fft_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -29,16 +29,33 @@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); +void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); + +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); + av_cold void ff_fft_init_arm(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); if (have_vfp_vm(cpu_flags)) { s->fft_calc = ff_fft_calc_vfp; +#if CONFIG_MDCT + s->imdct_half = ff_imdct_half_vfp; +#endif } if (have_neon(cpu_flags)) { +#if CONFIG_FFT s->fft_permute = ff_fft_permute_neon; s->fft_calc = ff_fft_calc_neon; +#endif +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; +#endif } } diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S index b161015..48f8dfc 100644 --- a/libavcodec/arm/fft_neon.S +++ b/libavcodec/arm/fft_neon.S @@ -7,20 +7,20 @@ * This algorithm (though not any of the implementation details) is * based on libdjbfft by D. J. Bernstein. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S index c2801fa..ac60132 100644 --- a/libavcodec/arm/fft_vfp.S +++ b/libavcodec/arm/fft_vfp.S @@ -2,20 +2,20 @@ * Copyright (c) 2013 RISC OS Open Ltd * Author: Ben Avison <bavison@riscosopen.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/flacdsp_arm.S b/libavcodec/arm/flacdsp_arm.S index d4441da..f8861c5 100644 --- a/libavcodec/arm/flacdsp_arm.S +++ b/libavcodec/arm/flacdsp_arm.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/flacdsp_init_arm.c b/libavcodec/arm/flacdsp_init_arm.c index 0530cf7..564e3dc 100644 --- a/libavcodec/arm/flacdsp_init_arm.c +++ b/libavcodec/arm/flacdsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -24,9 +24,9 @@ void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order, int qlevel, int len); -av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, +av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps) { - if (bps <= 16) - c->lpc = ff_flac_lpc_16_arm; + if (CONFIG_FLAC_DECODER) + c->lpc16 = ff_flac_lpc_16_arm; } diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c index 11396e8..a734dec 100644 --- a/libavcodec/arm/fmtconvert_init_arm.c +++ b/libavcodec/arm/fmtconvert_init_arm.c @@ -1,20 +1,20 @@ /* * ARM optimized Format Conversion Utils * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S index 5d48e3d..738953e 100644 --- a/libavcodec/arm/fmtconvert_neon.S +++ b/libavcodec/arm/fmtconvert_neon.S @@ -3,20 +3,20 @@ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>b * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S index 4e43f42..b14af45 100644 --- a/libavcodec/arm/fmtconvert_vfp.S +++ b/libavcodec/arm/fmtconvert_vfp.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/g722dsp_init_arm.c b/libavcodec/arm/g722dsp_init_arm.c index 5edf619..c0e5d8b 100644 --- a/libavcodec/arm/g722dsp_init_arm.c +++ b/libavcodec/arm/g722dsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/g722dsp_neon.S b/libavcodec/arm/g722dsp_neon.S index 5fa3c27..757e53f 100644 --- a/libavcodec/arm/g722dsp_neon.S +++ b/libavcodec/arm/g722dsp_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions for G722 coding * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c index 0e84362..aae804b 100644 --- a/libavcodec/arm/h264chroma_init_arm.c +++ b/libavcodec/arm/h264chroma_init_arm.c @@ -2,20 +2,20 @@ * ARM NEON optimised H.264 chroma functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S index 779dc0b..5a4159e 100644 --- a/libavcodec/arm/h264cmc_neon.S +++ b/libavcodec/arm/h264cmc_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -455,7 +455,7 @@ endconst h264_chroma_mc4 avg, rv40 #endif -#if CONFIG_VC1_DECODER +#if CONFIG_VC1DSP h264_chroma_mc8 put, vc1 h264_chroma_mc8 avg, vc1 h264_chroma_mc4 put, vc1 diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index 7afd350..90144d0 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -72,11 +72,14 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { +#if HAVE_NEON if (bit_depth == 8) { c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + if(chroma_format_idc == 1){ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + } c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; @@ -96,6 +99,7 @@ static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth, c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; c->h264_idct8_add4 = ff_h264_idct8_add4_neon; } +#endif // HAVE_NEON } av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, @@ -103,8 +107,10 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, { int cpu_flags = av_get_cpu_flags(); +#if HAVE_ARMV6 if (have_setend(cpu_flags)) c->startcode_find_candidate = ff_startcode_find_candidate_armv6; +#endif if (have_neon(cpu_flags)) h264dsp_init_neon(c, bit_depth, chroma_format_idc); } diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S index 5e75565..274a547 100644 --- a/libavcodec/arm/h264dsp_neon.S +++ b/libavcodec/arm/h264dsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S index f588f3e..4f68bdb 100644 --- a/libavcodec/arm/h264idct_neon.S +++ b/libavcodec/arm/h264idct_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c index a445d4d..cc324d7 100644 --- a/libavcodec/arm/h264pred_init_arm.c +++ b/libavcodec/arm/h264pred_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -49,6 +49,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc) { +#if HAVE_NEON const int high_depth = bit_depth > 8; if (high_depth) @@ -81,6 +82,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; +#endif // HAVE_NEON } av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, diff --git a/libavcodec/arm/h264pred_neon.S b/libavcodec/arm/h264pred_neon.S index 332f94b..4dc47ba 100644 --- a/libavcodec/arm/h264pred_neon.S +++ b/libavcodec/arm/h264pred_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/h264qpel_init_arm.c b/libavcodec/arm/h264qpel_init_arm.c index 01615b5..71237be 100644 --- a/libavcodec/arm/h264qpel_init_arm.c +++ b/libavcodec/arm/h264qpel_init_arm.c @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/h264qpel_neon.S b/libavcodec/arm/h264qpel_neon.S index 6c51250..21336c6 100644 --- a/libavcodec/arm/h264qpel_neon.S +++ b/libavcodec/arm/h264qpel_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/hevcdsp_arm.h b/libavcodec/arm/hevcdsp_arm.h new file mode 100644 index 0000000..47cdfa5 --- /dev/null +++ b/libavcodec/arm/hevcdsp_arm.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_HEVCDSP_ARM_H +#define AVCODEC_ARM_HEVCDSP_ARM_H + +#include "libavcodec/hevcdsp.h" + +void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth); + +#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S new file mode 100644 index 0000000..166bddb --- /dev/null +++ b/libavcodec/arm/hevcdsp_deblock_neon.S @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#include "libavutil/arm/asm.S" +#include "neon.S" + +.macro hevc_loop_filter_chroma_start + ldr r12, [r2] + ldr r3, [r2, #4] + add r2, r3, r12 + cmp r2, #0 + it eq + bxeq lr +.endm + +.macro hevc_loop_filter_chroma_body + vsubl.u8 q3, d4, d2 + vsubl.u8 q11, d18, d19 + vshl.i16 q3, #2 + vadd.i16 q11, q3 + vdup.16 d0, r12 + vdup.16 d1, r3 + vrshr.s16 q11, q11, #3 + vneg.s16 q12, q0 + vmovl.u8 q2, d4 + vmin.s16 q11, q11, q0 + vmax.s16 q11, q11, q12 + vaddw.u8 q1, q11, d2 + vsub.i16 q2, q11 + vqmovun.s16 d2, q1 + vqmovun.s16 d4, q2 +.endm + +.macro hevc_loop_filter_luma_start + ldr r12, [r3] + ldr r3, [r3, #4] + lsl r3, #16 + orr r3, r12 + cmp r3, #0 + it eq + bxeq lr + lsr r3, #16 +.endm + +.macro hevc_loop_filter_luma_body + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + vmovl.u8 q14, d28 + vmovl.u8 q15, d30 + + vadd.i16 q7, q9, q11 + vadd.i16 q6, q14, q12 + vsub.i16 q7, q10 + vsub.i16 q6, q13 + vabd.s16 q7, q7, q10 + vabd.s16 q6, q6, q13 + + + vdup.16 q0, r2 + vmov q4, q7 + vmov q5, q6 + vdup.16 d4, r12 + vtrn.16 q7, q4 + vtrn.16 q6, q5 + + vshl.u64 q7, #32 + vshr.u64 q4, #32 + vshl.u64 q6, #32 + vshr.u64 q5, #32 + vshr.u64 q7, #32 + vshr.u64 q6, #32 + vshl.u64 q5, #32 + vshl.u64 q4, #32 + vorr q6, q5 + vorr q7, q4 + vdup.16 d5, r3 + vadd.i16 q5, q7, q6 + + vmov q4, q5 + vmov q3, q5 + vtrn.32 q3, q4 + + vadd.i16 q4, q3 + + vshl.s16 q5, q5, #1 + vcgt.s16 q3, q0, q4 + + vmovn.i16 d6, q3 + vshr.s16 q1, q0, #2 + vmovn.i16 d6, q3 + vcgt.s16 q5, q1, q5 + vmov r7, s12 + cmp r7, #0 + beq bypasswrite + + vpadd.i32 d0, d14, d12 + vpadd.i32 d1, d15, d13 + vmov q4, q2 + vshl.s16 q2, #2 + vshr.s16 q1, q1, #1 + vrhadd.s16 q2, q4 + + vabd.s16 q7, q8, q11 + vaba.s16 q7, q15, q12 + + 
vmovn.i32 d0, q0 + vmov r5, r6, s0, s1 + vcgt.s16 q6, q1, q7 + vand q5, q5, q6 + vabd.s16 q7, q11, q12 + vcgt.s16 q6, q2, q7 + vand q5, q5, q6 + + vmov q2, q5 + vtrn.s16 q5, q2 + vshr.u64 q2, #32 + vshl.u64 q5, #32 + vshl.u64 q2, #32 + vshr.u64 q5, #32 + vorr q5, q2 + + vmov q2, q5 + vshl.i16 q7, q4, #1 + vtrn.32 q2, q5 + vand q5, q2 + vneg.s16 q6, q7 + vmovn.i16 d4, q5 + vmovn.i16 d4, q2 + vmov r8, s8 + + and r9, r8, r7 + cmp r9, #0 + beq weakfilter_\@ + + vadd.i16 q2, q11, q12 + vadd.i16 q4, q9, q8 + vadd.i16 q1, q2, q10 + vdup.16 d10, r9 + vadd.i16 q0, q1, q9 + vshl.i16 q4, #1 + lsr r9, #16 + vadd.i16 q1, q0 + vrshr.s16 q3, q0, #2 + vadd.i16 q1, q13 + vadd.i16 q4, q0 + vsub.i16 q3, q10 + vrshr.s16 q1, #3 + vrshr.s16 q4, #3 + vmax.s16 q3, q6 + vsub.i16 q1, q11 + vsub.i16 q4, q9 + vmin.s16 q3, q7 + vmax.s16 q4, q6 + vmax.s16 q1, q6 + vadd.i16 q3, q10 + vmin.s16 q4, q7 + vmin.s16 q1, q7 + vdup.16 d11, r9 + vadd.i16 q4, q9 + vadd.i16 q1, q11 + vbit q9, q4, q5 + vadd.i16 q4, q2, q13 + vbit q11, q1, q5 + vadd.i16 q0, q4, q14 + vadd.i16 q2, q15, q14 + vadd.i16 q4, q0 + + vshl.i16 q2, #1 + vadd.i16 q4, q10 + vbit q10, q3, q5 + vrshr.s16 q4, #3 + vadd.i16 q2, q0 + vrshr.s16 q3, q0, #2 + vsub.i16 q4, q12 + vrshr.s16 q2, #3 + vsub.i16 q3, q13 + vmax.s16 q4, q6 + vsub.i16 q2, q14 + vmax.s16 q3, q6 + vmin.s16 q4, q7 + vmax.s16 q2, q6 + vmin.s16 q3, q7 + vadd.i16 q4, q12 + vmin.s16 q2, q7 + vadd.i16 q3, q13 + vbit q12, q4, q5 + vadd.i16 q2, q14 + vbit q13, q3, q5 + vbit q14, q2, q5 + +weakfilter_\@: + mvn r8, r8 + and r9, r8, r7 + cmp r9, #0 + beq ready_\@ + + vdup.16 q4, r2 + + vdup.16 d10, r9 + lsr r9, #16 + vmov q1, q4 + vdup.16 d11, r9 + vshr.s16 q1, #1 + vsub.i16 q2, q12, q11 + vadd.i16 q4, q1 + vshl.s16 q0, q2, #3 + vshr.s16 q4, #3 + vadd.i16 q2, q0 + vsub.i16 q0, q13, q10 + vsub.i16 q2, q0 + vshl.i16 q0, q0, #1 + vsub.i16 q2, q0 + vshl.s16 q1, q7, 2 + vrshr.s16 q2, q2, #4 + vadd.i16 q1, q7 + vabs.s16 q3, q2 + vshr.s16 q6, q6, #1 + vcgt.s16 q1, q1, q3 + vand q5, q1 + 
vshr.s16 q7, q7, #1 + vmax.s16 q2, q2, q6 + vmin.s16 q2, q2, q7 + + vshr.s16 q7, q7, #1 + vrhadd.s16 q3, q9, q11 + vneg.s16 q6, q7 + vsub.s16 q3, q10 + vdup.16 d2, r5 + vhadd.s16 q3, q2 + vdup.16 d3, r6 + vmax.s16 q3, q3, q6 + vcgt.s16 q1, q4, q1 + vmin.s16 q3, q3, q7 + vand q1, q5 + vadd.i16 q3, q10 + lsr r5, #16 + lsr r6, #16 + vbit q10, q3, q1 + + vrhadd.s16 q3, q14, q12 + vdup.16 d2, r5 + vsub.s16 q3, q13 + vdup.16 d3, r6 + vhsub.s16 q3, q2 + vcgt.s16 q1, q4, q1 + vmax.s16 q3, q3, q6 + vand q1, q5 + vmin.s16 q3, q3, q7 + vadd.i16 q3, q13 + vbit q13, q3, q1 + vadd.i16 q0, q11, q2 + vsub.i16 q4, q12, q2 + vbit q11, q0, q5 + vbit q12, q4, q5 + +ready_\@: + vqmovun.s16 d16, q8 + vqmovun.s16 d18, q9 + vqmovun.s16 d20, q10 + vqmovun.s16 d22, q11 + vqmovun.s16 d24, q12 + vqmovun.s16 d26, q13 + vqmovun.s16 d28, q14 + vqmovun.s16 d30, q15 +.endm + +function ff_hevc_v_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start + push {r5-r11} + vpush {d8-d15} + sub r0, #4 + vld1.8 {d16}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d22}, [r0], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d28}, [r0], r1 + vld1.8 {d30}, [r0], r1 + sub r0, r0, r1, lsl #3 + transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 + hevc_loop_filter_luma_body + transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 + vst1.8 {d16}, [r0], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d20}, [r0], r1 + vst1.8 {d22}, [r0], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d28}, [r0], r1 + vst1.8 {d30}, [r0] + vpop {d8-d15} + pop {r5-r11} + bx lr +endfunc + +function ff_hevc_h_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start + push {r5-r11} + vpush {d8-d15} + sub r0, r0, r1, lsl #2 + vld1.8 {d16}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d22}, [r0], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d28}, [r0], r1 + vld1.8 {d30}, [r0], r1 + sub r0, r0, r1, lsl #3 + add r0, r1 + hevc_loop_filter_luma_body 
+ vst1.8 {d18}, [r0], r1 + vst1.8 {d20}, [r0], r1 + vst1.8 {d22}, [r0], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d28}, [r0] +bypasswrite: + vpop {d8-d15} + pop {r5-r11} + bx lr +endfunc + +function ff_hevc_v_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, #4 + vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d4}, [r0], r1 + vld1.8 {d19}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d21}, [r0], r1 + sub r0, r0, r1, lsl #3 + transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 + hevc_loop_filter_chroma_body + transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r0], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d2}, [r0], r1 + vst1.8 {d4}, [r0], r1 + vst1.8 {d19}, [r0], r1 + vst1.8 {d20}, [r0], r1 + vst1.8 {d21}, [r0] + bx lr +endfunc + +function ff_hevc_h_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d4}, [r0], r1 + vld1.8 {d19}, [r0] + sub r0, r0, r1, lsl #1 + hevc_loop_filter_chroma_body + vst1.8 {d2}, [r0], r1 + vst1.8 {d4}, [r0] + bx lr +endfunc diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevcdsp_idct_neon.S index eeb81e3..83ac0c1 100644 --- a/libavcodec/arm/hevc_idct.S +++ b/libavcodec/arm/hevcdsp_idct_neon.S @@ -1,23 +1,22 @@ /* * ARM NEON optimised IDCT functions for HEVC decoding - * * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi> * Copyright (c) 2017 Alexandra Hájková * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -226,6 +225,65 @@ function ff_hevc_add_residual_32x32_10_neon, export=1 bx lr endfunc +/* uses registers q2 - q9 for temp values */ +/* TODO: reorder */ +.macro tr4_luma_shift r0, r1, r2, r3, shift + vaddl.s16 q5, \r0, \r2 // c0 = src0 + src2 + vaddl.s16 q2, \r2, \r3 // c1 = src2 + src3 + vsubl.s16 q4, \r0, \r3 // c2 = src0 - src3 + vmull.s16 q6, \r1, d0[0] // c3 = 74 * src1 + + vaddl.s16 q7, \r0, \r3 // src0 + src3 + vsubw.s16 q7, q7, \r2 // src0 - src2 + src3 + vmul.s32 q7, q7, d0[0] // dst2 = 74 * (src0 - src2 + src3) + + vmul.s32 q8, q5, d0[1] // 29 * c0 + vmul.s32 q9, q2, d1[0] // 55 * c1 + vadd.s32 q8, q9 // 29 * c0 + 55 * c1 + vadd.s32 q8, q6 // dst0 = 29 * c0 + 55 * c1 + c3 + + vmul.s32 q2, q2, d0[1] // 29 * c1 + vmul.s32 q9, q4, d1[0] // 55 * c2 + vsub.s32 q9, q2 // 55 * c2 - 29 * c1 + vadd.s32 q9, q6 // dst1 = 55 * c2 - 29 * c1 + c3 + + vmul.s32 q5, q5, d1[0] // 55 * c0 + vmul.s32 q4, q4, d0[1] // 29 * c2 + vadd.s32 q5, q4 // 55 * c0 + 29 * c2 + vsub.s32 q5, q6 // dst3 = 55 * c0 + 29 * c2 - c3 + + vqrshrn.s32 \r0, q8, \shift + vqrshrn.s32 \r1, q9, \shift + vqrshrn.s32 \r2, q7, \shift + vqrshrn.s32 \r3, q5, \shift +.endm + +function ff_hevc_transform_luma_4x4_neon_8, export=1 + vpush {d8-d15} + vld1.16 {q14, q15}, [r0] // coeffs + ldr r3, =0x4a // 74 + vmov.32 d0[0], r3 + ldr r3, =0x1d // 29 + vmov.32 d0[1], r3 + ldr r3, =0x37 // 55 + vmov.32 d1[0], r3 + + tr4_luma_shift d28, 
d29, d30, d31, #7 + + vtrn.16 d28, d29 + vtrn.16 d30, d31 + vtrn.32 q14, q15 + + tr4_luma_shift d28, d29, d30, d31, #12 + + vtrn.16 d28, d29 + vtrn.16 d30, d31 + vtrn.32 q14, q15 + vst1.16 {q14, q15}, [r0] + vpop {d8-d15} + bx lr +endfunc + .macro idct_4x4_dc bitdepth function ff_hevc_idct_4x4_dc_\bitdepth\()_neon, export=1 ldrsh r1, [r0] diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c index e3d4e4e..e8fa1f7 100644 --- a/libavcodec/arm/hevcdsp_init_arm.c +++ b/libavcodec/arm/hevcdsp_init_arm.c @@ -1,21 +1,20 @@ /* - * ARM NEON optimised HEVC IDCT - * Copyright (c) 2017 Alexandra Hájková + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -24,75 +23,12 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" +#include "hevcdsp_arm.h" - -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); - -void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs); -void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs); -void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs); -void ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs); -void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs); -void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs); -void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs); -void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs); - -void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit); -void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit); -void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit); -void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit); -void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit); -void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit); - -av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, 
int bit_depth) +av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth) { int cpu_flags = av_get_cpu_flags(); - if (have_neon(cpu_flags)) { - if (bit_depth == 8) { - c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon; - c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon; - c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon; - c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon; - - c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon; - c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon; - c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; - c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; - - c->idct[0] = ff_hevc_idct_4x4_8_neon; - c->idct[1] = ff_hevc_idct_8x8_8_neon; - c->idct[2] = ff_hevc_idct_16x16_8_neon; - } - if (bit_depth == 10) { - c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; - c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon; - c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon; - c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon; - - c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_neon; - c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon; - c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon; - c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; - - c->idct[0] = ff_hevc_idct_4x4_10_neon; - c->idct[1] = ff_hevc_idct_8x8_10_neon; - c->idct[2] = ff_hevc_idct_16x16_10_neon; - } - } + if (have_neon(cpu_flags)) + ff_hevc_dsp_init_neon(c, bit_depth); } diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c new file mode 100644 index 0000000..3320baf --- /dev/null +++ b/libavcodec/arm/hevcdsp_init_neon.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/hevcdsp.h" +#include "hevcdsp_arm.h" + +void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs); +void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs); +void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs); +void 
ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs); +void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs); +void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs); +void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs); +void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs); +void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit); +void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit); +void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit); +void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit); +void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit); +void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit); +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); + +#define PUT_PIXELS(name) \ + void name(int16_t *dst, uint8_t *src, \ + ptrdiff_t srcstride, int height, \ + intptr_t mx, intptr_t my, int width) +PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); +PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); +#undef PUT_PIXELS + +static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, + int height, int width); +static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int16_t* src2, ptrdiff_t src2stride); +void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, 
uint8_t *src, ptrdiff_t srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width); +#define QPEL_FUNC(name) \ + void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \ + int height, int width) + +QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8); +QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8); +#undef QPEL_FUNC + +#define QPEL_FUNC_UW_PIX(name) \ + void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \ + int height, intptr_t mx, intptr_t my, int width); +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8); +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8); +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8); +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8); +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8); +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8); +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8); +#undef QPEL_FUNC_UW_PIX + +#define QPEL_FUNC_UW(name) \ + void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \ + int width, int height, int16_t* src2, ptrdiff_t src2stride); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8); 
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8); +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8); +#undef QPEL_FUNC_UW + +void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width) { + + put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width); +} + +void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width) { + + put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0); +} + +void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) { + put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); +} + +av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth) +{ + if (bit_depth == 8) { + int x; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; + c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon; + c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon; + c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon; + c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; + c->idct_dc[3] = 
ff_hevc_idct_32x32_dc_8_neon; + c->idct[0] = ff_hevc_idct_4x4_8_neon; + c->idct[1] = ff_hevc_idct_8x8_8_neon; + c->idct[2] = ff_hevc_idct_16x16_8_neon; + c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; + put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; + put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; + put_hevc_qpel_neon[0][1] = ff_hevc_put_qpel_h1_neon_8; + put_hevc_qpel_neon[0][2] = ff_hevc_put_qpel_h2_neon_8; + put_hevc_qpel_neon[0][3] = ff_hevc_put_qpel_h3_neon_8; + put_hevc_qpel_neon[1][1] = ff_hevc_put_qpel_h1v1_neon_8; + put_hevc_qpel_neon[1][2] = ff_hevc_put_qpel_h2v1_neon_8; + put_hevc_qpel_neon[1][3] = ff_hevc_put_qpel_h3v1_neon_8; + put_hevc_qpel_neon[2][1] = ff_hevc_put_qpel_h1v2_neon_8; + put_hevc_qpel_neon[2][2] = ff_hevc_put_qpel_h2v2_neon_8; + put_hevc_qpel_neon[2][3] = ff_hevc_put_qpel_h3v2_neon_8; + put_hevc_qpel_neon[3][1] = ff_hevc_put_qpel_h1v3_neon_8; + put_hevc_qpel_neon[3][2] = ff_hevc_put_qpel_h2v3_neon_8; + put_hevc_qpel_neon[3][3] = ff_hevc_put_qpel_h3v3_neon_8; + put_hevc_qpel_uw_neon[1][0] = ff_hevc_put_qpel_uw_v1_neon_8; + put_hevc_qpel_uw_neon[2][0] = ff_hevc_put_qpel_uw_v2_neon_8; + put_hevc_qpel_uw_neon[3][0] = ff_hevc_put_qpel_uw_v3_neon_8; + put_hevc_qpel_uw_neon[0][1] = ff_hevc_put_qpel_uw_h1_neon_8; + put_hevc_qpel_uw_neon[0][2] = ff_hevc_put_qpel_uw_h2_neon_8; + put_hevc_qpel_uw_neon[0][3] = ff_hevc_put_qpel_uw_h3_neon_8; + put_hevc_qpel_uw_neon[1][1] = ff_hevc_put_qpel_uw_h1v1_neon_8; + put_hevc_qpel_uw_neon[1][2] = ff_hevc_put_qpel_uw_h2v1_neon_8; + put_hevc_qpel_uw_neon[1][3] = ff_hevc_put_qpel_uw_h3v1_neon_8; + put_hevc_qpel_uw_neon[2][1] = ff_hevc_put_qpel_uw_h1v2_neon_8; + put_hevc_qpel_uw_neon[2][2] = ff_hevc_put_qpel_uw_h2v2_neon_8; + put_hevc_qpel_uw_neon[2][3] = ff_hevc_put_qpel_uw_h3v2_neon_8; + put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8; + put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8; + 
put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8; + for (x = 0; x < 10; x++) { + c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper; + c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper; + c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper; + c->put_hevc_qpel_uni[x][1][0] = ff_hevc_put_qpel_uni_neon_wrapper; + c->put_hevc_qpel_uni[x][0][1] = ff_hevc_put_qpel_uni_neon_wrapper; + c->put_hevc_qpel_uni[x][1][1] = ff_hevc_put_qpel_uni_neon_wrapper; + c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; + c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; + c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; + } + c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; + c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; + c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; + c->put_hevc_qpel[3][0][0] = ff_hevc_put_pixels_w8_neon_8; + c->put_hevc_qpel[4][0][0] = ff_hevc_put_pixels_w12_neon_8; + c->put_hevc_qpel[5][0][0] = ff_hevc_put_pixels_w16_neon_8; + c->put_hevc_qpel[6][0][0] = ff_hevc_put_pixels_w24_neon_8; + c->put_hevc_qpel[7][0][0] = ff_hevc_put_pixels_w32_neon_8; + c->put_hevc_qpel[8][0][0] = ff_hevc_put_pixels_w48_neon_8; + c->put_hevc_qpel[9][0][0] = ff_hevc_put_pixels_w64_neon_8; + + c->put_hevc_qpel_uni[1][0][0] = ff_hevc_put_qpel_uw_pixels_w4_neon_8; + c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_qpel_uw_pixels_w8_neon_8; + c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_qpel_uw_pixels_w16_neon_8; + c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_qpel_uw_pixels_w24_neon_8; + c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8; + c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; + c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; + } + + if (bit_depth == 10) { + c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; + c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon; + c->add_residual[2] = 
ff_hevc_add_residual_16x16_10_neon; + c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon; + + c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_neon; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; + + c->idct[0] = ff_hevc_idct_4x4_10_neon; + c->idct[1] = ff_hevc_idct_8x8_10_neon; + c->idct[2] = ff_hevc_idct_16x16_10_neon; + } +} diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S new file mode 100644 index 0000000..86f92cf --- /dev/null +++ b/libavcodec/arm/hevcdsp_qpel_neon.S @@ -0,0 +1,999 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +#define MAX_PB_SIZE #64 + +.macro regshuffle_d8 + vmov d16, d17 + vmov d17, d18 + vmov d18, d19 + vmov d19, d20 + vmov d20, d21 + vmov d21, d22 + vmov d22, d23 +.endm + +.macro regshuffle_q8 + vmov q0, q1 + vmov q1, q2 + vmov q2, q3 + vmov q3, q4 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 +.endm + +.macro vextin8 + pld [r2] + vld1.8 {q11}, [r2], r3 + vext.8 d16, d22, d23, #1 + vext.8 d17, d22, d23, #2 + vext.8 d18, d22, d23, #3 + vext.8 d19, d22, d23, #4 + vext.8 d20, d22, d23, #5 + vext.8 d21, d22, d23, #6 + vext.8 d22, d22, d23, #7 +.endm + +.macro loadin8 + pld [r2] + vld1.8 {d16}, [r2], r3 + pld [r2] + vld1.8 {d17}, [r2], r3 + pld [r2] + vld1.8 {d18}, [r2], r3 + pld [r2] + vld1.8 {d19}, [r2], r3 + pld [r2] + vld1.8 {d20}, [r2], r3 + pld [r2] + vld1.8 {d21}, [r2], r3 + pld [r2] + vld1.8 {d22}, [r2], r3 + pld [r2] + vld1.8 {d23}, [r2], r3 +.endm + +.macro qpel_filter_1_32b + vmov.i16 d16, #58 + vmov.i16 d17, #10 + vmull.s16 q9, d6, d16 // 58 * d0 + vmull.s16 q10, d7, d16 // 58 * d1 + vmov.i16 d16, #17 + vmull.s16 q11, d4, d17 // 10 * c0 + vmull.s16 q12, d5, d17 // 10 * c1 + vmov.i16 d17, #5 + vmull.s16 q13, d8, d16 // 17 * e0 + vmull.s16 q14, d9, d16 // 17 * e1 + vmull.s16 q15, d10, d17 // 5 * f0 + vmull.s16 q8, d11, d17 // 5 * f1 + vsub.s32 q9, q11 // 58 * d0 - 10 * c0 + vsub.s32 q10, q12 // 58 * d1 - 10 * c1 + vshll.s16 q11, d2, #2 // 4 * b0 + vshll.s16 q12, d3, #2 // 4 * b1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + vsubl.s16 q13, d12, d0 // g0 - a0 + vsubl.s16 q14, d13, d1 // g1 - a1 + vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + vsub.s32 q13, q15 // g0 
- a0 - 5 * f0 + vsub.s32 q14, q8 // g1 - a1 - 5 * f1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +// input q0 - q7 +// output q8 +.macro qpel_filter_2_32b + vmov.i32 q8, #11 + vaddl.s16 q9, d6, d8 // d0 + e0 + vaddl.s16 q10, d7, d9 // d1 + e1 + vaddl.s16 q11, d4, d10 // c0 + f0 + vaddl.s16 q12, d5, d11 // c1 + f1 + vmul.s32 q11, q8 // 11 * (c0 + f0) + vmul.s32 q12, q8 // 11 * (c1 + f1) + vmov.i32 q8, #40 + vaddl.s16 q15, d2, d12 // b0 + g0 + vmul.s32 q9, q8 // 40 * (d0 + e0) + vmul.s32 q10, q8 // 40 * (d1 + e1) + vaddl.s16 q8, d3, d13 // b1 + g1 + vaddl.s16 q13, d0, d14 // a0 + h0 + vaddl.s16 q14, d1, d15 // a1 + h1 + vshl.s32 q15, #2 // 4*(b0+g0) + vshl.s32 q8, #2 // 4*(b1+g1) + vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0 + vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1 + vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0) + vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1) + vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0) + vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1) + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +.macro qpel_filter_3_32b + vmov.i16 d16, #58 + vmov.i16 d17, #10 + vmull.s16 q9, d8, d16 // 58 * d0 + vmull.s16 q10, d9, d16 // 58 * d1 + vmov.i16 d16, #17 + vmull.s16 q11, d10, d17 // 10 * c0 + vmull.s16 q12, d11, d17 // 10 * c1 + vmov.i16 d17, #5 + vmull.s16 q13, d6, d16 // 17 * e0 + vmull.s16 q14, d7, d16 // 17 * e1 + vmull.s16 q15, d4, d17 // 5 * f0 + vmull.s16 q8, d5, d17 // 5 * f1 + vsub.s32 q9, q11 // 58 * d0 - 10 * c0 + vsub.s32 q10, q12 // 58 * d1 - 10 * c1 + vshll.s16 q11, d12, #2 // 4 * b0 + vshll.s16 q12, d13, #2 // 4 * b1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + vsubl.s16 q13, d2, d14 // g0 - a0 + vsubl.s16 q14, d3, d15 // g1 - a1 + vadd.s32 
q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + vsub.s32 q13, q15 // g0 - a0 - 5 * f0 + vsub.s32 q14, q8 // g1 - a1 - 5 * f1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +.macro qpel_filter_1 out=q7 + vmov.u8 d24, #58 + vmov.u8 d25, #10 + vshll.u8 q13, d20, #4 // 16*e + vshll.u8 q14, d21, #2 // 4*f + vmull.u8 \out, d19, d24 // 58*d + vaddw.u8 q13, q13, d20 // 17*e + vmull.u8 q15, d18, d25 // 10*c + vaddw.u8 q14, q14, d21 // 5*f + vsubl.u8 q12, d22, d16 // g - a + vadd.u16 \out, q13 // 58d + 17e + vshll.u8 q13, d17, #2 // 4*b + vadd.u16 q15, q14 // 10*c + 5*f + vadd.s16 q13, q12 // - a + 4*b + g + vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f + vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f +.endm + +.macro qpel_filter_2 out=q7 + vmov.i16 q12, #10 + vmov.i16 q14, #11 + vaddl.u8 q13, d19, d20 // d + e + vaddl.u8 q15, d18, d21 // c + f + vmul.u16 q13, q12 // 10 * (d+e) + vmul.u16 q15, q14 // 11 * ( c + f) + vaddl.u8 \out, d17, d22 // b + g + vaddl.u8 q12, d16, d23 // a + h + vadd.u16 \out, q13 // b + 10 * (d + e) + g + vadd.s16 q12, q15 + vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) + g) + vsub.s16 \out, q12 +.endm + +.macro qpel_filter_3 out=q7 + vmov.u8 d24, #58 + vmov.u8 d25, #10 + vshll.u8 q13, d19, #4 // 16*e + vshll.u8 q14, d18, #2 // 4*f + vmull.u8 \out, d20, d24 // 58*d + vaddw.u8 q13, q13, d19 // 17*e + vmull.u8 q15, d21, d25 // 10*c + vaddw.u8 q14, q14, d18 // 5*f + vsubl.u8 q12, d17, d23 // g - a + vadd.u16 \out, q13 // 58d + 17e + vshll.u8 q13, d22, #2 // 4*b + vadd.u16 q15, q14 // 10*c + 5*f + vadd.s16 q13, q12 // - a + 4*b + g + vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f + vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f +.endm + +.macro hevc_put_qpel_vX_neon_8 filter + push {r4, r5, r6, r7} + ldr r4, [sp, 
#16] // height + ldr r5, [sp, #20] // width + vpush {d8-d15} + sub r2, r2, r3, lsl #1 + sub r2, r3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + lsl r1, #1 +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + vst1.16 {q7}, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vst1.16 d14, [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_vX_neon_8 filter + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, r2, r3, lsl #1 + sub r2, r3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.8 d0, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.32 d0[0], [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b + b 99f +.Lbi\@: lsl r9, #1 + mov r10, r8 +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q7 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r10, #16 + mov r8, r10 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d14 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + +function ff_hevc_put_qpel_v1_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_v2_neon_8, export=1 + 
hevc_put_qpel_vX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_v3_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_3 +endfunc + + +function ff_hevc_put_qpel_uw_v1_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_uw_v2_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_uw_v3_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_3 +endfunc + +.macro hevc_put_qpel_hX_neon_8 filter + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + + vpush {d8-d15} + sub r2, #4 + lsl r1, #1 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r5, #4 + beq 4f +8: subs r4, #1 + vextin8 + \filter + vst1.16 {q7}, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vst1.16 d14, [r0], r1 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_hX_neon_8 filter + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, #4 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ + cmp r5, #4 + beq 4f +8: subs r4, #1 + vextin8 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.8 d0, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.32 d0[0], [r0], r1 + bne 4b + b 99f +.Lbi\@: + lsl r9, #1 + cmp r5, #4 + beq 4f + mov r10, r8 +8: subs r4, #1 + vextin8 + \filter + vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q7 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + add r10, #16 + mov r8, r10 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vld1.16 d0, [r8], 
r9 + vqadd.s16 d0, d14 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + +function ff_hevc_put_qpel_h1_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_h2_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_h3_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_3 +endfunc + + +function ff_hevc_put_qpel_uw_h1_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_uw_h2_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_uw_h3_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_3 +endfunc + +.macro hevc_put_qpel_hXvY_neon_8 filterh filterv + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + + vpush {d8-d15} + sub r2, #4 + sub r2, r2, r3, lsl #1 + sub r2, r3 // extra_before 3 + lsl r1, #1 + mov r12, r4 + mov r6, r0 + mov r7, r2 +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vst1.16 {q8}, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vst1.16 d16, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, #4 + sub r2, r2, r3, lsl #1 + sub r2, r3 // extra_before 3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + 
\filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vqrshrun.s16 d0, q8, #6 + vst1.8 d0, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vqrshrun.s16 d0, q8, #6 + vst1.32 d0[0], [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b + b 99f +.Lbi\@: lsl r9, #1 + mov r10, r8 +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q8 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r10, #16 + mov r8, r10 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d16 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + + +function ff_hevc_put_qpel_h1v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h2v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h3v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h1v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h2v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h3v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + 
+function ff_hevc_put_qpel_h1v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_h2v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_h3v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b +endfunc + + +function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b +endfunc + +.macro init_put_pixels + pld [r1] + pld [r1, r2] + mov r12, MAX_PB_SIZE + lsl r12, #1 +.endm + +function ff_hevc_put_pixels_w2_neon_8, export=1 + init_put_pixels + vmov.u8 d5, #255 + vshr.u64 d5, #32 +0: subs r3, #1 + vld1.32 {d0[0]}, [r1], r2 + pld [r1] + vld1.32 d6, [r0] + vshll.u8 q0, d0, #6 + vbit d6, d0, d5 + vst1.32 d6, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w4_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.32 {d0[0]}, [r1], r2 + vld1.32 {d0[1]}, [r1], 
r2 + pld [r1] + pld [r1, r2] + vshll.u8 q0, d0, #6 + vst1.64 {d0}, [r0], r12 + vst1.64 {d1}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w6_neon_8, export=1 + init_put_pixels + vmov.u8 q10, #255 + vshr.u64 d21, #32 +0: subs r3, #1 + vld1.16 {d0}, [r1], r2 + pld [r1] + vshll.u8 q0, d0, #6 + vld1.8 {q12}, [r0] + vbit q12, q0, q10 + vst1.8 {q12}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w8_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.8 {d0}, [r1], r2 + vld1.8 {d2}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q0, d0, #6 + vshll.u8 q1, d2, #6 + vst1.16 {q0}, [r0], r12 + vst1.16 {q1}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w12_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.64 {d0}, [r1] + add r1, #8 + vld1.32 {d1[0]}, [r1], r2 + sub r1, #8 + vld1.64 {d2}, [r1] + add r1, #8 + vld1.32 {d1[1]}, [r1], r2 + sub r1, #8 + pld [r1] + pld [r1, r2] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vmov d22, d19 + vst1.64 {d16, d17, d18}, [r0], r12 + vst1.64 {d20, d21, d22}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w16_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.8 {q0}, [r1], r2 + vld1.8 {q1}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vst1.8 {q8, q9}, [r0], r12 + vst1.8 {q10, q11}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w24_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {d0, d1, d2}, [r1], r2 + pld [r1] + vshll.u8 q10, d0, #6 + vshll.u8 q11, d1, #6 + vshll.u8 q12, d2, #6 + vstm r0, {q10, q11, q12} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w32_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1], r2 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vstm r0, {q8, q9, q10, q11} + add r0, r12 + bne 0b 
+ bx lr +endfunc + +function ff_hevc_put_pixels_w48_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1] + add r1, #32 + vld1.8 {q2}, [r1], r2 + sub r1, #32 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vshll.u8 q12, d4, #6 + vshll.u8 q13, d5, #6 + vstm r0, {q8, q9, q10, q11, q12, q13} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w64_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1] + add r1, #32 + vld1.8 {q2, q3}, [r1], r2 + sub r1, #32 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vshll.u8 q12, d4, #6 + vshll.u8 q13, d5, #6 + vshll.u8 q14, d6, #6 + vshll.u8 q15, d7, #6 + vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_qpel_uw_pixels_neon_8, export=1 + push {r4-r9} + ldr r5, [sp, #24] // width + ldr r4, [sp, #28] // height + ldr r8, [sp, #32] // src2 + ldr r9, [sp, #36] // src2stride + vpush {d8-d15} + cmp r8, #0 + bne 2f +1: subs r4, #1 + vld1.8 {d0}, [r2], r3 + vst1.8 d0, [r0], r1 + bne 1b + vpop {d8-d15} + pop {r4-r9} + bx lr +2: subs r4, #1 + vld1.8 {d0}, [r2], r3 + vld1.16 {q1}, [r8], r9 + vshll.u8 q0, d0, #6 + vqadd.s16 q0, q1 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + bne 2b + vpop {d8-d15} + pop {r4-r9} + bx lr +endfunc + +.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4 +function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 + ldr r12, [sp] // height +1: subs r12, #4 + vld1.32 {\regs} , [r2], r3 + vld1.32 {\regs2} , [r2], r3 + vld1.32 {\regs3} , [r2], r3 + vld1.32 {\regs4} , [r2], r3 + vst1.32 {\regs} , [r0], r1 + vst1.32 {\regs2} , [r0], r1 + vst1.32 {\regs3} , [r0], r1 + vst1.32 {\regs4} , [r0], r1 + bne 1b + bx lr +endfunc +.endm + +.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4 +function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 + push {r4-r5} + ldr 
r12, [sp, #8] // height +1: subs r12, #2 + mov r4, r2 + vld1.32 {\regs} , [r2]! + vld1.32 {\regs2} , [r2] + add r2, r4, r3 + mov r4, r2 + vld1.32 {\regs3} , [r2]! + vld1.32 {\regs4} , [r2] + add r2, r4, r3 + mov r5, r0 + vst1.32 {\regs} , [r0]! + vst1.32 {\regs2} , [r0] + add r0, r5, r1 + mov r5, r0 + vst1.32 {\regs3} , [r0]! + vst1.32 {\regs4} , [r0] + add r0, r5, r1 + bne 1b + pop {r4-r5} + bx lr +endfunc +.endm + +put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1] +put_qpel_uw_pixels 8, d0, d1, d2, d3 +put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0] +put_qpel_uw_pixels 16, q0, q1, q2, q3 +put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21 +put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11 +put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10 +put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11 diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S index 6eb4837..6f3e3fb 100644 --- a/libavcodec/arm/hpeldsp_arm.S +++ b/libavcodec/arm/hpeldsp_arm.S @@ -2,20 +2,20 @@ @ ARMv4-optimized halfpel functions @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> @ -@ This file is part of Libav. +@ This file is part of FFmpeg. @ -@ Libav is free software; you can redistribute it and/or +@ FFmpeg is free software; you can redistribute it and/or @ modify it under the terms of the GNU Lesser General Public @ License as published by the Free Software Foundation; either @ version 2.1 of the License, or (at your option) any later version. @ -@ Libav is distributed in the hope that it will be useful, +@ FFmpeg is distributed in the hope that it will be useful, @ but WITHOUT ANY WARRANTY; without even the implied warranty of @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU @ Lesser General Public License for more details. 
@ @ You should have received a copy of the GNU Lesser General Public -@ License along with Libav; if not, write to the Free Software +@ License along with FFmpeg; if not, write to the Free Software @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @ diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h index a864152..5f3c774 100644 --- a/libavcodec/arm/hpeldsp_arm.h +++ b/libavcodec/arm/hpeldsp_arm.h @@ -1,18 +1,20 @@ /* - * This file is part of Libav. + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * Libav is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S index f1abc32..a8bd459 100644 --- a/libavcodec/arm/hpeldsp_armv6.S +++ b/libavcodec/arm/hpeldsp_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c index 6390660..1977b13 100644 --- a/libavcodec/arm/hpeldsp_init_arm.c +++ b/libavcodec/arm/hpeldsp_init_arm.c @@ -2,20 +2,20 @@ * ARM-optimized halfpel functions * Copyright (c) 2001 Lionel Ulmer * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c index 67a500d..967a8e0 100644 --- a/libavcodec/arm/hpeldsp_init_armv6.c +++ b/libavcodec/arm/hpeldsp_init_armv6.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c index 76d4eaf..d9feadd 100644 --- a/libavcodec/arm/hpeldsp_init_neon.c +++ b/libavcodec/arm/hpeldsp_init_neon.c @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S index 90bc3cb..cf4a6cf 100644 --- a/libavcodec/arm/hpeldsp_neon.S +++ b/libavcodec/arm/hpeldsp_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/idct.h b/libavcodec/arm/idct.h index db4d6c5..6c79a69 100644 --- a/libavcodec/arm/idct.h +++ b/libavcodec/arm/idct.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S index 0d6a76b..057eff9 100644 --- a/libavcodec/arm/idctdsp_arm.S +++ b/libavcodec/arm/idctdsp_arm.S @@ -2,20 +2,20 @@ @ ARMv4-optimized IDCT functions @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> @ -@ This file is part of Libav. +@ This file is part of FFmpeg. 
@ -@ Libav is free software; you can redistribute it and/or +@ FFmpeg is free software; you can redistribute it and/or @ modify it under the terms of the GNU Lesser General Public @ License as published by the Free Software Foundation; either @ version 2.1 of the License, or (at your option) any later version. @ -@ Libav is distributed in the hope that it will be useful, +@ FFmpeg is distributed in the hope that it will be useful, @ but WITHOUT ANY WARRANTY; without even the implied warranty of @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU @ Lesser General Public License for more details. @ @ You should have received a copy of the GNU Lesser General Public -@ License along with Libav; if not, write to the Free Software +@ License along with FFmpeg; if not, write to the Free Software @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @ diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h index 9012b82..d7bc5cd 100644 --- a/libavcodec/arm/idctdsp_arm.h +++ b/libavcodec/arm/idctdsp_arm.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S index c180d73..a6e77d6 100644 --- a/libavcodec/arm/idctdsp_armv6.S +++ b/libavcodec/arm/idctdsp_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c index 8216985..ebc90e4 100644 --- a/libavcodec/arm/idctdsp_init_arm.c +++ b/libavcodec/arm/idctdsp_init_arm.c @@ -2,20 +2,20 @@ * ARM-optimized IDCT functions * Copyright (c) 2001 Lionel Ulmer * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -39,28 +39,28 @@ static void j_rev_dct_arm_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block) { ff_j_rev_dct_arm(block); - ff_put_pixels_clamped(block, dest, line_size); + ff_put_pixels_clamped_c(block, dest, line_size); } static void j_rev_dct_arm_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block) { ff_j_rev_dct_arm(block); - ff_add_pixels_clamped(block, dest, line_size); + ff_add_pixels_clamped_arm(block, dest, line_size); } static void simple_idct_arm_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block) { ff_simple_idct_arm(block); - ff_put_pixels_clamped(block, dest, line_size); + ff_put_pixels_clamped_c(block, dest, line_size); } static void simple_idct_arm_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block) { ff_simple_idct_arm(block); - ff_add_pixels_clamped(block, dest, line_size); + ff_add_pixels_clamped_arm(block, dest, line_size); } av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx, @@ -68,8 +68,8 @@ av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx, { int cpu_flags = av_get_cpu_flags(); - if 
(!high_bit_depth) { - if (avctx->idct_algo == FF_IDCT_AUTO || + if (!avctx->lowres && !high_bit_depth) { + if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) || avctx->idct_algo == FF_IDCT_ARM) { c->idct_put = j_rev_dct_arm_put; c->idct_add = j_rev_dct_arm_add; diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c index 251165d..3d881e1 100644 --- a/libavcodec/arm/idctdsp_init_armv5te.c +++ b/libavcodec/arm/idctdsp_init_armv5te.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -29,8 +29,9 @@ av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { - if (!high_bit_depth && + if (!avctx->lowres && !high_bit_depth && (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { c->idct_put = ff_simple_idct_put_armv5te; c->idct_add = ff_simple_idct_add_armv5te; diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c index 3941ee8..edf3070 100644 --- a/libavcodec/arm/idctdsp_init_armv6.c +++ b/libavcodec/arm/idctdsp_init_armv6.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -32,8 +32,8 @@ void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels, av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { - if (!high_bit_depth) { - if (avctx->idct_algo == FF_IDCT_AUTO || + if (!avctx->lowres && !high_bit_depth) { + if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) || avctx->idct_algo == FF_IDCT_SIMPLEARMV6) { c->idct_put = ff_simple_idct_put_armv6; c->idct_add = ff_simple_idct_add_armv6; diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c index c94f7b6..b70c5b0 100644 --- a/libavcodec/arm/idctdsp_init_neon.c +++ b/libavcodec/arm/idctdsp_init_neon.c @@ -2,20 +2,20 @@ * ARM-NEON-optimized IDCT functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,15 +27,16 @@ #include "idct.h" #include "idctdsp_arm.h" -void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int); -void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int); -void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int); +void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { - if (!high_bit_depth) { + if (!avctx->lowres && !high_bit_depth) { if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLENEON) { c->idct_put = ff_simple_idct_put_neon; c->idct_add = ff_simple_idct_add_neon; diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S index 7095879..1911a33 100644 --- a/libavcodec/arm/idctdsp_neon.S +++ b/libavcodec/arm/idctdsp_neon.S @@ -2,20 +2,20 @@ * ARM-NEON-optimized IDCT functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S index 42f3739..72c4c77 100644 --- a/libavcodec/arm/int_neon.S +++ b/libavcodec/arm/int_neon.S @@ -1,21 +1,21 @@ /* * ARM NEON optimised integer operations - * Copyright (c) 2009 Kostya Shishkov + * Copyright (c) 2009 Konstantin Shishkov * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -35,7 +35,7 @@ function ff_scalarproduct_int16_neon, export=1 vmlal.s16 q2, d18, d22 vmlal.s16 q3, d19, d23 subs r2, r2, #16 - bne 1b + bgt 1b vpadd.s32 d16, d0, d1 vpadd.s32 d17, d2, d3 @@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1 vmov.32 r0, d3[0] bx lr endfunc + diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/lossless_audiodsp_init_arm.c index 47ea034..981a39a 100644 --- a/libavcodec/arm/apedsp_init_arm.c +++ b/libavcodec/arm/lossless_audiodsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,12 +23,12 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/arm/cpu.h" -#include "libavcodec/apedsp.h" +#include "libavcodec/lossless_audiodsp.h" int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, const int16_t *v3, int len, int mul); -av_cold void ff_apedsp_init_arm(APEDSPContext *c) +av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c) { int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/lossless_audiodsp_neon.S index 7cfbf43..ba7c45f 100644 --- a/libavcodec/arm/apedsp_neon.S +++ b/libavcodec/arm/lossless_audiodsp_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised integer operations * Copyright (c) 2009 Kostya Shishkov * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -47,7 +47,7 @@ function ff_scalarproduct_and_madd_int16_neon, export=1 vst1.16 {q10}, [r12,:128]! subs r3, r3, #16 vst1.16 {q13}, [r12,:128]! - bne 1b + bgt 1b vpadd.s32 d16, d0, d1 vpadd.s32 d17, d2, d3 diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h index 45ac67d..dc57c55 100644 --- a/libavcodec/arm/mathops.h +++ b/libavcodec/arm/mathops.h @@ -2,20 +2,20 @@ * simple math operations * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mdct_fixed_init_arm.c b/libavcodec/arm/mdct_fixed_init_arm.c deleted file mode 100644 index 606c80c..0000000 --- a/libavcodec/arm/mdct_fixed_init_arm.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * This file is part of Libav. 
- * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" - -#define FFT_FLOAT 0 -#include "libavcodec/fft.h" - -void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i); -void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i); - -av_cold void ff_mdct_fixed_init_arm(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - if (!s->inverse && s->nbits >= 3) { - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; - s->mdct_calc = ff_mdct_fixed_calc_neon; - s->mdct_calcw = ff_mdct_fixed_calcw_neon; - } - } -} diff --git a/libavcodec/arm/mdct_fixed_neon.S b/libavcodec/arm/mdct_fixed_neon.S index c77be59..365c5e7 100644 --- a/libavcodec/arm/mdct_fixed_neon.S +++ b/libavcodec/arm/mdct_fixed_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mdct_init_arm.c b/libavcodec/arm/mdct_init_arm.c deleted file mode 100644 index 24678dd..0000000 --- a/libavcodec/arm/mdct_init_arm.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" - -#include "libavcodec/fft.h" - -void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); - -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - -av_cold void ff_mdct_init_arm(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_vfp_vm(cpu_flags)) { - s->imdct_half = ff_imdct_half_vfp; - } - - if (have_neon(cpu_flags)) { - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; - } -} diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S index bfe259c..a6952fa 100644 --- a/libavcodec/arm/mdct_neon.S +++ b/libavcodec/arm/mdct_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised MDCT * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S index f3fe668..43f6d14 100644 --- a/libavcodec/arm/mdct_vfp.S +++ b/libavcodec/arm/mdct_vfp.S @@ -2,20 +2,20 @@ * Copyright (c) 2013 RISC OS Open Ltd * Author: Ben Avison <bavison@riscosopen.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S index 436e20d..fa5a823 100644 --- a/libavcodec/arm/me_cmp_armv6.S +++ b/libavcodec/arm/me_cmp_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c index 4d73f3e..03870a2 100644 --- a/libavcodec/arm/me_cmp_init_arm.c +++ b/libavcodec/arm/me_cmp_init_arm.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mlpdsp_armv5te.S b/libavcodec/arm/mlpdsp_armv5te.S index 4272dae..4f9aa48 100644 --- a/libavcodec/arm/mlpdsp_armv5te.S +++ b/libavcodec/arm/mlpdsp_armv5te.S @@ -2,20 +2,20 @@ * Copyright (c) 2014 RISC OS Open Ltd * Author: Ben Avison <bavison@riscosopen.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S index de9db46..b7ecf6c 100644 --- a/libavcodec/arm/mlpdsp_armv6.S +++ b/libavcodec/arm/mlpdsp_armv6.S @@ -2,20 +2,20 @@ * Copyright (c) 2014 RISC OS Open Ltd * Author: Ben Avison <bavison@riscosopen.org> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c index 4cdd10c..34a5f61 100644 --- a/libavcodec/arm/mlpdsp_init_arm.c +++ b/libavcodec/arm/mlpdsp_init_arm.c @@ -2,20 +2,20 @@ * Copyright (c) 2014 RISC OS Open Ltd * Author: Ben Avison <bavison@riscosopen.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S index 49bd0bc..977abb6 100644 --- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S +++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mpegaudiodsp_init_arm.c b/libavcodec/arm/mpegaudiodsp_init_arm.c index e73aee6..d87bd27 100644 --- a/libavcodec/arm/mpegaudiodsp_init_arm.c +++ b/libavcodec/arm/mpegaudiodsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -26,7 +26,7 @@ #include "config.h" void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window, - int *dither, int16_t *out, int incr); + int *dither, int16_t *out, ptrdiff_t incr); av_cold void ff_mpadsp_init_arm(MPADSPContext *s) { diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c index 34e9cf1..918be16 100644 --- a/libavcodec/arm/mpegvideo_arm.c +++ b/libavcodec/arm/mpegvideo_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2002 Michael Niedermayer * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mpegvideo_arm.h b/libavcodec/arm/mpegvideo_arm.h index 17e3a5b..709ae6b 100644 --- a/libavcodec/arm/mpegvideo_arm.h +++ b/libavcodec/arm/mpegvideo_arm.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c index 4bb7b6e..e20bb4c 100644 --- a/libavcodec/arm/mpegvideo_armv5te.c +++ b/libavcodec/arm/mpegvideo_armv5te.c @@ -2,24 +2,25 @@ * Optimization of some functions from mpegvideo.c for armv5te * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavcodec/avcodec.h" #include "libavcodec/mpegvideo.h" #include "mpegvideo_arm.h" @@ -55,7 +56,7 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, int level, qmul, qadd; int nCoeffs; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); qmul = qscale << 1; @@ -84,7 +85,7 @@ static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, int qmul, qadd; int nCoeffs; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); qadd = (qscale - 1) | 1; qmul = qscale << 1; diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S index 4426e15..8687d6b 100644 --- a/libavcodec/arm/mpegvideo_armv5te_s.S +++ b/libavcodec/arm/mpegvideo_armv5te_s.S @@ -2,20 +2,20 @@ * Optimization of some functions from mpegvideo.c for armv5te * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S index 3e1f7b5..1889d7a 100644 --- a/libavcodec/arm/mpegvideo_neon.S +++ b/libavcodec/arm/mpegvideo_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S index 99db501..ab0dad7 100644 --- a/libavcodec/arm/mpegvideoencdsp_armv6.S +++ b/libavcodec/arm/mpegvideoencdsp_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/mpegvideoencdsp_init_arm.c b/libavcodec/arm/mpegvideoencdsp_init_arm.c index ab9ba3e..4bfe835 100644 --- a/libavcodec/arm/mpegvideoencdsp_init_arm.c +++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/neon.S b/libavcodec/arm/neon.S index 716a607..787bc4b 100644 --- a/libavcodec/arm/neon.S +++ b/libavcodec/arm/neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/neontest.c b/libavcodec/arm/neontest.c index 67d7747..f9c0dbf 100644 --- a/libavcodec/arm/neontest.c +++ b/libavcodec/arm/neontest.c @@ -2,20 +2,20 @@ * check NEON registers for clobbers * Copyright (c) 2013 Martin Storsjo * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/pixblockdsp_armv6.S b/libavcodec/arm/pixblockdsp_armv6.S index 4c925a4..b10ea78 100644 --- a/libavcodec/arm/pixblockdsp_armv6.S +++ b/libavcodec/arm/pixblockdsp_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/pixblockdsp_init_arm.c b/libavcodec/arm/pixblockdsp_init_arm.c index bb32631..59d2b49 100644 --- a/libavcodec/arm/pixblockdsp_init_arm.c +++ b/libavcodec/arm/pixblockdsp_init_arm.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/rdft_init_arm.c b/libavcodec/arm/rdft_init_arm.c index 2858ba9..1c5d8be 100644 --- a/libavcodec/arm/rdft_init_arm.c +++ b/libavcodec/arm/rdft_init_arm.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S index 7d01d53..eabb92b 100644 --- a/libavcodec/arm/rdft_neon.S +++ b/libavcodec/arm/rdft_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised RDFT * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -30,18 +30,21 @@ function ff_rdft_calc_neon, export=1 lsls r6, r6, #31 bne 1f - add r0, r4, #20 + add r0, r4, #24 bl X(ff_fft_permute_neon) - add r0, r4, #20 + add r0, r4, #24 mov r1, r5 bl X(ff_fft_calc_neon) 1: ldr r12, [r4, #0] @ nbits mov r2, #1 + ldr r8, [r4, #20] @ negative_sin lsl r12, r2, r12 add r0, r5, #8 + lsl r8, r8, #31 add r1, r5, r12, lsl #2 lsr r12, r12, #2 + vdup.32 d26, r8 ldr r2, [r4, #12] @ tcos sub r12, r12, #2 ldr r3, [r4, #16] @ tsin @@ -55,6 +58,7 @@ function ff_rdft_calc_neon, export=1 vld1.32 {d5}, [r3,:64]! @ tsin[i] vmov.f32 d18, #0.5 @ k1 vdup.32 d19, r6 + veor d5, d26, d5 pld [r0, #32] veor d19, d18, d19 @ k2 vmov.i32 d16, #0 @@ -90,6 +94,7 @@ function ff_rdft_calc_neon, export=1 vld1.32 {d5}, [r3,:64]! @ tsin[i] veor d24, d22, d17 @ ev.re,-ev.im vrev64.32 d3, d23 @ od.re, od.im + veor d5, d26, d5 pld [r2, #32] veor d2, d3, d16 @ -od.re, od.im pld [r3, #32] @@ -140,10 +145,10 @@ function ff_rdft_calc_neon, export=1 vmul.f32 d22, d22, d18 vst1.32 {d22}, [r5,:64] - add r0, r4, #20 + add r0, r4, #24 mov r1, r5 bl X(ff_fft_permute_neon) - add r0, r4, #20 + add r0, r4, #24 mov r1, r5 pop {r4-r8,lr} b X(ff_fft_calc_neon) diff --git a/libavcodec/arm/rv34dsp_init_arm.c b/libavcodec/arm/rv34dsp_init_arm.c index 5ce787b..8bfe90b 100644 --- a/libavcodec/arm/rv34dsp_init_arm.c +++ b/libavcodec/arm/rv34dsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S index a29123f..3d4a83d 100644 --- a/libavcodec/arm/rv34dsp_neon.S +++ b/libavcodec/arm/rv34dsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/rv40dsp_init_arm.c b/libavcodec/arm/rv40dsp_init_arm.c index df3e461..c24854d 100644 --- a/libavcodec/arm/rv40dsp_init_arm.c +++ b/libavcodec/arm/rv40dsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S index 6bd45eb..099f88c 100644 --- a/libavcodec/arm/rv40dsp_neon.S +++ b/libavcodec/arm/rv40dsp_neon.S @@ -2,20 +2,20 @@ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c index 4da7967..4fb69f9 100644 --- a/libavcodec/arm/sbrdsp_init_arm.c +++ b/libavcodec/arm/sbrdsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2012 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S index 610397f..e66abd6 100644 --- a/libavcodec/arm/sbrdsp_neon.S +++ b/libavcodec/arm/sbrdsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2012 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S index a651927..42d79ab 100644 --- a/libavcodec/arm/simple_idct_arm.S +++ b/libavcodec/arm/simple_idct_arm.S @@ -4,22 +4,22 @@ * Author: Frederic Boulay <dilb@handhelds.org> * * The function defined in this file is derived from the simple_idct function - * from the libavcodec library part of the Libav project. + * from the libavcodec library part of the FFmpeg project. * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S index b196833..a8d0346 100644 --- a/libavcodec/arm/simple_idct_armv5te.S +++ b/libavcodec/arm/simple_idct_armv5te.S @@ -4,20 +4,20 @@ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> * Copyright (c) 2006 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S index a8de990..f95c20d 100644 --- a/libavcodec/arm/simple_idct_armv6.S +++ b/libavcodec/arm/simple_idct_armv6.S @@ -4,20 +4,20 @@ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> * Copyright (c) 2007 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S index 9e0a97a..726d4cb 100644 --- a/libavcodec/arm/simple_idct_neon.S +++ b/libavcodec/arm/simple_idct_neon.S @@ -6,20 +6,20 @@ * Based on Simple IDCT * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h index d7996c1..cf25d9d 100644 --- a/libavcodec/arm/startcode.h +++ b/libavcodec/arm/startcode.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S index 64078b2..a46f009 100644 --- a/libavcodec/arm/startcode_armv6.S +++ b/libavcodec/arm/startcode_armv6.S @@ -2,20 +2,20 @@ * Copyright (c) 2013 RISC OS Open Ltd * Author: Ben Avison <bavison@riscosopen.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c index bf0d9b4..ea0ce14 100644 --- a/libavcodec/arm/dcadsp_init_arm.c +++ b/libavcodec/arm/synth_filter_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -22,20 +22,9 @@ #include "libavutil/arm/cpu.h" #include "libavutil/attributes.h" -#include "libavcodec/dcadsp.h" - -void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); -void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); - -void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs); -void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs); - -void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, - SynthFilterContext *synth, FFTContext *imdct, - float synth_buf_ptr[512], - int *synth_buf_offset, float synth_buf2[32], - const float window[512], float *samples_out, - float raXin[32], float scale); +#include "libavutil/internal.h" +#include "libavcodec/fft.h" +#include "libavcodec/synth_filter.h" void ff_synth_filter_float_vfp(FFTContext *imdct, float *synth_buf_ptr, int *synth_buf_offset, @@ -49,21 +38,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct, float out[32], const float in[32], float scale); -av_cold void ff_dcadsp_init_arm(DCADSPContext *s) -{ - int cpu_flags = 
av_get_cpu_flags(); - - if (have_vfp_vm(cpu_flags)) { - s->lfe_fir[0] = ff_dca_lfe_fir32_vfp; - s->lfe_fir[1] = ff_dca_lfe_fir64_vfp; - s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp; - } - if (have_neon(cpu_flags)) { - s->lfe_fir[0] = ff_dca_lfe_fir0_neon; - s->lfe_fir[1] = ff_dca_lfe_fir1_neon; - } -} - av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) { int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S index 62bb667..5417be7 100644 --- a/libavcodec/arm/synth_filter_neon.S +++ b/libavcodec/arm/synth_filter_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S index 5d79e50..596734c 100644 --- a/libavcodec/arm/synth_filter_vfp.S +++ b/libavcodec/arm/synth_filter_vfp.S @@ -2,20 +2,20 @@ * Copyright (c) 2013 RISC OS Open Ltd * Author: Ben Avison <bavison@riscosopen.org> * - * This file is part of Libav. 
+ * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vc1dsp.h b/libavcodec/arm/vc1dsp.h index 30f059f..cd01ac5 100644 --- a/libavcodec/arm/vc1dsp.h +++ b/libavcodec/arm/vc1dsp.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c index a6a97c8..5f2c759 100644 --- a/libavcodec/arm/vc1dsp_init_arm.c +++ b/libavcodec/arm/vc1dsp_init_arm.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -28,8 +28,10 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp) { int cpu_flags = av_get_cpu_flags(); +#if HAVE_ARMV6 if (have_setend(cpu_flags)) dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6; +#endif if (have_neon(cpu_flags)) ff_vc1dsp_init_neon(dsp); } diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c index 08c07c4..005d45c 100644 --- a/libavcodec/arm/vc1dsp_init_neon.c +++ b/libavcodec/arm/vc1dsp_init_neon.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. 
+ * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -37,40 +37,38 @@ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd); -void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); - -void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); - -void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t 
stride, int rnd); - -void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); - -void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); -void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd); +#define DECL_PUT(X, Y) \ +void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \ + ptrdiff_t stride, int rnd); \ +static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \ + ptrdiff_t stride, int rnd) \ +{ \ + ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \ + ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \ + dst += 8*stride; src += 8*stride; \ + ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \ + ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \ +} + +DECL_PUT(1, 0) +DECL_PUT(2, 0) +DECL_PUT(3, 0) + +DECL_PUT(0, 1) +DECL_PUT(0, 2) +DECL_PUT(0, 3) + +DECL_PUT(1, 1) +DECL_PUT(1, 2) +DECL_PUT(1, 3) + +DECL_PUT(2, 1) +DECL_PUT(2, 2) +DECL_PUT(2, 3) + +DECL_PUT(3, 1) +DECL_PUT(3, 2) +DECL_PUT(3, 3) void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); @@ -81,6 +79,10 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); +#define FN_ASSIGN(X, Y) \ + dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ + dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon + av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) { dsp->vc1_inv_trans_8x8 = 
ff_vc1_inv_trans_8x8_neon; @@ -92,23 +94,26 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; - dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon; + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; if (HAVE_AS_DN_DIRECTIVE) { - dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon; - dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon; - dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon; - dsp->put_vc1_mspel_pixels_tab[ 4] = ff_put_vc1_mspel_mc01_neon; - dsp->put_vc1_mspel_pixels_tab[ 5] = ff_put_vc1_mspel_mc11_neon; - dsp->put_vc1_mspel_pixels_tab[ 6] = ff_put_vc1_mspel_mc21_neon; - dsp->put_vc1_mspel_pixels_tab[ 7] = ff_put_vc1_mspel_mc31_neon; - dsp->put_vc1_mspel_pixels_tab[ 8] = ff_put_vc1_mspel_mc02_neon; - dsp->put_vc1_mspel_pixels_tab[ 9] = ff_put_vc1_mspel_mc12_neon; - dsp->put_vc1_mspel_pixels_tab[10] = ff_put_vc1_mspel_mc22_neon; - dsp->put_vc1_mspel_pixels_tab[11] = ff_put_vc1_mspel_mc32_neon; - dsp->put_vc1_mspel_pixels_tab[12] = ff_put_vc1_mspel_mc03_neon; - dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon; - dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon; - dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon; + FN_ASSIGN(1, 0); + FN_ASSIGN(2, 0); + FN_ASSIGN(3, 0); + + FN_ASSIGN(0, 1); + FN_ASSIGN(1, 1); + FN_ASSIGN(2, 1); + FN_ASSIGN(3, 1); + + FN_ASSIGN(0, 2); + FN_ASSIGN(1, 2); + FN_ASSIGN(2, 2); + FN_ASSIGN(3, 2); + + FN_ASSIGN(0, 3); + FN_ASSIGN(1, 3); + FN_ASSIGN(2, 3); + FN_ASSIGN(3, 3); } dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S index 1653a4c..611cbf2 100644 --- a/libavcodec/arm/vc1dsp_neon.S +++ b/libavcodec/arm/vc1dsp_neon.S @@ -4,20 +4,20 @@ * Copyright (c) 2010 Rob Clark <rob@ti.com> * Copyright (c) 2011 Mans Rullgard 
<mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/videodsp_arm.h b/libavcodec/arm/videodsp_arm.h index a708759..112cbb8 100644 --- a/libavcodec/arm/videodsp_arm.h +++ b/libavcodec/arm/videodsp_arm.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/videodsp_armv5te.S b/libavcodec/arm/videodsp_armv5te.S index 0510019..aff1161 100644 --- a/libavcodec/arm/videodsp_armv5te.S +++ b/libavcodec/arm/videodsp_armv5te.S @@ -2,20 +2,20 @@ @ ARMv5te-optimized core video DSP functions @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> @ -@ This file is part of Libav. +@ This file is part of FFmpeg @ -@ Libav is free software; you can redistribute it and/or +@ FFmpeg is free software; you can redistribute it and/or @ modify it under the terms of the GNU Lesser General Public @ License as published by the Free Software Foundation; either @ version 2.1 of the License, or (at your option) any later version. @ -@ Libav is distributed in the hope that it will be useful, +@ FFmpeg is distributed in the hope that it will be useful, @ but WITHOUT ANY WARRANTY; without even the implied warranty of @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU @ Lesser General Public License for more details. @ @ You should have received a copy of the GNU Lesser General Public -@ License along with Libav; if not, write to the Free Software +@ License along with FFmpeg; if not, write to the Free Software @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @ diff --git a/libavcodec/arm/videodsp_init_arm.c b/libavcodec/arm/videodsp_init_arm.c index 20c6e4a..a89abb2 100644 --- a/libavcodec/arm/videodsp_init_arm.c +++ b/libavcodec/arm/videodsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2012 Ronald S. Bultje * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/videodsp_init_armv5te.c b/libavcodec/arm/videodsp_init_armv5te.c index 832191f..1ea1f34 100644 --- a/libavcodec/arm/videodsp_init_armv5te.c +++ b/libavcodec/arm/videodsp_init_armv5te.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2012 Ronald S. Bultje * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,5 +27,7 @@ void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h); av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc) { +#if HAVE_ARMV5TE_EXTERNAL ctx->prefetch = ff_prefetch_arm; +#endif } diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/vorbisdsp_init_arm.c index 853ba2d..f4b3d80 100644 --- a/libavcodec/arm/vorbisdsp_init_arm.c +++ b/libavcodec/arm/vorbisdsp_init_arm.c @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vorbisdsp_neon.S b/libavcodec/arm/vorbisdsp_neon.S index 7df876c..79ce54f 100644 --- a/libavcodec/arm/vorbisdsp_neon.S +++ b/libavcodec/arm/vorbisdsp_neon.S @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c index 1c91434..65ea53f 100644 --- a/libavcodec/arm/vp3dsp_init_arm.c +++ b/libavcodec/arm/vp3dsp_init_arm.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S index 58bd97d..2942d48 100644 --- a/libavcodec/arm/vp3dsp_neon.S +++ b/libavcodec/arm/vp3dsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 David Conrad * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h index 6bc9456..feb1247 100644 --- a/libavcodec/arm/vp56_arith.h +++ b/libavcodec/arm/vp56_arith.h @@ -1,20 +1,20 @@ /* * Copyright (C) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp6dsp_init_arm.c b/libavcodec/arm/vp6dsp_init_arm.c index 7e26150..a59d612 100644 --- a/libavcodec/arm/vp6dsp_init_arm.c +++ b/libavcodec/arm/vp6dsp_init_arm.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp6dsp_neon.S b/libavcodec/arm/vp6dsp_neon.S index 10b4d0f..03dd28d 100644 --- a/libavcodec/arm/vp6dsp_neon.S +++ b/libavcodec/arm/vp6dsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h index 93b2788..965342d 100644 --- a/libavcodec/arm/vp8.h +++ b/libavcodec/arm/vp8.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S index 3863dc3..e7d25a4 100644 --- a/libavcodec/arm/vp8_armv6.S +++ b/libavcodec/arm/vp8_armv6.S @@ -1,20 +1,20 @@ /* * Copyright (C) 2010 Mans Rullgard * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp8dsp.h b/libavcodec/arm/vp8dsp.h index 0d55e0f..7281d0b 100644 --- a/libavcodec/arm/vp8dsp.h +++ b/libavcodec/arm/vp8dsp.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S index 9eb9734..2320bf4 100644 --- a/libavcodec/arm/vp8dsp_armv6.S +++ b/libavcodec/arm/vp8dsp_armv6.S @@ -5,20 +5,20 @@ * Copyright (c) 2010 Rob Clark <rob@ti.com> * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * This code was partially ported from libvpx, which uses this license: diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c index aa77dba..8b80176 100644 --- a/libavcodec/arm/vp8dsp_init_arm.c +++ b/libavcodec/arm/vp8dsp_init_arm.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp8dsp_init_armv6.c b/libavcodec/arm/vp8dsp_init_armv6.c index febe4e7..a5bcd73 100644 --- a/libavcodec/arm/vp8dsp_init_armv6.c +++ b/libavcodec/arm/vp8dsp_init_armv6.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp8dsp_init_neon.c b/libavcodec/arm/vp8dsp_init_neon.c index 2b6c775..53f1f23 100644 --- a/libavcodec/arm/vp8dsp_init_neon.c +++ b/libavcodec/arm/vp8dsp_init_neon.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S index f43b4f7..fcb4248 100644 --- a/libavcodec/arm/vp8dsp_neon.S +++ b/libavcodec/arm/vp8dsp_neon.S @@ -4,20 +4,20 @@ * Copyright (c) 2010 Rob Clark <rob@ti.com> * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp9dsp_init.h b/libavcodec/arm/vp9dsp_init.h new file mode 100644 index 0000000..0dc1c2d --- /dev/null +++ b/libavcodec/arm/vp9dsp_init.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_VP9DSP_INIT_H +#define AVCODEC_ARM_VP9DSP_INIT_H + +#include "libavcodec/vp9dsp.h" + +void ff_vp9dsp_init_10bpp_arm(VP9DSPContext *dsp); +void ff_vp9dsp_init_12bpp_arm(VP9DSPContext *dsp); + +#endif /* AVCODEC_ARM_VP9DSP_INIT_H */ diff --git a/libavcodec/arm/vp9dsp_init_10bpp_arm.c b/libavcodec/arm/vp9dsp_init_10bpp_arm.c new file mode 100644 index 0000000..b8cb293 --- /dev/null +++ b/libavcodec/arm/vp9dsp_init_10bpp_arm.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BPP 10 +#define INIT_FUNC ff_vp9dsp_init_10bpp_arm +#include "vp9dsp_init_16bpp_arm_template.c" diff --git a/libavcodec/arm/vp9dsp_init_12bpp_arm.c b/libavcodec/arm/vp9dsp_init_12bpp_arm.c new file mode 100644 index 0000000..fa65eb2 --- /dev/null +++ b/libavcodec/arm/vp9dsp_init_12bpp_arm.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BPP 12 +#define INIT_FUNC ff_vp9dsp_init_12bpp_arm +#include "vp9dsp_init_16bpp_arm_template.c" diff --git a/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c new file mode 100644 index 0000000..1b00078 --- /dev/null +++ b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavutil/arm/cpu.h" +#include "vp9dsp_init.h" + +#define declare_fpel(type, sz, suffix) \ +void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) + +#define decl_mc_func(op, filter, dir, sz, bpp) \ +void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) + +#define define_8tap_2d_fn(op, filter, sz, bpp) \ +static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, \ + ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \ + /* We only need h + 7 lines, but the horizontal filter assumes an \ + * even number of rows, so filter h + 8 lines here. 
*/ \ + ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \ + src - 3 * src_stride, src_stride, \ + h + 8, mx, 0); \ + ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \ + temp + 3 * 2 * sz, 2 * sz, \ + h, 0, my); \ +} + +#define decl_filter_funcs(op, dir, sz, bpp) \ + decl_mc_func(op, regular, dir, sz, bpp); \ + decl_mc_func(op, sharp, dir, sz, bpp); \ + decl_mc_func(op, smooth, dir, sz, bpp) + +#define decl_mc_funcs(sz, bpp) \ + decl_filter_funcs(put, h, sz, bpp); \ + decl_filter_funcs(avg, h, sz, bpp); \ + decl_filter_funcs(put, v, sz, bpp); \ + decl_filter_funcs(avg, v, sz, bpp); \ + decl_filter_funcs(put, hv, sz, bpp); \ + decl_filter_funcs(avg, hv, sz, bpp) + +declare_fpel(copy, 128, ); +declare_fpel(copy, 64, ); +declare_fpel(copy, 32, ); +declare_fpel(copy, 16, ); +declare_fpel(copy, 8, ); +declare_fpel(avg, 64, _16); +declare_fpel(avg, 32, _16); +declare_fpel(avg, 16, _16); +declare_fpel(avg, 8, _16); +declare_fpel(avg, 4, _16); + +decl_mc_funcs(64, BPP); +decl_mc_funcs(32, BPP); +decl_mc_funcs(16, BPP); +decl_mc_funcs(8, BPP); +decl_mc_funcs(4, BPP); + +#define define_8tap_2d_funcs(sz, bpp) \ + define_8tap_2d_fn(put, regular, sz, bpp) \ + define_8tap_2d_fn(put, sharp, sz, bpp) \ + define_8tap_2d_fn(put, smooth, sz, bpp) \ + define_8tap_2d_fn(avg, regular, sz, bpp) \ + define_8tap_2d_fn(avg, sharp, sz, bpp) \ + define_8tap_2d_fn(avg, smooth, sz, bpp) + +define_8tap_2d_funcs(64, BPP) +define_8tap_2d_funcs(32, BPP) +define_8tap_2d_funcs(16, BPP) +define_8tap_2d_funcs(8, BPP) +define_8tap_2d_funcs(4, BPP) + + +static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_fpel(idx1, idx2, sz, type, suffix) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix##_neon + +#define 
init_copy_avg(idx, sz1, sz2) \ + init_fpel(idx, 0, sz2, copy, ); \ + init_fpel(idx, 1, sz1, avg, _16) + +#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \ + dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon + +#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \ + init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp) + +#define init_mc_funcs_dirs(idx, sz, bpp) \ + init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \ + init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \ + init_mc_funcs(idx, hv, 1, 1, sz, , bpp) + + init_copy_avg(0, 64, 128); + init_copy_avg(1, 32, 64); + init_copy_avg(2, 16, 32); + init_copy_avg(3, 8, 16); + init_copy_avg(4, 4, 8); + + init_mc_funcs_dirs(0, 64, BPP); + init_mc_funcs_dirs(1, 32, BPP); + init_mc_funcs_dirs(2, 16, BPP); + init_mc_funcs_dirs(3, 8, BPP); + init_mc_funcs_dirs(4, 4, BPP); + } +} + +#define define_itxfm2(type_a, type_b, sz, bpp) \ +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \ + ptrdiff_t stride, \ + int16_t *_block, int eob) +#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp) + +#define define_itxfm_funcs(sz, bpp) \ + define_itxfm(idct, idct, sz, bpp); \ + define_itxfm(iadst, idct, sz, bpp); \ + define_itxfm(idct, iadst, sz, bpp); \ + define_itxfm(iadst, iadst, sz, bpp) + +define_itxfm_funcs(4, BPP); +define_itxfm_funcs(8, BPP); +define_itxfm_funcs(16, BPP); +define_itxfm(idct, idct, 32, BPP); +define_itxfm(iwht, iwht, 4, BPP); + + +static av_cold 
void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_itxfm2(tx, sz, bpp) \ + dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon +#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp) + +#define init_idct2(tx, nm, bpp) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon +#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp) + + init_itxfm(TX_4X4, 4x4, BPP); + init_itxfm(TX_8X8, 8x8, BPP); + init_itxfm(TX_16X16, 16x16, BPP); + init_idct(TX_32X32, idct_idct_32x32, BPP); + init_idct(4, iwht_iwht_4x4, BPP); + } +} + +#define define_loop_filter(dir, wd, size, bpp) \ +void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H) + +#define define_loop_filters(wd, size, bpp) \ + define_loop_filter(h, wd, size, bpp); \ + define_loop_filter(v, wd, size, bpp) + +define_loop_filters(4, 8, BPP); +define_loop_filters(8, 8, BPP); +define_loop_filters(16, 8, BPP); + +define_loop_filters(16, 16, BPP); + +define_loop_filters(44, 16, BPP); +define_loop_filters(48, 16, BPP); +define_loop_filters(84, 16, BPP); +define_loop_filters(88, 16, BPP); + +static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \ + dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon + +#define init_lpf_func_16(idx, dir, bpp) \ + dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon + +#define init_lpf_func_mix2(idx1, idx2, idx3, 
dir, wd, bpp) \ + dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon + +#define init_lpf_funcs_8_wd(idx, wd, bpp) \ + init_lpf_func_8(idx, 0, h, wd, bpp); \ + init_lpf_func_8(idx, 1, v, wd, bpp) + +#define init_lpf_funcs_16(bpp) \ + init_lpf_func_16(0, h, bpp); \ + init_lpf_func_16(1, v, bpp) + +#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \ + init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \ + init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp) + +#define init_lpf_funcs_8(bpp) \ + init_lpf_funcs_8_wd(0, 4, bpp); \ + init_lpf_funcs_8_wd(1, 8, bpp); \ + init_lpf_funcs_8_wd(2, 16, bpp) + +#define init_lpf_funcs_mix2(bpp) \ + init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \ + init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \ + init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \ + init_lpf_funcs_mix2_wd(1, 1, 88, bpp) + + init_lpf_funcs_8(BPP); + init_lpf_funcs_16(BPP); + init_lpf_funcs_mix2(BPP); + } +} + +av_cold void INIT_FUNC(VP9DSPContext *dsp) +{ + vp9dsp_mc_init_arm(dsp); + vp9dsp_loopfilter_init_arm(dsp); + vp9dsp_itxfm_init_arm(dsp); +} diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c index 1ede170..cb7f48d 100644 --- a/libavcodec/arm/vp9dsp_init_arm.c +++ b/libavcodec/arm/vp9dsp_init_arm.c @@ -1,28 +1,30 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdint.h> #include "libavutil/attributes.h" +#include "libavutil/internal.h" #include "libavutil/arm/cpu.h" -#include "libavcodec/vp9.h" +#include "libavcodec/vp9dsp.h" +#include "vp9dsp_init.h" #define declare_fpel(type, sz) \ void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ @@ -239,8 +241,17 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) } } -av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp) +av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp) { + if (bpp == 10) { + ff_vp9dsp_init_10bpp_arm(dsp); + return; + } else if (bpp == 12) { + ff_vp9dsp_init_12bpp_arm(dsp); + return; + } else if (bpp != 8) + return; + vp9dsp_mc_init_arm(dsp); vp9dsp_loopfilter_init_arm(dsp); vp9dsp_itxfm_init_arm(dsp); diff --git a/libavcodec/arm/vp9itxfm_16bpp_neon.S b/libavcodec/arm/vp9itxfm_16bpp_neon.S new file mode 100644 index 0000000..b4f615e --- /dev/null +++ b/libavcodec/arm/vp9itxfm_16bpp_neon.S @@ -0,0 +1,1945 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +const itxfm4_coeffs, align=4 + .short 11585, 0, 6270, 15137 +iadst4_coeffs: + .short 5283, 15212, 9929, 13377 +endconst + +const iadst8_coeffs, align=4 + .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 +idct_coeffs: + .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102 + .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756 + .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 + .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 +endconst + +const iadst16_coeffs, align=4 + .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053 + .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207 +endconst + +@ Do two 4x4 transposes, using q registers for the subtransposes that don't +@ need to address the individual d registers. +@ r0,r1 == rq1, r2,r3 == rq1, etc +.macro transpose32_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + vswp \r1, \r4 @ vtrn.64 \rq0, \rq2 + vswp \r3, \r6 @ vtrn.64 \rq1, \rq3 + vswp \r9, \r12 @ vtrn.64 \rq4, \rq6 + vswp \r11, \r14 @ vtrn.64 \rq5, \rq7 + vtrn.32 \rq0, \rq1 + vtrn.32 \rq2, \rq3 + vtrn.32 \rq4, \rq5 + vtrn.32 \rq6, \rq7 +.endm + +@ Do eight 2x2 transposes. 
+.macro transpose32_8x_2x2 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + vtrn.32 \r0, \r1 + vtrn.32 \r2, \r3 + vtrn.32 \r4, \r5 + vtrn.32 \r6, \r7 + vtrn.32 \r8, \r9 + vtrn.32 \r10, \r11 + vtrn.32 \r12, \r13 + vtrn.32 \r14, \r15 +.endm + +@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 +@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 +@ in/out are d registers +.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0 + vadd.s32 \tmpd1, \in1, \in2 + vsub.s32 \tmpd2, \in1, \in2 +.if \neg > 0 + vneg.s32 \tmpd1, \tmpd1 +.endif + vmull.s32 \tmpq3, \tmpd1, d0[0] + vmull.s32 \tmpq4, \tmpd2, d0[0] + vrshrn.s64 \out1, \tmpq3, #14 + vrshrn.s64 \out2, \tmpq4, #14 +.endm + +@ Same as mbutterfly0 above, but treating the input in in2 as zero, +@ writing the same output into both out1 and out2. +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 + vmull.s32 \tmpq3, \in1, d0[0] + vrshrn.s64 \out1, \tmpq3, #14 + vrshrn.s64 \out2, \tmpq3, #14 +.endm + +@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 +@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 +@ Same as mbutterfly0, but with input being 2 q registers, output +@ being 4 d registers. +@ This can do with either 4 or 6 temporary q registers. 
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6 + vadd.s32 \tmpq1, \in1, \in2 + vsub.s32 \tmpq2, \in1, \in2 + vmull.s32 \tmpq3, \tmpd11, d0[0] + vmull.s32 \tmpq4, \tmpd12, d0[0] +.ifb \tmpq5 + vrshrn.s64 \out1, \tmpq3, #14 + vrshrn.s64 \out2, \tmpq4, #14 + vmull.s32 \tmpq3, \tmpd21, d0[0] + vmull.s32 \tmpq4, \tmpd22, d0[0] + vrshrn.s64 \out3, \tmpq3, #14 + vrshrn.s64 \out4, \tmpq4, #14 +.else + vmull.s32 \tmpq5, \tmpd21, d0[0] + vmull.s32 \tmpq6, \tmpd22, d0[0] + vrshrn.s64 \out1, \tmpq3, #14 + vrshrn.s64 \out2, \tmpq4, #14 + vrshrn.s64 \out3, \tmpq5, #14 + vrshrn.s64 \out4, \tmpq6, #14 +.endif +.endm + +@ out1 = in1 * coef1 - in2 * coef2 +@ out2 = in1 * coef2 + in2 * coef1 +@ out are 2 q registers, in are 2 d registers +.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2, neg=0 + vmull.s32 \out1, \in1, \coef1 + vmlsl.s32 \out1, \in2, \coef2 +.if \neg + vmov.s64 \out2, #0 + vmlsl.s32 \out2, \in1, \coef2 + vmlsl.s32 \out2, \in2, \coef1 +.else + vmull.s32 \out2, \in1, \coef2 + vmlal.s32 \out2, \in2, \coef1 +.endif +.endm + +@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2 +@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1 +@ out are 4 q registers, in are 4 d registers +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2 + vmull.s32 \out1, \in1, \coef1 + vmull.s32 \out2, \in2, \coef1 + vmull.s32 \out3, \in1, \coef2 + vmull.s32 \out4, \in2, \coef2 + vmlsl.s32 \out1, \in3, \coef2 + vmlsl.s32 \out2, \in4, \coef2 + vmlal.s32 \out3, \in3, \coef1 + vmlal.s32 \out4, \in4, \coef1 +.endm + +@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14 +@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 +@ inout are 2 d registers, tmp are 2 q registers +.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0 + mbutterfly_l \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2, \neg + vrshrn.s64 \inout1, \tmp1, #14 + vrshrn.s64 \inout2, \tmp2, #14 +.endm + +@ 
Same as mbutterfly above, but treating the input in inout2 as zero +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 + vmull.s32 \tmp1, \inout1, \coef1 + vmull.s32 \tmp2, \inout1, \coef2 + vrshrn.s64 \inout1, \tmp1, #14 + vrshrn.s64 \inout2, \tmp2, #14 +.endm + +@ Same as mbutterfly above, but treating the input in inout1 as zero +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 + vmov.s64 \tmp1, #0 + vmull.s32 \tmp2, \inout2, \coef1 + vmlsl.s32 \tmp1, \inout2, \coef2 + vrshrn.s64 \inout2, \tmp2, #14 + vrshrn.s64 \inout1, \tmp1, #14 +.endm + +@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14 +@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14 +@ inout are 4 d registers, tmp are 4 q registers +.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2 + vrshrn.s64 \inout1, \tmp1, #14 + vrshrn.s64 \inout2, \tmp2, #14 + vrshrn.s64 \inout3, \tmp3, #14 + vrshrn.s64 \inout4, \tmp4, #14 +.endm + +@ out1 = in1 + in2 +@ out2 = in1 - in2 +.macro butterfly out1, out2, in1, in2 + vadd.s32 \out1, \in1, \in2 + vsub.s32 \out2, \in1, \in2 +.endm + +@ out1 = in1 - in2 +@ out2 = in1 + in2 +.macro butterfly_r out1, out2, in1, in2 + vsub.s32 \out1, \in1, \in2 + vadd.s32 \out2, \in1, \in2 +.endm + +@ out1 = (in1 + in2 + (1 << 13)) >> 14 +@ out2 = (in1 - in2 + (1 << 13)) >> 14 +@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers +.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2 + vadd.s64 \tmp1, \in1, \in2 + vsub.s64 \tmp2, \in1, \in2 + vrshrn.s64 \out1, \tmp1, #14 + vrshrn.s64 \out2, \tmp2, #14 +.endm + +@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 +@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 +@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers +.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, 
tmp3, tmp4 + vadd.s64 \tmp1, \in1, \in3 + vadd.s64 \tmp2, \in2, \in4 + vsub.s64 \tmp3, \in1, \in3 + vsub.s64 \tmp4, \in2, \in4 + vrshrn.s64 \out1, \tmp1, #14 + vrshrn.s64 \out2, \tmp2, #14 + vrshrn.s64 \out3, \tmp3, #14 + vrshrn.s64 \out4, \tmp4, #14 +.endm + + +.macro iwht4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7 + vadd.i32 \c0, \c0, \c1 + vsub.i32 q11, \c2, \c3 + vsub.i32 q10, \c0, q11 + vshr.s32 q10, q10, #1 + vsub.i32 \c2, q10, \c1 + vsub.i32 \c1, q10, \c3 + vadd.i32 \c3, q11, \c2 + vsub.i32 \c0, \c0, \c1 +.endm + +.macro iwht4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7 + iwht4_10 \c0, \c1, \c2, \c3, \cd0, \cd1, \cd2, \cd3, \cd4, \cd5, \cd6, \cd7 +.endm + +@ c0 == cd0,cd1, c1 == cd2,cd3 +.macro idct4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7 + vmul.s32 q13, \c1, d1[1] + vmul.s32 q11, \c1, d1[0] + vadd.i32 q14, \c0, \c2 + vsub.i32 q15, \c0, \c2 + vmla.s32 q13, \c3, d1[0] + vmul.s32 q12, q14, d0[0] + vmul.s32 q10, q15, d0[0] + vmls.s32 q11, \c3, d1[1] + vrshr.s32 q13, q13, #14 + vrshr.s32 q12, q12, #14 + vrshr.s32 q10, q10, #14 + vrshr.s32 q11, q11, #14 + vadd.i32 \c0, q12, q13 + vsub.i32 \c3, q12, q13 + vadd.i32 \c1, q10, q11 + vsub.i32 \c2, q10, q11 +.endm + +.macro idct4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7 + vmull.s32 q13, \cd2, d1[1] + vmull.s32 q15, \cd3, d1[1] + vmull.s32 q11, \cd2, d1[0] + vmull.s32 q3, \cd3, d1[0] + vadd.i32 q14, \c0, \c2 + vsub.i32 q2, \c0, \c2 + vmlal.s32 q13, \cd6, d1[0] + vmlal.s32 q15, \cd7, d1[0] + vmull.s32 q12, d28, d0[0] + vmull.s32 q14, d29, d0[0] + vmull.s32 q10, d4, d0[0] + vmull.s32 q8, d5, d0[0] + vmlsl.s32 q11, \cd6, d1[1] + vmlsl.s32 q3, \cd7, d1[1] + vrshrn.s64 d26, q13, #14 + vrshrn.s64 d27, q15, #14 + vrshrn.s64 d24, q12, #14 + vrshrn.s64 d25, q14, #14 + vrshrn.s64 d20, q10, #14 + vrshrn.s64 d21, q8, #14 + vrshrn.s64 d22, q11, #14 + vrshrn.s64 d23, q3, #14 + vadd.i32 \c0, q12, q13 + vsub.i32 \c3, q12, q13 + vadd.i32 \c1, q10, q11 + vsub.i32 \c2, q10, 
q11 +.endm + +.macro iadst4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7 + vmul.s32 q10, \c0, d2[0] + vmla.s32 q10, \c2, d2[1] + vmla.s32 q10, \c3, d3[0] + vmul.s32 q11, \c0, d3[0] + vmls.s32 q11, \c2, d2[0] + vsub.s32 \c0, \c0, \c2 + vmls.s32 q11, \c3, d2[1] + vadd.s32 \c0, \c0, \c3 + vmul.s32 q13, \c1, d3[1] + vmul.s32 q12, \c0, d3[1] + vadd.s32 q14, q10, q13 + vadd.s32 q15, q11, q13 + vrshr.s32 \c0, q14, #14 + vadd.s32 q10, q10, q11 + vrshr.s32 \c1, q15, #14 + vsub.s32 q10, q10, q13 + vrshr.s32 \c2, q12, #14 + vrshr.s32 \c3, q10, #14 +.endm + +.macro iadst4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7 + vmull.s32 q10, \cd0, d2[0] + vmull.s32 q4, \cd1, d2[0] + vmlal.s32 q10, \cd4, d2[1] + vmlal.s32 q4, \cd5, d2[1] + vmlal.s32 q10, \cd6, d3[0] + vmlal.s32 q4, \cd7, d3[0] + vmull.s32 q11, \cd0, d3[0] + vmull.s32 q5, \cd1, d3[0] + vmlsl.s32 q11, \cd4, d2[0] + vmlsl.s32 q5, \cd5, d2[0] + vsub.s32 \c0, \c0, \c2 + vmlsl.s32 q11, \cd6, d2[1] + vmlsl.s32 q5, \cd7, d2[1] + vadd.s32 \c0, \c0, \c3 + vmull.s32 q13, \cd2, d3[1] + vmull.s32 q6, \cd3, d3[1] + vmull.s32 q12, \cd0, d3[1] + vmull.s32 q7, \cd1, d3[1] + vadd.s64 q14, q10, q13 + vadd.s64 q2, q4, q6 + vadd.s64 q15, q11, q13 + vadd.s64 q3, q5, q6 + vrshrn.s64 \cd1, q2, #14 + vrshrn.s64 \cd0, q14, #14 + vadd.s64 q10, q10, q11 + vadd.s64 q4, q4, q5 + vrshrn.s64 \cd3, q3, #14 + vrshrn.s64 \cd2, q15, #14 + vsub.s64 q10, q10, q13 + vsub.s64 q4, q4, q6 + vrshrn.s64 \cd4, q12, #14 + vrshrn.s64 \cd5, q7, #14 + vrshrn.s64 \cd6, q10, #14 + vrshrn.s64 \cd7, q4, #14 +.endm + +@ The public functions in this file have got the following signature: +@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +.macro itxfm_func4x4 txfm1, txfm2, bpp +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1 +.ifc \txfm1,\txfm2 +.ifc \txfm1,idct + movrel r12, itxfm4_coeffs + vld1.16 {d0}, [r12,:64] + vmovl.s16 q0, d0 +.endif +.ifc \txfm1,iadst + movrel r12, iadst4_coeffs + vld1.16 {d1}, 
[r12,:64] + vmovl.s16 q1, d1 +.endif +.else + movrel r12, itxfm4_coeffs + vld1.16 {q0}, [r12,:128] + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 +.endif +.if \bpp > 10 +.ifnc \txfm1\()_\txfm2,idct_idct + @ iadst4_12 needs q4-q7 + vpush {q4-q7} +.endif +.endif + + vmov.i32 q14, #0 + vmov.i32 q15, #0 +.ifc \txfm1\()_\txfm2,idct_idct + cmp r3, #1 + bne 1f + @ DC-only for idct/idct + vld1.32 {d4[]}, [r2,:32] + vmull.s32 q2, d4, d0[0] + vrshrn.s64 d4, q2, #14 + vmull.s32 q2, d4, d0[0] + vrshrn.s64 d4, q2, #14 + vst1.32 {d30[0]}, [r2,:32] + vdup.32 q2, d4[0] + vmov q3, q2 + vmov q8, q2 + vmov q9, q2 + b 2f +.endif + +1: + vld1.32 {q2-q3}, [r2,:128] + vst1.32 {q14-q15}, [r2,:128]! + vld1.32 {q8-q9}, [r2,:128] + +.ifc \txfm1,iwht + vshr.s32 q2, q2, #2 + vshr.s32 q3, q3, #2 + vshr.s32 q8, q8, #2 + vshr.s32 q9, q9, #2 +.endif + + vst1.16 {q14-q15}, [r2,:128]! + \txfm1\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19 + + @ Transpose 4x4 with 32 bit elements + vtrn.32 q2, q3 + vtrn.32 q8, q9 + vswp d5, d16 + vswp d7, d18 + + \txfm2\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19 +2: + vmvn.u16 q15, #((0xffff << \bpp) & 0xffff) + vld1.16 {d0}, [r0,:64], r1 + vld1.16 {d1}, [r0,:64], r1 +.ifnc \txfm1,iwht + vrshr.s32 q2, q2, #4 + vrshr.s32 q3, q3, #4 + vrshr.s32 q8, q8, #4 + vrshr.s32 q9, q9, #4 +.endif + vaddw.u16 q2, q2, d0 + vaddw.u16 q3, q3, d1 + vld1.16 {d2}, [r0,:64], r1 + vld1.16 {d3}, [r0,:64], r1 + vqmovun.s32 d0, q2 + vqmovun.s32 d1, q3 + sub r0, r0, r1, lsl #2 + + vaddw.u16 q8, q8, d2 + vmin.u16 q0, q0, q15 + vaddw.u16 q9, q9, d3 + vst1.16 {d0}, [r0,:64], r1 + vqmovun.s32 d2, q8 + vqmovun.s32 d3, q9 + vmin.u16 q1, q1, q15 + + vst1.16 {d1}, [r0,:64], r1 + vst1.16 {d2}, [r0,:64], r1 + vst1.16 {d3}, [r0,:64], r1 + +.if \bpp > 10 +.ifnc \txfm1\()_\txfm2,idct_idct + vpop {q4-q7} +.endif +.endif + bx lr +endfunc +.endm + +.macro itxfm_funcs4x4 bpp +itxfm_func4x4 idct, idct, \bpp +itxfm_func4x4 iadst, idct, \bpp +itxfm_func4x4 idct, iadst, \bpp +itxfm_func4x4 
iadst, iadst, \bpp +itxfm_func4x4 iwht, iwht, \bpp +.endm + +itxfm_funcs4x4 10 +itxfm_funcs4x4 12 + +.macro idct8 + dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a + dmbutterfly d20, d21, d28, d29, d1[0], d1[1], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a + dmbutterfly d18, d19, d30, d31, d2[0], d2[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a + dmbutterfly d26, d27, d22, d23, d3[0], d3[1], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a + + butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3 + butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2 + butterfly q4, q13, q9, q13 @ q4 = t4, q13 = t5a + butterfly q5, q11, q15, q11 @ q5 = t7, q11 = t6a + + butterfly q8, q15, q2, q5 @ q8 = out[0], q15 = out[7] + + dmbutterfly0 d4, d5, d10, d11, q11, q13, q9, q13, d18, d19, d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5 + + butterfly q11, q12, q14, q4 @ q11 = out[3], q12 = out[4] + butterfly q9, q14, q3, q2 @ q9 = out[1], q14 = out[6] + butterfly_r q13, q10, q10, q5 @ q13 = out[5], q10 = out[2] +.endm + +.macro iadst8 + movrel r12, iadst8_coeffs + vld1.16 {q1}, [r12,:128]! 
+ vmovl.s16 q0, d2 + vmovl.s16 q1, d3 + + dmbutterfly_l q4, q5, q2, q3, d30, d31, d16, d17, d0[1], d0[0] @ q4,q5 = t1a, q2,q3 = t0a + dmbutterfly_l q8, q15, q6, q7, d22, d23, d24, d25, d2[1], d2[0] @ q8,q15 = t5a, q6,q7 = t4a + + dbutterfly_n d22, d23, d4, d5, q2, q3, q6, q7, q11, q12, q2, q3 @ q11 = t0, q2 = t4 + + dbutterfly_n d24, d25, d6, d7, q4, q5, q8, q15, q12, q3, q6, q7 @ q12 = t1, q3 = t5 + + dmbutterfly_l q6, q7, q4, q5, d26, d27, d20, d21, d1[1], d1[0] @ q6,q7 = t3a, q4,q5 = t2a + dmbutterfly_l q10, q13, q8, q15, d18, d19, d28, d29, d3[1], d3[0] @ q10,q13 = t7a, q8,q15 = t6a + + dbutterfly_n d18, d19, d8, d9, q4, q5, q8, q15, q9, q14, q4, q5 @ q9 = t2, q4 = t6 + dbutterfly_n d16, d17, d12, d13, q6, q7, q10, q13, q8, q15, q6, q7 @ q8 = t3, q6 = t7 + + movrel r12, idct_coeffs + vld1.16 {q0}, [r12,:128] + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + + butterfly q15, q12, q12, q8 @ q15 = -out[7], q12 = t3 + vneg.s32 q15, q15 @ q15 = out[7] + butterfly q8, q9, q11, q9 @ q8 = out[0], q9 = t2 + + dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d1[0], d1[1] @ q10,q11 = t5a, q5,q7 = t4a + dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d1[1], d1[0] @ q2,q3 = t6a, q13,q14 = t7a + + dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7 + + dmbutterfly0 d22, d23, d24, d25, q9, q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4] + vneg.s32 q11, q11 @ q11 = out[3] + + dbutterfly_n d18, d19, d4, d5, q5, q7, q2, q3, q9, q10, q2, q3 @ q9 = -out[1], q2 = t6 + vneg.s32 q9, q9 @ q9 = out[1] + + dmbutterfly0 d20, d21, d26, d27, q2, q4, q3, q5, d6, d7, d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5] + vneg.s32 q13, q13 @ q13 = out[5] +.endm + +function idct8x8_dc_add_neon + movrel r12, idct_coeffs + vld1.16 {d0}, [r12,:64] + + vmov.i32 q2, #0 + vmovl.s16 q0, d0 + + vld1.32 {d16[]}, [r2,:32] + vmull.s32 q8, d16, d0[0] + vrshrn.s64 d16, q8, #14 + vmull.s32 q8, d16, d0[0] + vrshrn.s64 d16, q8, #14 + vdup.32 q8, d16[0] + 
vst1.32 {d4[0]}, [r2,:32] + + vrshr.s32 q8, q8, #5 + vdup.s16 q15, r8 + + mov r3, r0 + mov r12, #8 +1: + @ Loop to add the constant from q8 into all 8x8 outputs + subs r12, r12, #2 + vld1.16 {q2}, [r0,:128], r1 + vaddw.u16 q10, q8, d4 + vld1.16 {q3}, [r0,:128], r1 + vaddw.u16 q11, q8, d5 + vaddw.u16 q12, q8, d6 + vaddw.u16 q13, q8, d7 + vqmovun.s32 d4, q10 + vqmovun.s32 d5, q11 + vqmovun.s32 d6, q12 + vqmovun.s32 d7, q13 + vmin.u16 q2, q2, q15 + vst1.16 {q2}, [r3,:128], r1 + vmin.u16 q3, q3, q15 + vst1.16 {q3}, [r3,:128], r1 + bne 1b + + pop {r4-r8,pc} +endfunc +.ltorg + +.macro itxfm8_1d_funcs txfm +@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it, +@ transpose into a horizontal 8x4 slice and store. +@ r0 = dst (temp buffer) +@ r1 = slice offset +@ r2 = src +function \txfm\()8_1d_4x8_pass1_neon + mov r12, #32 + vmov.s32 q2, #0 +.irp i, 8, 9, 10, 11, 12, 13, 14, 15 + vld1.32 {q\i}, [r2,:128] + vst1.32 {q2}, [r2,:128], r12 +.endr + + \txfm\()8 + + @ Do two 4x4 transposes. Originally, q8-q15 contain the + @ 8 rows. Afterwards, q8-q11, q12-q15 contain the transposed + @ 4x4 blocks. + transpose32_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + @ Store the transposed 4x4 blocks horizontally. + cmp r1, #4 + beq 1f +.irp i, 8, 12, 9, 13, 10, 14, 11, 15 + vst1.32 {q\i}, [r0,:128]! +.endr + bx lr +1: + @ Special case: For the last input column (r1 == 4), + @ which would be stored as the last row in the temp buffer, + @ don't store the first 4x4 block, but keep it in registers + @ for the first slice of the second pass (where it is the + @ last 4x4 block). +.irp i, 12, 13, 14, 15 + add r0, r0, #16 + vst1.32 {q\i}, [r0,:128]! +.endr + vmov q12, q8 + vmov q13, q9 + vmov q14, q10 + vmov q15, q11 + bx lr +endfunc + +@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it, +@ load the destination pixels (from a similar 4x8 slice), add and store back. 
+@ r0 = dst +@ r1 = dst stride +@ r2 = src (temp buffer) +@ r3 = slice offset +function \txfm\()8_1d_4x8_pass2_neon + mov r12, #32 +.irp i, 8, 9, 10, 11 + vld1.32 {q\i}, [r2,:128], r12 +.endr + cmp r3, #0 + beq 1f +.irp i, 12, 13, 14, 15 + vld1.32 {q\i}, [r2,:128], r12 +.endr +1: + + add r3, r0, r1 + lsl r1, r1, #1 + \txfm\()8 + + vdup.s16 q4, r8 +.macro load_add_store coef0, coef1, coef2, coef3 + vld1.16 {d4}, [r0,:64], r1 + vld1.16 {d5}, [r3,:64], r1 + vld1.16 {d6}, [r0,:64], r1 + vld1.16 {d7}, [r3,:64], r1 + + vrshr.s32 \coef0, \coef0, #5 + vrshr.s32 \coef1, \coef1, #5 + vrshr.s32 \coef2, \coef2, #5 + vrshr.s32 \coef3, \coef3, #5 + + vaddw.u16 \coef0, \coef0, d4 + vaddw.u16 \coef1, \coef1, d5 + vaddw.u16 \coef2, \coef2, d6 + vaddw.u16 \coef3, \coef3, d7 + + sub r0, r0, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + vqmovun.s32 d4, \coef0 + vqmovun.s32 d5, \coef1 + vqmovun.s32 d6, \coef2 + vqmovun.s32 d7, \coef3 + + vmin.u16 q2, q2, q4 + vmin.u16 q3, q3, q4 + + vst1.16 {d4}, [r0,:64], r1 + vst1.16 {d5}, [r3,:64], r1 + vst1.16 {d6}, [r0,:64], r1 + vst1.16 {d7}, [r3,:64], r1 +.endm + load_add_store q8, q9, q10, q11 + load_add_store q12, q13, q14, q15 +.purgem load_add_store + + bx lr +endfunc +.endm + +itxfm8_1d_funcs idct +itxfm8_1d_funcs iadst + +.macro itxfm_func8x8 txfm1, txfm2 +function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +.ifc \txfm1\()_\txfm2,idct_idct + cmp r3, #1 + beq idct8x8_dc_add_neon +.endif +.ifnc \txfm1\()_\txfm2,idct_idct + vpush {q4-q7} +.else + vpush {q4-q5} +.endif + + @ Align the stack, allocate a temp buffer +T mov r7, sp +T and r7, r7, #15 +A and r7, sp, #15 + add r7, r7, #256 + sub sp, sp, r7 + + mov r4, r0 + mov r5, r1 + mov r6, r2 + +.ifc \txfm1,idct + movrel r12, idct_coeffs + vld1.16 {q0}, [r12,:128] + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 +.endif + +.irp i, 0, 4 + add r0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i == 4 + cmp r3, #12 + ble 1f +.endif +.endif + mov r1, #\i + add r2, r6, #(\i*4) + bl \txfm1\()8_1d_4x8_pass1_neon 
+.endr +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + @ For all-zero slices in pass 1, set q12-q15 to zero, for the in-register + @ passthrough of coefficients to pass 2 and clear the end of the temp buffer + vmov.i32 q12, #0 + vmov.i32 q13, #0 + vmov.i32 q14, #0 + vmov.i32 q15, #0 +.rept 4 + vst1.32 {q12-q13}, [r0,:128]! +.endr +3: +.endif +.ifc \txfm1\()_\txfm2,iadst_idct + movrel r12, idct_coeffs + vld1.16 {q0}, [r12,:128] + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 +.endif +.irp i, 0, 4 + add r0, r4, #(\i*2) + mov r1, r5 + add r2, sp, #(\i*4) + mov r3, #\i + bl \txfm2\()8_1d_4x8_pass2_neon +.endr + + add sp, sp, r7 +.ifnc \txfm1\()_\txfm2,idct_idct + vpop {q4-q7} +.else + vpop {q4-q5} +.endif + pop {r4-r8,pc} +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1 + push {r4-r8,lr} + movw r8, #0x03ff + b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1 + push {r4-r8,lr} + movw r8, #0x0fff + b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +endfunc +.endm + +itxfm_func8x8 idct, idct +itxfm_func8x8 iadst, idct +itxfm_func8x8 idct, iadst +itxfm_func8x8 iadst, iadst + +function idct16x16_dc_add_neon + movrel r12, idct_coeffs + vld1.16 {d0}, [r12,:64] + + vmov.i32 q2, #0 + vmovl.s16 q0, d0 + + vld1.32 {d16[]}, [r2,:32] + vmull.s32 q8, d16, d0[0] + vrshrn.s64 d16, q8, #14 + vmull.s32 q8, d16, d0[0] + vrshrn.s64 d16, q8, #14 + vdup.32 q8, d16[0] + vst1.32 {d4[0]}, [r2,:32] + + vrshr.s32 q8, q8, #6 + vdup.s16 q15, r9 + + mov r3, r0 + mov r12, #16 +1: + @ Loop to add the constant from q8 into all 16x16 outputs + subs r12, r12, #2 + vld1.16 {q0-q1}, [r0,:128], r1 + vaddw.u16 q9, q8, d0 + vaddw.u16 q10, q8, d1 + vld1.16 {q2-q3}, [r0,:128], r1 + vaddw.u16 q11, q8, d2 + vaddw.u16 q12, q8, d3 + vaddw.u16 q13, q8, d4 + vaddw.u16 q14, q8, d5 + vqmovun.s32 d0, q9 + vaddw.u16 q9, q8, d6 + vqmovun.s32 d1, q10 + vaddw.u16 q10, q8, d7 + vqmovun.s32 d2, q11 + vqmovun.s32 d3, q12 + vqmovun.s32 d4, q13 + vqmovun.s32 
d5, q14 + vmin.u16 q0, q0, q15 + vmin.u16 q1, q1, q15 + vqmovun.s32 d6, q9 + vqmovun.s32 d7, q10 + vst1.16 {q0-q1}, [r3,:128], r1 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 + vst1.16 {q2-q3}, [r3,:128], r1 + bne 1b + + pop {r4-r9,pc} +endfunc +.ltorg + +.macro idct16_end + butterfly d18, d11, d8, d11 @ d18 = t0a, d11 = t7a + butterfly d19, d22, d9, d22 @ d19 = t1a, d22 = t6 + butterfly d8, d26, d20, d26 @ d8 = t2a, d26 = t5 + butterfly d9, d10, d28, d10 @ d9 = t3a, d10 = t4 + butterfly d20, d28, d16, d24 @ d20 = t8a, d28 = t11a + butterfly d24, d21, d23, d21 @ d24 = t9, d21 = t10 + butterfly d23, d27, d25, d27 @ d23 = t14, d27 = t13 + butterfly d25, d29, d29, d17 @ d25 = t15a, d29 = t12a + + mbutterfly0 d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a + mbutterfly0 d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12, d28 = t11 + + vswp d27, d29 @ d27 = t12, d29 = t13a + vswp d28, d27 @ d28 = t12, d27 = t11 + butterfly d16, d31, d18, d25 @ d16 = out[0], d31 = out[15] + butterfly d17, d30, d19, d23 @ d17 = out[1], d30 = out[14] + butterfly_r d25, d22, d22, d24 @ d25 = out[9], d22 = out[6] + butterfly d23, d24, d11, d20 @ d23 = out[7], d24 = out[8] + butterfly d18, d29, d8, d29 @ d18 = out[2], d29 = out[13] + butterfly d19, d28, d9, d28 @ d19 = out[3], d28 = out[12] + vmov d8, d21 @ d8 = t10a + butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11] + butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10] + bx lr +.endm + +function idct16 + mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a + mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a + mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a + mbutterfly d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a + mbutterfly d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a + mbutterfly d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a + mbutterfly d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a + mbutterfly d29, d19, d7[0], d7[1], 
q4, q5 @ d29 = t11a, d19 = t12a + + butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3 + butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2 + butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5 + butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6 + butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 + butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 + butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 + butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 + + mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a + mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + idct16_end +endfunc + +function idct16_half + mbutterfly0_h d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a + mbutterfly_h1 d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a + mbutterfly_h1 d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a + mbutterfly_h2 d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a + mbutterfly_h1 d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a + mbutterfly_h2 d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a + mbutterfly_h1 d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a + mbutterfly_h2 d29, d19, d7[0], d7[1], q4, q5 @ d29 = t11a, d19 = t12a + + butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3 + butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2 + butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5 + butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6 + butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 + butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 + butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 + butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 + + mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a + mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + idct16_end +endfunc + +function idct16_quarter + vmov.s64 q12, 
#0 + vmull.s32 q4, d17, d4[0] + vmull.s32 q5, d18, d2[1] + vmull.s32 q15, d18, d2[0] + vmlsl.s32 q12, d19, d7[1] + vmull.s32 q14, d17, d4[1] + vmull.s32 q13, d19, d7[0] + vmull.s32 q11, d16, d0[0] + vrshrn.s64 d16, q4, #14 + vrshrn.s64 d11, q5, #14 + vrshrn.s64 d10, q15, #14 + vrshrn.s64 d24, q12, #14 + vrshrn.s64 d29, q14, #14 + vrshrn.s64 d17, q13, #14 + vrshrn.s64 d28, q11, #14 + + mbutterfly_l q10, q11, d17, d24, d1[0], d1[1], neg=1 + mbutterfly_l q9, q15, d29, d16, d1[0], d1[1] + vrshrn.s64 d27, q10, #14 + vrshrn.s64 d21, q11, #14 + vrshrn.s64 d23, q9, #14 + vrshrn.s64 d25, q15, #14 + vmov d8, d28 + vmov d9, d28 + mbutterfly0 d22, d26, d11, d10, d18, d30, q9, q15 + vmov d20, d28 + idct16_end +endfunc + +function iadst16 + movrel r12, iadst16_coeffs + vld1.16 {q0}, [r12,:128]! + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + + mbutterfly_l q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0 + mbutterfly_l q5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = t8 + butterfly_n d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a + mbutterfly_l q7, q6, d29, d18, d1[1], d1[0] @ q7 = t3, q6 = t2 + butterfly_n d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a + mbutterfly_l q3, q2, d21, d26, d3[1], d3[0] @ q3 = t11, q2 = t10 + + vld1.16 {q0}, [r12,:128]! 
+ butterfly_n d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + mbutterfly_l q5, q4, d27, d20, d0[1], d0[0] @ q5 = t5, q4 = t4 + butterfly_n d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a + + mbutterfly_l q7, q6, d19, d28, d2[1], d2[0] @ q7 = t13, q6 = t12 + butterfly_n d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a + mbutterfly_l q3, q2, d25, d22, d1[1], d1[0] @ q3 = t7, q2 = t6 + butterfly_n d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a + + mbutterfly_l q5, q4, d17, d30, d3[1], d3[0] @ q5 = t15, q4 = t14 + movrel r12, idct_coeffs + vld1.16 {q0}, [r12,:128] + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a + mbutterfly_l q7, q6, d23, d24, d2[0], d2[1] @ q7 = t9, q6 = t8 + butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a + + mbutterfly_l q2, q3, d28, d19, d2[1], d2[0] @ q2 = t12, q3 = t13 + butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a + mbutterfly_l q5, q4, d21, d26, d3[0], d3[1] @ q5 = t11, q4 = t10 + butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = t0 + butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a + + mbutterfly_l q6, q7, d30, d17, d3[1], d3[0] @ q6 = t14, q7 = t15 + butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = t1 + butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a + butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a + + butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = t2 + butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = t3 + + mbutterfly_l q5, q4, d19, d28, d1[0], d1[1] @ q5 = t13, q4 = t12 + mbutterfly_l q6, q7, d30, d17, d1[1], d1[0] @ q6 = t14, q7 = t15 + + butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a + butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a + vneg.s32 d29, d29 @ d29 = out[13] + + mbutterfly_l q5, q4, d4, d5, d1[0], d1[1] @ q5 = t5a, q4 = t4a + mbutterfly_l q6, q7, d7, d6, d1[1], d1[0] @ q6 = t6a, q7 = t7a + + butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = 
t2a + butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = t10 + + butterfly_n d19, d31, q4, q6, q2, q4 @ d19 = -out[3], d31 = t6 + vneg.s32 d19, d19 @ d19 = out[3] + butterfly_n d28, d16, q5, q7, q2, q5 @ d28 = out[12], d16 = t7 + + butterfly d5, d8, d20, d22 @ d5 =-out[15],d8 = t3a + butterfly d4, d9, d24, d26 @ d4 = out[14],d9 = t11 + + mbutterfly0 d23, d24, d6, d8, d10, d11, q6, q7, 1 @ d23 = out[7], d24 = out[8] + mbutterfly0 d20, d27, d16, d31, d10, d11, q6, q7 @ d20 = out[4], d27 = out[11] + mbutterfly0 d22, d25, d9, d7, d10, d11, q6, q7 @ d22 = out[6], d25 = out[9] + mbutterfly0 d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = out[5], d26 = out[10] + + vneg.s32 d31, d5 @ d31 = out[15] + vneg.s32 d17, d3 @ d17 = out[1] + + vmov d16, d2 + vmov d30, d4 + bx lr +endfunc + +.macro itxfm16_1d_funcs txfm, suffix +@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it, +@ transpose into a horizontal 16x2 slice and store. +@ r0 = dst (temp buffer) +@ r2 = src +function \txfm\()16_1d_2x16_pass1\suffix\()_neon + push {lr} + + mov r12, #64 + vmov.s32 q4, #0 +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vld1.32 {d\i}, [r2,:64] + vst1.32 {d8}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.32 {d\i}, [r2,:64] + vst1.32 {d8}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.32 {d\i}, [r2,:64] + vst1.32 {d8}, [r2,:64], r12 +.endr +.endif + + bl \txfm\()16\suffix + + @ Do eight 2x2 transposes. Originally, d16-d31 contain the + @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight + @ transposed 2x2 blocks. + transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + @ Store the transposed 2x2 blocks horizontally. +.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31 + vst1.32 {d\i}, [r0,:64]! 
+.endr + pop {pc} +endfunc + +@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it, +@ load the destination pixels (from a similar 2x16 slice), add and store back. +@ r0 = dst +@ r1 = dst stride +@ r2 = src (temp buffer) +function \txfm\()16_1d_2x16_pass2\suffix\()_neon + push {lr} + + mov r12, #64 +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vld1.16 {d\i}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19, 20 + vld1.16 {d\i}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64], r12 +.endr +.endif + + add r3, r0, r1 + lsl r1, r1, #1 + bl \txfm\()16\suffix + +.macro load_add_store coef0, coef1, coef2, coef3 + vrshr.s32 \coef0, \coef0, #6 + vrshr.s32 \coef1, \coef1, #6 + + vld1.32 {d8[]}, [r0,:32], r1 + vld1.32 {d8[1]}, [r3,:32], r1 + vrshr.s32 \coef2, \coef2, #6 + vrshr.s32 \coef3, \coef3, #6 + vld1.32 {d9[]}, [r0,:32], r1 + vld1.32 {d9[1]}, [r3,:32], r1 + vaddw.u16 \coef0, \coef0, d8 + vld1.32 {d10[]}, [r0,:32], r1 + vld1.32 {d10[1]}, [r3,:32], r1 + vaddw.u16 \coef1, \coef1, d9 + vld1.32 {d11[]}, [r0,:32], r1 + vld1.32 {d11[1]}, [r3,:32], r1 + + vqmovun.s32 d8, \coef0 + vdup.s16 q8, r9 + vqmovun.s32 d9, \coef1 + sub r0, r0, r1, lsl #2 + sub r3, r3, r1, lsl #2 + vaddw.u16 \coef2, \coef2, d10 + vaddw.u16 \coef3, \coef3, d11 + vmin.u16 q4, q4, q8 + vst1.32 {d8[0]}, [r0,:32], r1 + vst1.32 {d8[1]}, [r3,:32], r1 + vqmovun.s32 d10, \coef2 + vst1.32 {d9[0]}, [r0,:32], r1 + vst1.32 {d9[1]}, [r3,:32], r1 + vqmovun.s32 d11, \coef3 + vmin.u16 q5, q5, q8 + + vst1.32 {d10[0]}, [r0,:32], r1 + vst1.32 {d10[1]}, [r3,:32], r1 + vst1.32 {d11[0]}, [r0,:32], r1 + vst1.32 {d11[1]}, [r3,:32], r1 +.endm + load_add_store q8, q9, q10, q11 + load_add_store q12, q13, q14, q15 +.purgem load_add_store + + pop {pc} +endfunc +.endm + +itxfm16_1d_funcs idct +itxfm16_1d_funcs iadst +itxfm16_1d_funcs idct, _quarter +itxfm16_1d_funcs idct, _half 
+.ltorg + +@ This is the minimum eob value for each subpartition, in increments of 2 +const min_eob_idct_idct_16, align=4 + .short 0, 3, 10, 22, 38, 62, 89, 121 +endconst + +.macro itxfm_func16x16 txfm1, txfm2 +function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +.ifc \txfm1\()_\txfm2,idct_idct + cmp r3, #1 + beq idct16x16_dc_add_neon +.endif +.ifnc \txfm1\()_\txfm2,idct_idct + vpush {q4-q7} +.else + vpush {q4-q5} +.endif + + @ Align the stack, allocate a temp buffer +T mov r7, sp +T and r7, r7, #15 +A and r7, sp, #15 + add r7, r7, #1024 + sub sp, sp, r7 + + mov r4, r0 + mov r5, r1 + mov r6, r2 + +.ifc \txfm1,idct + movrel r12, idct_coeffs + vld1.16 {q0-q1}, [r12,:128] + vmovl.s16 q2, d2 + vmovl.s16 q3, d3 + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 +.endif + +.ifc \txfm1\()_\txfm2,idct_idct + cmp r3, #10 + ble idct16x16_quarter_add_16_neon + cmp r3, #38 + ble idct16x16_half_add_16_neon + + movrel r8, min_eob_idct_idct_16 + 2 +.endif + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r0, sp, #(\i*64) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i > 0 + ldrh_post r1, r8, #2 + cmp r3, r1 + it le + movle r1, #(16 - \i)/2 + ble 1f +.endif +.endif + add r2, r6, #(\i*4) + bl \txfm1\()16_1d_2x16_pass1_neon +.endr + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + vmov.i32 q14, #0 + vmov.i32 q15, #0 +2: + subs r1, r1, #1 + @ Unroll for 2 lines +.rept 2 + @ Fill one line with zeros + vst1.32 {q14-q15}, [r0,:128]! + vst1.32 {q14-q15}, [r0,:128]! 
+.endr + bne 2b +3: +.endif + +.ifc \txfm1\()_\txfm2,iadst_idct + movrel r12, idct_coeffs + vld1.16 {q0-q1}, [r12,:128] + vmovl.s16 q2, d2 + vmovl.s16 q3, d3 + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 +.endif +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r0, r4, #(\i*2) + mov r1, r5 + add r2, sp, #(\i*4) + bl \txfm2\()16_1d_2x16_pass2_neon +.endr + + add sp, sp, r7 +.ifnc \txfm1\()_\txfm2,idct_idct + vpop {q4-q7} +.else + vpop {q4-q5} +.endif + pop {r4-r9,pc} +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1 + push {r4-r9,lr} + movw r9, #0x03ff + b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1 + push {r4-r9,lr} + movw r9, #0x0fff + b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +endfunc +.endm + +itxfm_func16x16 idct, idct +itxfm_func16x16 iadst, idct +itxfm_func16x16 idct, iadst +itxfm_func16x16 iadst, iadst +.ltorg + +.macro idct16_partial size +function idct16x16_\size\()_add_16_neon +.irp i, 0, 2 + add r0, sp, #(\i*64) +.ifc \size,quarter +.if \i == 2 + cmp r3, #3 + ble 1f +.endif +.endif + add r2, r6, #(\i*4) + bl idct16_1d_2x16_pass1_\size\()_neon +.endr + +.ifc \size,half +.irp i, 4, 6 + add r0, sp, #(\i*64) +.if \i == 6 + cmp r3, #22 + ble 1f +.endif + add r2, r6, #(\i*4) + bl idct16_1d_2x16_pass1_\size\()_neon +.endr +.endif + + b 3f +1: + vmov.i32 q14, #0 + vmov.i32 q15, #0 + + @ Unroll for 2 lines +.rept 2 + @ Fill one line with zeros + vst1.32 {q14-q15}, [r0,:128]! + vst1.32 {q14-q15}, [r0,:128]! 
+.endr + +3: + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r0, r4, #(\i*2) + mov r1, r5 + add r2, sp, #(\i*4) + bl idct16_1d_2x16_pass2_\size\()_neon +.endr + + add sp, sp, r7 + vpop {q4-q5} + pop {r4-r9,pc} +endfunc +.endm + +idct16_partial quarter +idct16_partial half + +function idct32x32_dc_add_neon + movrel r12, idct_coeffs + vld1.16 {d0}, [r12,:64] + + vmov.i32 q2, #0 + vmovl.s16 q0, d0 + + vld1.32 {d16[]}, [r2,:32] + vmull.s32 q8, d16, d0[0] + vrshrn.s64 d16, q8, #14 + vmull.s32 q8, d16, d0[0] + vrshrn.s64 d16, q8, #14 + vdup.32 q8, d16[0] + vst1.32 {d4[0]}, [r2,:32] + + vrshr.s32 q8, q8, #6 + vdup.s16 q15, r9 + + mov r3, r0 + mov r12, #32 + sub r1, r1, #32 +1: + @ Loop to add the constant from q8 into all 32x32 outputs + subs r12, r12, #1 + vld1.16 {q0-q1}, [r0,:128]! + vaddw.u16 q9, q8, d0 + vaddw.u16 q10, q8, d1 + vld1.16 {q2-q3}, [r0,:128], r1 + vaddw.u16 q11, q8, d2 + vaddw.u16 q12, q8, d3 + vaddw.u16 q13, q8, d4 + vaddw.u16 q14, q8, d5 + vqmovun.s32 d0, q9 + vaddw.u16 q9, q8, d6 + vqmovun.s32 d1, q10 + vaddw.u16 q10, q8, d7 + vqmovun.s32 d2, q11 + vqmovun.s32 d3, q12 + vqmovun.s32 d4, q13 + vqmovun.s32 d5, q14 + vmin.u16 q0, q0, q15 + vmin.u16 q1, q1, q15 + vqmovun.s32 d6, q9 + vqmovun.s32 d7, q10 + vst1.16 {q0-q1}, [r3,:128]! 
+ vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 + vst1.16 {q2-q3}, [r3,:128], r1 + bne 1b + + pop {r4-r9,pc} +endfunc + +.macro idct32_end + butterfly d16, d9, d8, d9 @ d16 = t16a, d9 = t19a + butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 + butterfly d18, d10, d11, d10 @ d18 = t23a, d10 = t20a + butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21 + butterfly d8, d28, d28, d30 @ d8 = t24a, d28 = t27a + butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26 + butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a + butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 + + mbutterfly d27, d20, d1[0], d1[1], q12, q15 @ d27 = t18a, d20 = t29a + mbutterfly d29, d9, d1[0], d1[1], q12, q15 @ d29 = t19, d9 = t28 + mbutterfly d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27, d10 = t20 + mbutterfly d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a + + butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24 + butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a + butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16 + butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a + butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21 + butterfly_r d27, d28, d9, d28 @ d27 = t27a, d28 = t28a + butterfly d8, d26, d20, d26 @ d8 = t29, d26 = t26 + butterfly d19, d20, d29, d10 @ d19 = t19a, d20 = t20 + vmov d29, d8 @ d29 = t29 + + mbutterfly0 d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27, d20 = t20 + mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a + mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22 + mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a + bx lr +.endm + +function idct32_odd + movrel r12, idct_coeffs + + @ Overwrite the idct16 coeffs with the stored ones for idct32 + vmovl.s16 q0, d12 + vmovl.s16 q1, d13 + vmovl.s16 q2, d14 + vmovl.s16 q3, d15 + + mbutterfly d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a + mbutterfly d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a + mbutterfly 
d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a + mbutterfly d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a + mbutterfly d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a + mbutterfly d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a + mbutterfly d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a + mbutterfly d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a + + @ Reload the idct16 coefficients. We could swap the coefficients between + @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just + @ loading and lengthening. + vld1.16 {q0-q1}, [r12,:128] + + butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17 + butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18 + butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21 + butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22 + vmovl.s16 q2, d2 + vmovl.s16 q3, d3 + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 + butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 + butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 + butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 + + mbutterfly d23, d24, d2[0], d2[1], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d3[0], d3[1], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + idct32_end +endfunc + +function idct32_odd_half + movrel r12, idct_coeffs + + vmovl.s16 q0, d12 + vmovl.s16 q1, d13 + vmovl.s16 q2, d14 + vmovl.s16 q3, d15 + + mbutterfly_h1 d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a + mbutterfly_h2 d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a + mbutterfly_h1 d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a + mbutterfly_h2 d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a + mbutterfly_h1 d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a + mbutterfly_h2 d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = 
t26a + mbutterfly_h1 d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a + mbutterfly_h2 d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a + + vld1.16 {q0-q1}, [r12,:128] + + butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17 + butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18 + butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21 + butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22 + vmovl.s16 q2, d2 + vmovl.s16 q3, d3 + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 + butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 + butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 + butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 + + mbutterfly d23, d24, d2[0], d2[1], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d3[0], d3[1], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + idct32_end +endfunc + +function idct32_odd_quarter + movrel r12, idct_coeffs + + vmovl.s16 q0, d12 + vmovl.s16 q1, d13 + vmovl.s16 q2, d14 + vmovl.s16 q3, d15 + + vmov.s64 q14, #0 + vmov.s64 q5, #0 + + vmull.s32 q4, d16, d0[0] + vmlsl.s32 q14, d19, d3[1] + vmull.s32 q15, d16, d0[1] + vmull.s32 q11, d17, d7[0] + vmlsl.s32 q5, d17, d7[1] + vmull.s32 q13, d19, d3[0] + vmull.s32 q10, d18, d4[0] + vmull.s32 q12, d18, d4[1] + + vld1.16 {q0-q1}, [r12,:128] + + vrshrn.s64 d8, q4, #14 + vrshrn.s64 d9, q14, #14 + vrshrn.s64 d29, q15, #14 + vrshrn.s64 d28, q11, #14 + + vmovl.s16 q2, d2 + vmovl.s16 q3, d3 + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + + vrshrn.s64 d11, q5, #14 + vrshrn.s64 d31, q13, #14 + vrshrn.s64 d10, q10, #14 + vrshrn.s64 d30, q12, #14 + + mbutterfly_l q8, q9, d29, d8, d2[0], d2[1] + mbutterfly_l q13, q10, d31, d9, d2[0], d2[1], neg=1 + vrshrn.s64 d23, q8, #14 + vrshrn.s64 d24, q9, #14 + vrshrn.s64 d27, q13, #14 + vrshrn.s64 d20, q10, #14 + mbutterfly_l q8, q9, d30, d10, d3[0], d3[1] + vrshrn.s64 d21, 
q8, #14 + vrshrn.s64 d26, q9, #14 + mbutterfly_l q8, q9, d28, d11, d3[0], d3[1], neg=1 + vrshrn.s64 d25, q8, #14 + vrshrn.s64 d22, q9, #14 + + idct32_end +endfunc + +.macro idct32_funcs suffix +@ Do an 32-point IDCT of a 2x32 slice out of a 32x32 matrix. +@ We don't have register space to do a single pass IDCT of 2x32 though, +@ but the 32-point IDCT can be decomposed into two 16-point IDCTs; +@ a normal IDCT16 with every other input component (the even ones, with +@ each output written twice), followed by a separate 16-point IDCT +@ of the odd inputs, added/subtracted onto the outputs of the first idct16. +@ r0 = dst (temp buffer) +@ r1 = unused +@ r2 = src +function idct32_1d_2x32_pass1\suffix\()_neon + push {lr} + + @ Double stride of the input, since we only read every other line + mov r12, #256 + vmov.s32 d8, #0 + + @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vld1.32 {d\i}, [r2,:64] + vst1.32 {d8}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.32 {d\i}, [r2,:64] + vst1.32 {d8}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.32 {d\i}, [r2,:64] + vst1.32 {d8}, [r2,:64], r12 +.endr +.endif + + bl idct16\suffix + + @ Do eight 2x2 transposes. Originally, d16-d31 contain the + @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight + @ transposed 2x2 blocks. + transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + @ Store the registers a, b, c, d, e, f, g, h horizontally, followed + @ by the same registers h, g, f, e, d, c, b, a mirrored. +.macro store_rev a, b, c, d, e, f, g, h +.irp i, \a, \b, \c, \d, \e, \f, \g, \h + vst1.32 {d\i}, [r0,:64]! + vrev64.32 d\i, d\i +.endr +.irp i, \h, \g, \f, \e, \d, \c, \b, \a + vst1.32 {d\i}, [r0,:64]! 
+.endr +.endm + store_rev 16, 18, 20, 22, 24, 26, 28, 30 + store_rev 17, 19, 21, 23, 25, 27, 29, 31 + sub r0, r0, #256 +.purgem store_rev + + @ Move r2 back to the start of the input, and move + @ to the first odd row +.ifb \suffix + sub r2, r2, r12, lsl #4 +.endif +.ifc \suffix,_quarter + sub r2, r2, r12, lsl #2 +.endif +.ifc \suffix,_half + sub r2, r2, r12, lsl #3 +.endif + add r2, r2, #128 + + vmov.s32 d8, #0 + @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d8}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d8}, [r2,:64], r12 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d8}, [r2,:64], r12 +.endr +.endif + + bl idct32_odd\suffix + + transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + + @ Store the registers a, b, c, d, e, f, g, h horizontally, + @ adding into the output first, and then mirrored, subtracted + @ from the output. +.macro store_rev a, b, c, d, e, f, g, h +.irp i, \a, \b, \c, \d, \e, \f, \g, \h + vld1.32 {d8}, [r0,:64] + vadd.s32 d8, d8, d\i + vst1.32 {d8}, [r0,:64]! + vrev64.32 d\i, d\i +.endr +.irp i, \h, \g, \f, \e, \d, \c, \b, \a + vld1.32 {d8}, [r0,:64] + vsub.s32 d8, d8, d\i + vst1.32 {d8}, [r0,:64]! +.endr +.endm + + store_rev 31, 29, 27, 25, 23, 21, 19, 17 + store_rev 30, 28, 26, 24, 22, 20, 18, 16 +.purgem store_rev + pop {pc} +endfunc +.ltorg + +@ This is mostly the same as 2x32_pass1, but without the transpose, +@ and use the source as temp buffer between the two idct passes, and +@ add into the destination. +@ r0 = dst +@ r1 = dst stride +@ r2 = src (temp buffer) +function idct32_1d_2x32_pass2\suffix\()_neon + push {lr} + + mov r12, #256 + @ d16 = IN(0), d17 = IN(2) ... 
d31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vld1.32 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.32 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.32 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #3 +.endif + + bl idct16\suffix + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vst1.32 {d\i}, [r2,:64], r12 +.endr + + sub r2, r2, r12, lsl #4 + add r2, r2, #128 + + @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + vld1.32 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + vld1.32 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.32 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #3 +.endif + sub r2, r2, #128 + + bl idct32_odd\suffix + + @ Narrow the ict16 coefficients in q0-q3 into q0-q1, to + @ allow clobbering q2-q3 below. 
+ vmovn.s32 d0, q0 + vmovn.s32 d1, q1 + vmovn.s32 d2, q2 + vmovn.s32 d3, q3 + + mov r12, #256 + vdup.s16 q4, r9 +.macro load_acc_store a, b, c, d, neg=0 + vld1.32 {d4}, [r2,:64], r12 + vld1.32 {d5}, [r2,:64], r12 +.if \neg == 0 + vadd.s32 d4, d4, d\a + vld1.32 {d6}, [r2,:64], r12 + vadd.s32 d5, d5, d\b + vld1.32 {d7}, [r2,:64], r12 + vadd.s32 d6, d6, d\c + vadd.s32 d7, d7, d\d +.else + vsub.s32 d4, d4, d\a + vld1.32 {d6}, [r2,:64], r12 + vsub.s32 d5, d5, d\b + vld1.32 {d7}, [r2,:64], r12 + vsub.s32 d6, d6, d\c + vsub.s32 d7, d7, d\d +.endif + vld1.32 {d10[]}, [r0,:32], r1 + vld1.32 {d10[1]}, [r0,:32], r1 + vrshr.s32 q2, q2, #6 + vld1.32 {d11[]}, [r0,:32], r1 + vrshr.s32 q3, q3, #6 + vld1.32 {d11[1]}, [r0,:32], r1 + sub r0, r0, r1, lsl #2 + vaddw.u16 q2, q2, d10 + vaddw.u16 q3, q3, d11 + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q4 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + vst1.32 {d5[0]}, [r0,:32], r1 + vst1.32 {d5[1]}, [r0,:32], r1 +.endm + load_acc_store 31, 30, 29, 28 + load_acc_store 27, 26, 25, 24 + load_acc_store 23, 22, 21, 20 + load_acc_store 19, 18, 17, 16 + sub r2, r2, r12 + neg r12, r12 + load_acc_store 16, 17, 18, 19, 1 + load_acc_store 20, 21, 22, 23, 1 + load_acc_store 24, 25, 26, 27, 1 + load_acc_store 28, 29, 30, 31, 1 +.purgem load_acc_store + @ Lengthen the idct16 coeffs back into 32 bit form + vmovl.s16 q2, d2 + vmovl.s16 q3, d3 + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + pop {pc} +endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half + +const min_eob_idct_idct_32, align=4 + .short 0, 3, 9, 21, 34, 51, 70, 98, 135, 176, 240, 258, 336, 357, 448, 472 +endconst + +function vp9_idct_idct_32x32_add_16_neon + cmp r3, #1 + beq idct32x32_dc_add_neon + vpush {q4-q7} + movrel r8, min_eob_idct_idct_32 + 2 + + @ Align the stack, allocate a temp buffer +T mov r7, sp +T and r7, r7, #15 +A and r7, sp, #15 + add r7, r7, #4096 + sub sp, sp, r7 + + mov r4, r0 + mov r5, r1 + mov r6, r2 + + movrel r12, 
idct_coeffs + vld1.16 {q0-q1}, [r12,:128]! + vld1.16 {q6-q7}, [r12,:128] + vmovl.s16 q2, d2 + vmovl.s16 q3, d3 + vmovl.s16 q1, d1 + vmovl.s16 q0, d0 + + cmp r3, #34 + ble idct32x32_quarter_add_16_neon + cmp r3, #135 + ble idct32x32_half_add_16_neon + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r0, sp, #(\i*128) +.if \i > 0 + ldrh_post r1, r8, #2 + cmp r3, r1 + it le + movle r1, #(32 - \i)/2 + ble 1f +.endif + add r2, r6, #(\i*4) + bl idct32_1d_2x32_pass1_neon +.endr + b 3f + +1: + @ Write zeros to the temp buffer for pass 2 + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + subs r1, r1, #1 +.rept 2 + @ Fill one line with zeros + vst1.16 {q14-q15}, [r0,:128]! + vst1.16 {q14-q15}, [r0,:128]! + vst1.16 {q14-q15}, [r0,:128]! + vst1.16 {q14-q15}, [r0,:128]! +.endr + bne 2b +3: +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r0, r4, #(\i*2) + mov r1, r5 + add r2, sp, #(\i*4) + bl idct32_1d_2x32_pass2_neon +.endr + + add sp, sp, r7 + vpop {q4-q7} + pop {r4-r9,pc} +endfunc + +function ff_vp9_idct_idct_32x32_add_10_neon, export=1 + push {r4-r9,lr} + movw r9, #0x03ff + b vp9_idct_idct_32x32_add_16_neon +endfunc + +function ff_vp9_idct_idct_32x32_add_12_neon, export=1 + push {r4-r9,lr} + movw r9, #0x0fff + b vp9_idct_idct_32x32_add_16_neon +endfunc + +.macro idct32_partial size, rows +function idct32x32_\size\()_add_16_neon +.irp i, 0, 2, 4, 6 + add r0, sp, #(\i*128) +.ifc \size,quarter +.if \i > 0 + ldrh_post r1, r8, #2 + cmp r3, r1 + it le + movle r1, #(\rows - \i)/2 + ble 1f +.endif +.endif + add r2, r6, #(\i*4) + bl idct32_1d_2x32_pass1_\size\()_neon +.endr +.ifc \size,half + add r8, r8, #8 +.irp i, 8, 10, 12, 14 + add r0, sp, #(\i*128) +.if \i > 8 + ldrh_post r1, r8, #2 + cmp r3, r1 + it le + movle r1, #(\rows - \i)/2 + ble 1f +.endif + add r2, r6, #(\i*4) + bl idct32_1d_2x32_pass1_\size\()_neon +.endr +.endif + b 3f + +1: + @ Write zeros to the temp buffer for pass 2 + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + subs r1, r1, 
#1 +.rept 2 + @ Fill one line with zeros + vst1.16 {q14-q15}, [r0,:128]! + vst1.16 {q14-q15}, [r0,:128]! + vst1.16 {q14-q15}, [r0,:128]! + vst1.16 {q14-q15}, [r0,:128]! +.endr + bne 2b +3: +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r0, r4, #(\i*2) + mov r1, r5 + add r2, sp, #(\i*4) + bl idct32_1d_2x32_pass2_\size\()_neon +.endr + + add sp, sp, r7 + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +idct32_partial quarter, 8 +idct32_partial half, 16 diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 67a4754..6c09922 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp9lpf_16bpp_neon.S b/libavcodec/arm/vp9lpf_16bpp_neon.S new file mode 100644 index 0000000..7d2571d --- /dev/null +++ b/libavcodec/arm/vp9lpf_16bpp_neon.S @@ -0,0 +1,1044 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + vswp \r1, \r8 @ vtrn.64 \rq0, \rq4 + vswp \r3, \r10 @ vtrn.64 \rq1, \rq5 + vswp \r5, \r12 @ vtrn.64 \rq2, \rq6 + vswp \r7, \r14 @ vtrn.64 \rq3, \rq7 + vtrn.32 \rq0, \rq2 + vtrn.32 \rq1, \rq3 + vtrn.32 \rq4, \rq6 + vtrn.32 \rq5, \rq7 + vtrn.16 \rq0, \rq1 + vtrn.16 \rq2, \rq3 + vtrn.16 \rq4, \rq5 + vtrn.16 \rq6, \rq7 +.endm + +.macro transpose16_4x4 r0, r1, r2, r3 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +@ Do a 4x4 transpose, using q registers for the subtransposes that don't +@ need to address the indiviudal d registers. +@ r0,r1 == rq0, r2,r3 == rq1 +.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3 + vtrn.32 \rq0, \rq1 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +@ The input to and output from this macro is in the registers q8-q15, +@ and q0-q7 are used as scratch registers. 
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15 +.macro loop_filter_q wd + vdup.u16 q0, r2 @ E + vdup.u16 q1, r3 @ I + + vabd.u16 q2, q8, q9 @ abs(p3 - p2) + vabd.u16 q3, q9, q10 @ abs(p2 - p1) + vabd.u16 q4, q10, q11 @ abs(p1 - p0) + vabd.u16 q5, q12, q13 @ abs(q0 - q1) + vabd.u16 q6, q13, q14 @ abs(q1 - q2) + vabd.u16 q7, q14, q15 @ abs(q2 - q3) + vmax.u16 q2, q2, q3 + vmax.u16 q3, q4, q5 + vmax.u16 q4, q6, q7 + vabd.u16 q5, q11, q12 @ abs(p0 - q0) + vmax.u16 q2, q2, q3 + vadd.u16 q5, q5, q5 @ abs(p0 - q0) * 2 + vabd.u16 q6, q10, q13 @ abs(p1 - q1) + vmax.u16 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3)) + vshr.u16 q6, q6, #1 + vcle.u16 q2, q2, q1 @ max(abs()) <= I + vadd.u16 q5, q5, q6 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + vcle.u16 q5, q5, q0 + vand q2, q2, q5 @ fm + + vmovn.u16 d10, q2 + vmov r8, r9, d10 + orrs r8, r8, r9 + @ If no pixels need filtering, just exit as soon as possible + beq 9f + +.if \wd >= 8 + vdup.u16 q0, r5 + + vabd.u16 q1, q8, q11 @ abs(p3 - p0) + vabd.u16 q3, q9, q11 @ abs(p2 - p0) + vabd.u16 q4, q10, q11 @ abs(p1 - p0) + vabd.u16 q5, q13, q12 @ abs(q1 - q0) + vabd.u16 q6, q14, q12 @ abs(q2 - q0) + vabd.u16 q7, q15, q12 @ abs(q3 - q0) + vmax.u16 q1, q1, q3 + vmax.u16 q4, q4, q5 + vmax.u16 q6, q6, q7 + @ The rest of the calculation of flat8in is interleaved below +.endif + + @ Calculate the normal inner loop filter for 2 or 4 pixels + vabd.u16 q3, q10, q11 @ abs(p1 - p0) +.if \wd == 8 + vmax.u16 q1, q1, q4 +.endif + vabd.u16 q4, q13, q12 @ abs(q1 - q0) +.if \wd == 8 + vmax.u16 q1, q1, q6 +.endif + + vsub.u16 q5, q10, q13 @ p1 - q1 + vmax.u16 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0)) + vdup.u16 q4, r4 @ H + vsub.u16 q6, q12, q11 @ q0 - p0 +.if \wd == 8 + vcle.u16 q1, q1, q0 @ flat8in +.endif + vdup.u16 q0, r6 @ left shift for saturation + vcle.u16 q3, q3, q4 @ !hev +.if \wd == 8 + vand q1, q1, q2 @ flat8in && fm +.endif + vneg.s16 q4, q0 @ negative left shift after saturation + vqshl.s16 q5, q5, q0 +.if \wd == 8 + vbic q2, q2, q1 @ fm && 
!flat8in +.endif + vmov.s16 q7, #3 + vand q3, q3, q2 @ !hev && fm && !flat8in + vshl.s16 q5, q5, q4 @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1) + + vmul.s16 q6, q6, q7 @ 3 * (q0 - p0) + vbic q5, q5, q3 @ if (!hev) av_clip_int2p = 0 + vadd.s16 q6, q6, q5 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)] + vmov.s16 q5, #4 + vqshl.s16 q6, q6, q0 + vmov.s16 q0, #3 + vshl.s16 q6, q6, q4 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f + vdup.u16 q4, r7 @ max pixel value + + vshr.u16 q4, q4, #1 @ (1 << (BIT_DEPTH - 1)) - 1) + + vadd.s16 q5, q6, q5 @ f + 4 + vadd.s16 q0, q6, q0 @ f + 3 + vmov.s16 q6, #0 + vmin.s16 q5, q5, q4 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) + vmin.s16 q0, q0, q4 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) + vdup.u16 q4, r7 @ max pixel value + vshr.s16 q5, q5, #3 @ f1 + vshr.s16 q0, q0, #3 @ f2 + + vadd.s16 q0, q11, q0 @ p0 + f2 + vsub.s16 q7, q12, q5 @ q0 - f1 + vmin.s16 q0, q0, q4 + vmin.s16 q7, q7, q4 + vrshr.s16 q5, q5, #1 @ f = (f1 + 1) >> 1 + vmax.s16 q0, q0, q6 @ out p0 + vmax.s16 q7, q7, q6 @ out q0 + vbit q11, q0, q2 @ if (fm && !flat8in) + vbit q12, q7, q2 +.if \wd >= 8 + vmovn.u16 d4, q1 +.endif + + vadd.s16 q0, q10, q5 @ p1 + f + vsub.s16 q7, q13, q5 @ q1 - f +.if \wd >= 8 + vmov r8, r9, d4 +.endif + vmin.s16 q0, q0, q4 + vmin.s16 q7, q7, q4 +.if \wd >= 8 + orrs r8, r8, r9 +.endif + vmax.s16 q0, q0, q6 @ out p1 + vmax.s16 q7, q7, q6 @ out q1 + vbit q10, q0, q3 @ if (!hev && fm && !flat8in) + vbit q13, q7, q3 + +.if \wd >= 8 + @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels + beq 6f + + @ flat8in + vadd.u16 q2, q8, q9 + vadd.u16 q3, q10, q13 + vadd.u16 q4, q8, q10 + vadd.u16 q5, q11, q14 + vadd.u16 q0, q2, q2 + vadd.u16 q0, q0, q11 + vadd.u16 q0, q0, q12 + vadd.u16 q0, q0, q4 + vsub.s16 q3, q3, q2 + vsub.s16 q5, q5, q4 + vrshr.u16 q6, q0, #3 @ out p2 + + vadd.u16 q0, q0, q3 + vadd.u16 q2, q8, q11 + vadd.u16 q3, q12, q15 + vrshr.u16 q7, q0, #3 @ out p1 + + vadd.u16 q0, q0, q5 + vsub.s16 q3, q3, q2 + 
vadd.u16 q4, q9, q12 + vbit q9, q6, q1 + vadd.u16 q5, q13, q15 + vrshr.u16 q6, q0, #3 @ out p0 + + vadd.u16 q0, q0, q3 + vsub.s16 q5, q5, q4 + vadd.u16 q2, q10, q13 + vbit q10, q7, q1 + vadd.u16 q3, q14, q15 + vrshr.u16 q7, q0, #3 @ out q0 + + vadd.u16 q0, q0, q5 + vsub.s16 q3, q3, q2 + vbit q11, q6, q1 + vrshr.u16 q6, q0, #3 @ out q1 + + vadd.u16 q0, q0, q3 + vbit q12, q7, q1 + vrshr.u16 q7, q0, #3 @ out q2 + vbit q13, q6, q1 + vbit q14, q7, q1 +.endif +.endm + +@ The input to and output from this macro is in the registers d16-d31, +@ and d0-d7 are used as scratch registers. +@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31 +@ Depending on the width of the loop filter, we either use d16-d19 +@ and d28-d31 as temp registers, or d8-d15. +@ In practice, this is only ever instantiated once, so the macro parameters +@ could be hardcoded, but keeping them as is, to keep similarities to the +@ 8 bpp and aarch64 versions. +.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 + vdup.u16 d0, r2 @ E + vdup.u16 d2, r3 @ I + + vabd.u16 d4, d20, d21 @ abs(p3 - p2) + vabd.u16 d5, d21, d22 @ abs(p2 - p1) + vabd.u16 d6, d22, d23 @ abs(p1 - p0) + vabd.u16 d7, d24, d25 @ abs(q0 - q1) + vabd.u16 \tmp1, d25, d26 @ abs(q1 - q2) + vabd.u16 \tmp2, d26, d27 @ abs(q2 - q3) + vmax.u16 d4, d4, d5 + vmax.u16 d5, d6, d7 + vmax.u16 \tmp1, \tmp1, \tmp2 + vabd.u16 d6, d23, d24 @ abs(p0 - q0) + vmax.u16 d4, d4, d5 + vadd.u16 d6, d6, d6 @ abs(p0 - q0) * 2 + vabd.u16 d5, d22, d25 @ abs(p1 - q1) + vmax.u16 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3)) + vshr.u16 d5, d5, #1 + vcle.u16 d4, d4, d2 @ max(abs()) <= I + vadd.u16 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + vcle.u16 d6, d6, d0 + vand d4, d4, d6 @ fm + + vdup.u16 d3, r4 @ H + vmov r8, r9, d4 + orrs r8, r8, r9 + @ If no pixels need filtering, just exit as soon as possible + beq 9f + +.if \wd >= 8 + vdup.u16 d0, r5 + + vabd.u16 d6, d20, d23 @ abs(p3 - p0) + vabd.u16 d2, d21, d23 @ abs(p2 - p0) + 
vabd.u16 d1, d22, d23 @ abs(p1 - p0) + vabd.u16 \tmp1, d25, d24 @ abs(q1 - q0) + vabd.u16 \tmp2, d26, d24 @ abs(q2 - q0) + vabd.u16 \tmp3, d27, d24 @ abs(q3 - q0) + vmax.u16 d6, d6, d2 + vmax.u16 d1, d1, \tmp1 + vmax.u16 \tmp2, \tmp2, \tmp3 +.if \wd == 16 + vabd.u16 d7, d16, d23 @ abs(p7 - p0) + vmax.u16 d6, d6, d1 + vabd.u16 d2, d17, d23 @ abs(p6 - p0) + vmax.u16 d6, d6, \tmp2 + vabd.u16 d1, d18, d23 @ abs(p5 - p0) + vcle.u16 d6, d6, d0 @ flat8in + vabd.u16 d8, d19, d23 @ abs(p4 - p0) + vand d6, d6, d4 @ flat8in && fm + vabd.u16 d9, d28, d24 @ abs(q4 - q0) + vbic d4, d4, d6 @ fm && !flat8in + vabd.u16 d10, d29, d24 @ abs(q5 - q0) + vabd.u16 d11, d30, d24 @ abs(q6 - q0) + vabd.u16 d12, d31, d24 @ abs(q7 - q0) + + vmax.u16 d7, d7, d2 + vmax.u16 d1, d1, d8 + vmax.u16 d9, d9, d10 + vmax.u16 d11, d11, d12 + @ The rest of the calculation of flat8out is interleaved below +.else + @ The rest of the calculation of flat8in is interleaved below +.endif +.endif + + @ Calculate the normal inner loop filter for 2 or 4 pixels + vabd.u16 d5, d22, d23 @ abs(p1 - p0) +.if \wd == 16 + vmax.u16 d7, d7, d1 + vmax.u16 d9, d9, d11 +.elseif \wd == 8 + vmax.u16 d6, d6, d1 +.endif + vabd.u16 d1, d25, d24 @ abs(q1 - q0) +.if \wd == 16 + vmax.u16 d7, d7, d9 +.elseif \wd == 8 + vmax.u16 d6, d6, \tmp2 +.endif + vdup.u16 \tmp2, r6 @ left shift for saturation + vsub.u16 \tmp1, d22, d25 @ p1 - q1 + vneg.s16 \tmp6, \tmp2 @ negative left shift after saturation + vmax.u16 d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0)) + vsub.u16 \tmp3, d24, d23 @ q0 - p0 + vmov.s16 \tmp5, #3 +.if \wd == 8 + vcle.u16 d6, d6, d0 @ flat8in +.endif + vcle.u16 d5, d5, d3 @ !hev +.if \wd == 8 + vand d6, d6, d4 @ flat8in && fm +.endif + vqshl.s16 \tmp1, \tmp1, \tmp2 +.if \wd == 16 + vcle.u16 d7, d7, d0 @ flat8out +.elseif \wd == 8 + vbic d4, d4, d6 @ fm && !flat8in +.endif + vand d5, d5, d4 @ !hev && fm && !flat8in +.if \wd == 16 + vand d7, d7, d6 @ flat8out && flat8in && fm +.endif + vshl.s16 \tmp1, \tmp1, \tmp6 @ 
av_clip_int2p(p1 - q1, BIT_DEPTH - 1) + + vmul.s16 \tmp3, \tmp3, \tmp5 @ 3 * (q0 - p0) + vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int2p = 0 + vmov.s16 d2, #4 + vadd.s16 \tmp3, \tmp3, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)] + vmov.s16 d3, #3 + vqshl.s16 \tmp1, \tmp3, \tmp2 + vmov.s16 \tmp5, #0 + vshl.s16 \tmp1, \tmp1, \tmp6 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f + vdup.u16 \tmp6, r7 @ max pixel value +.if \wd == 16 + vbic d6, d6, d7 @ fm && flat8in && !flat8out +.endif + + vshr.u16 \tmp2, \tmp6, #1 @ (1 << (BIT_DEPTH - 1)) - 1 + + vadd.s16 \tmp3, \tmp1, d2 @ f + 4 + vadd.s16 \tmp4, \tmp1, d3 @ f + 3 + vmin.s16 \tmp3, \tmp3, \tmp2 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) + vmin.s16 \tmp4, \tmp4, \tmp2 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) + vshr.s16 \tmp3, \tmp3, #3 @ f1 + vshr.s16 \tmp4, \tmp4, #3 @ f2 + + vadd.s16 d0, d23, \tmp4 @ p0 + f2 + vsub.s16 d2, d24, \tmp3 @ q0 - f1 + vmin.s16 d0, d0, \tmp6 + vmin.s16 d2, d2, \tmp6 + vrshr.s16 \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1 + vmax.s16 d0, d0, \tmp5 @ out p0 + vmax.s16 d2, d2, \tmp5 @ out q0 + vbit d23, d0, d4 @ if (fm && !flat8in) + vbit d24, d2, d4 + + vadd.s16 d0, d22, \tmp3 @ p1 + f + vsub.s16 d2, d25, \tmp3 @ q1 - f +.if \wd >= 8 + vmov r8, r9, d6 +.endif + vmin.s16 d0, d0, \tmp6 + vmin.s16 d2, d2, \tmp6 +.if \wd >= 8 + orrs r8, r8, r9 +.endif + vmax.s16 d0, d0, \tmp5 @ out p1 + vmax.s16 d2, d2, \tmp5 @ out q1 + vbit d22, d0, d5 @ if (!hev && fm && !flat8in) + vbit d25, d2, d5 + +.if \wd >= 8 + @ If no pixels need flat8in, jump to flat8out + @ (or to a writeout of the inner 4 pixels, for wd=8) + beq 6f + + @ flat8in + vadd.u16 \tmp1, d20, d21 + vadd.u16 \tmp3, d22, d25 + vadd.u16 \tmp5, d20, d22 + vadd.u16 \tmp7, d23, d26 + vadd.u16 d0, \tmp1, \tmp1 + vadd.u16 d0, d0, d23 + vadd.u16 d0, d0, d24 + vadd.u16 d0, d0, \tmp5 + vsub.s16 \tmp3, \tmp3, \tmp1 + vsub.s16 \tmp7, \tmp7, \tmp5 + vrshr.u16 d2, d0, #3 @ out p2 + + vadd.u16 d0, d0, \tmp3 + vadd.u16 \tmp1, d20, 
d23 + vadd.u16 \tmp3, d24, d27 + vrshr.u16 d3, d0, #3 @ out p1 + + vadd.u16 d0, d0, \tmp7 + vsub.s16 \tmp3, \tmp3, \tmp1 + vadd.u16 \tmp5, d21, d24 + vadd.u16 \tmp7, d25, d27 + vrshr.u16 d4, d0, #3 @ out p0 + + vadd.u16 d0, d0, \tmp3 + vsub.s16 \tmp7, \tmp7, \tmp5 + vadd.u16 \tmp1, d22, d25 + vadd.u16 \tmp3, d26, d27 + vrshr.u16 d5, d0, #3 @ out d0 + + vadd.u16 d0, d0, \tmp7 + vsub.s16 \tmp3, \tmp3, \tmp1 + vrshr.u16 \tmp5, d0, #3 @ out q1 + + vadd.u16 d0, d0, \tmp3 + @ The output here is written back into the input registers. This doesn't + @ matter for the flat8out part below, since we only update those pixels + @ which won't be touched below. + vbit d21, d2, d6 + vbit d22, d3, d6 + vbit d23, d4, d6 + vrshr.u16 \tmp6, d0, #3 @ out q2 + vbit d24, d5, d6 + vbit d25, \tmp5, d6 + vbit d26, \tmp6, d6 +.endif +.if \wd == 16 +6: + vorr d2, d6, d7 + vmov r8, r9, d2 + orrs r8, r8, r9 + @ If no pixels needed flat8in nor flat8out, jump to a + @ writeout of the inner 4 pixels + beq 7f + vmov r8, r9, d7 + orrs r8, r8, r9 + @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels + beq 8f + + @ flat8out + @ This writes all outputs into d2-d17 (skipping d6 and d16). + @ If this part is skipped, the output is read from d21-d26 (which is the input + @ to this section). 
+ vshl.u16 d0, d16, #3 @ 8 * d16 + vsub.u16 d0, d0, d16 @ 7 * d16 + vadd.u16 d0, d0, d17 + vadd.u16 d8, d17, d18 + vadd.u16 d10, d19, d20 + vadd.s16 d0, d0, d8 + vadd.u16 d8, d16, d17 + vadd.u16 d12, d21, d22 + vadd.s16 d0, d0, d10 + vadd.u16 d10, d18, d25 + vadd.u16 d14, d23, d24 + vsub.s16 d10, d10, d8 + vadd.s16 d0, d0, d12 + vadd.s16 d0, d0, d14 + vadd.u16 d12, d16, d18 + vadd.u16 d14, d19, d26 + vrshr.u16 d2, d0, #4 + + vadd.s16 d0, d0, d10 + vadd.u16 d8, d16, d19 + vadd.u16 d10, d20, d27 + vsub.s16 d14, d14, d12 + vbif d2, d17, d7 + vrshr.u16 d3, d0, #4 + + vadd.s16 d0, d0, d14 + vadd.u16 d12, d16, d20 + vadd.u16 d14, d21, d28 + vsub.s16 d10, d10, d8 + vbif d3, d18, d7 + vrshr.u16 d4, d0, #4 + + vadd.s16 d0, d0, d10 + vadd.u16 d8, d16, d21 + vadd.u16 d10, d22, d29 + vsub.s16 d14, d14, d12 + vbif d4, d19, d7 + vrshr.u16 d5, d0, #4 + + vadd.s16 d0, d0, d14 + vadd.u16 d12, d16, d22 + vadd.u16 d14, d23, d30 + vsub.s16 d10, d10, d8 + vbif d5, d20, d7 + vrshr.u16 d6, d0, #4 + + vadd.s16 d0, d0, d10 + vadd.u16 d10, d16, d23 + vsub.s16 d14, d14, d12 + vadd.u16 d12, d24, d31 + vbif d6, d21, d7 + vrshr.u16 d8, d0, #4 + + vadd.s16 d0, d0, d14 + vsub.s16 d10, d12, d10 + vadd.u16 d12, d17, d24 + vadd.u16 d14, d25, d31 + vbif d8, d22, d7 + vrshr.u16 d9, d0, #4 + + vadd.s16 d0, d0, d10 + vsub.s16 d14, d14, d12 + vadd.u16 d12, d26, d31 + vbif d9, d23, d7 + vrshr.u16 d10, d0, #4 + + vadd.s16 d0, d0, d14 + vadd.u16 d14, d18, d25 + vadd.u16 d18, d19, d26 + vsub.s16 d12, d12, d14 + vadd.u16 d14, d27, d31 + vbif d10, d24, d7 + vrshr.u16 d11, d0, #4 + + vadd.s16 d0, d0, d12 + vadd.u16 d12, d20, d27 + vsub.s16 d14, d14, d18 + vadd.u16 d18, d28, d31 + vbif d11, d25, d7 + vsub.s16 d18, d18, d12 + vrshr.u16 d12, d0, #4 + + vadd.s16 d0, d0, d14 + vadd.u16 d14, d21, d28 + vadd.u16 d20, d29, d31 + vbif d12, d26, d7 + vrshr.u16 d13, d0, #4 + + vadd.s16 d0, d0, d18 + vsub.s16 d20, d20, d14 + vadd.u16 d18, d22, d29 + vadd.u16 d22, d30, d31 + vbif d13, d27, d7 + vrshr.u16 d14, d0, #4 + + 
vadd.s16 d0, d0, d20 + vsub.s16 d22, d22, d18 + vbif d14, d28, d7 + vrshr.u16 d15, d0, #4 + + vadd.s16 d0, d0, d22 + vbif d15, d29, d7 + vrshr.u16 d17, d0, #4 + vbif d17, d30, d7 +.endif +.endm + +.macro loop_filter_q_4 + loop_filter_q 4 +.endm + +.macro loop_filter_q_8 + loop_filter_q 8 +.endm + +.macro loop_filter_16 + loop_filter 16, d8, d9, d10, d11, d12, d13, d14, d15 +.endm + + +@ The public functions in this file have got the following signature: +@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr); + +.macro bpp_frontend func, bpp +function ff_\func\()_\bpp\()_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + vpush {q4-q7} + lsl r2, r2, #\bpp - 8 + lsl r3, r3, #\bpp - 8 + lsl r4, r4, #\bpp - 8 + mov r5, #1 << (\bpp - 8) + mov r6, #16 - \bpp + movw r7, #((1 << \bpp) - 1) + bl \func\()_16_neon + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro bpp_frontends func + bpp_frontend \func, 10 + bpp_frontend \func, 12 +.endm + +.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp +function ff_\func\()_\suffix\()_\bpp\()_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + vpush {q4-q7} + lsl r2, r2, #\bpp - 8 + lsl r3, r3, #\bpp - 8 + lsl r4, r4, #\bpp - 8 + mov r5, #1 << (\bpp - 8) + mov r6, #16 - \bpp + movw r7, #((1 << \bpp) - 1) + bl \func\()_\int_suffix\()_16_neon +.ifc \dir,h + add r0, r0, r1, lsl #2 +.else + add r0, r0, #8 +.endif + bl \func\()_\int_suffix\()_16_neon +.if \rep >= 4 +.ifc \dir,h + add r0, r0, r1, lsl #2 + bl \func\()_\int_suffix\()_16_neon + add r0, r0, r1, lsl #2 + bl \func\()_\int_suffix\()_16_neon +.else + add r0, r0, #8 + bl \func\()_\int_suffix\()_16_neon + add r0, r0, #8 + bl \func\()_\int_suffix\()_16_neon +.endif +.endif + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir + bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10 + bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12 +.endm + +.macro 
bpp_frontend_mix2 wd1, wd2, dir, bpp +function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + vpush {q4-q7} + push {r2, r3, r4} + and r2, r2, #0xff + and r3, r3, #0xff + and r4, r4, #0xff + lsl r2, r2, #\bpp - 8 + lsl r3, r3, #\bpp - 8 + lsl r4, r4, #\bpp - 8 + mov r5, #1 << (\bpp - 8) + mov r6, #16 - \bpp + movw r7, #((1 << \bpp) - 1) + bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon +.ifc \dir,h + add r0, r0, r1, lsl #3 +.else + add r0, r0, #16 +.endif + pop {r2, r3, r4} + lsr r2, r2, #8 + lsr r3, r3, #8 + lsr r4, r4, #8 + lsl r2, r2, #\bpp - 8 + lsl r3, r3, #\bpp - 8 + lsl r4, r4, #\bpp - 8 + bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro bpp_frontends_mix2 wd1, wd2 + bpp_frontend_mix2 \wd1, \wd2, v, 10 + bpp_frontend_mix2 \wd1, \wd2, v, 12 + bpp_frontend_mix2 \wd1, \wd2, h, 10 + bpp_frontend_mix2 \wd1, \wd2, h, 12 +.endm + +function vp9_loop_filter_v_4_8_16_neon + sub r12, r0, r1, lsl #2 + vld1.16 {q8}, [r12,:128], r1 @ p3 + vld1.16 {q12}, [r0, :128], r1 @ q0 + vld1.16 {q9}, [r12,:128], r1 @ p2 + vld1.16 {q13}, [r0, :128], r1 @ q1 + vld1.16 {q10}, [r12,:128], r1 @ p1 + vld1.16 {q14}, [r0, :128], r1 @ q2 + vld1.16 {q11}, [r12,:128], r1 @ p0 + vld1.16 {q15}, [r0, :128], r1 @ q3 + sub r0, r0, r1, lsl #2 + sub r12, r12, r1, lsl #1 + + loop_filter_q_4 + + vst1.16 {q10}, [r12,:128], r1 + vst1.16 {q12}, [r0, :128], r1 + vst1.16 {q11}, [r12,:128], r1 + vst1.16 {q13}, [r0, :128], r1 + sub r0, r0, r1, lsl #1 +9: + bx lr +endfunc + +bpp_frontends vp9_loop_filter_v_4_8 + + +function vp9_loop_filter_h_4_8_16_neon + sub r12, r0, #8 + add r0, r12, r1, lsl #2 + vld1.16 {q8}, [r12,:64], r1 + vld1.16 {q12}, [r0, :64], r1 + vld1.16 {q9}, [r12,:64], r1 + vld1.16 {q13}, [r0, :64], r1 + vld1.16 {q10}, [r12,:64], r1 + vld1.16 {q14}, [r0, :64], r1 + vld1.16 {q11}, [r12,:64], r1 + vld1.16 {q15}, [r0, :64], r1 + + sub r12, r12, r1, lsl #2 + sub r0, r0, r1, lsl #2 + @ Move 
r0/r12 forward by 2 pixels; we don't need to rewrite the + @ outermost 2 pixels since they aren't changed. + add r12, r12, #4 + add r0, r0, #4 + + transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + loop_filter_q_4 + + @ We only will write the mid 4 pixels back; after the loop filter, + @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels). + @ We need to transpose them to columns, done with a + @ 4x4 transpose (which in practice is two 4x4 transposes of the two + @ 4x4 halves of the 8x4 pixels; into 4x8 pixels). + transpose16_4x4 q10, q11, q12, q13 + + vst1.16 {d20}, [r12], r1 + vst1.16 {d21}, [r0], r1 + vst1.16 {d22}, [r12], r1 + vst1.16 {d23}, [r0], r1 + vst1.16 {d24}, [r12], r1 + vst1.16 {d25}, [r0], r1 + vst1.16 {d26}, [r12], r1 + vst1.16 {d27}, [r0], r1 + sub r12, r12, r1, lsl #2 +9: + add r0, r12, #4 + bx lr +endfunc + +bpp_frontends vp9_loop_filter_h_4_8 + + +function vp9_loop_filter_v_8_8_16_neon + sub r12, r0, r1, lsl #2 + vld1.16 {q8}, [r12,:128], r1 @ p3 + vld1.16 {q12}, [r0, :128], r1 @ q0 + vld1.16 {q9}, [r12,:128], r1 @ p2 + vld1.16 {q13}, [r0, :128], r1 @ q1 + vld1.16 {q10}, [r12,:128], r1 @ p1 + vld1.16 {q14}, [r0, :128], r1 @ q2 + vld1.16 {q11}, [r12,:128], r1 @ p0 + vld1.16 {q15}, [r0, :128], r1 @ q3 + sub r12, r12, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r12, r12, r1 + + loop_filter_q_8 + + vst1.16 {q9}, [r12,:128], r1 + vst1.16 {q12}, [r0, :128], r1 + vst1.16 {q10}, [r12,:128], r1 + vst1.16 {q13}, [r0, :128], r1 + vst1.16 {q11}, [r12,:128], r1 + vst1.16 {q14}, [r0, :128], r1 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 +9: + bx lr +6: + sub r12, r0, r1, lsl #1 + vst1.16 {q10}, [r12,:128], r1 + vst1.16 {q12}, [r0, :128], r1 + vst1.16 {q11}, [r12,:128], r1 + vst1.16 {q13}, [r0, :128], r1 + sub r0, r0, r1, lsl #1 + bx lr +endfunc + +bpp_frontends vp9_loop_filter_v_8_8 + + +function vp9_loop_filter_h_8_8_16_neon + sub r12, r0, #8 + add r0, r12, r1, lsl 
#2 + vld1.16 {q8}, [r12,:64], r1 + vld1.16 {q12}, [r0, :64], r1 + vld1.16 {q9}, [r12,:64], r1 + vld1.16 {q13}, [r0, :64], r1 + vld1.16 {q10}, [r12,:64], r1 + vld1.16 {q14}, [r0, :64], r1 + vld1.16 {q11}, [r12,:64], r1 + vld1.16 {q15}, [r0, :64], r1 + + sub r12, r12, r1, lsl #2 + sub r0, r0, r1, lsl #2 + + transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + loop_filter_q_8 + + @ Even though only 6 pixels per row have been changed, we write the + @ full 8 pixel registers. + transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + + vst1.16 {q8}, [r12,:64], r1 + vst1.16 {q12}, [r0, :64], r1 + vst1.16 {q9}, [r12,:64], r1 + vst1.16 {q13}, [r0, :64], r1 + vst1.16 {q10}, [r12,:64], r1 + vst1.16 {q14}, [r0, :64], r1 + vst1.16 {q11}, [r12,:64], r1 + vst1.16 {q15}, [r0, :64], r1 + sub r12, r12, r1, lsl #2 +9: + add r0, r12, #8 + bx lr +6: + @ If we didn't need to do the flat8in part, we use the same writeback + @ as in loop_filter_h_4_8. 
+ add r12, r12, #4 + add r0, r0, #4 + transpose16_4x4 q10, q11, q12, q13 + + vst1.16 {d20}, [r12], r1 + vst1.16 {d21}, [r0], r1 + vst1.16 {d22}, [r12], r1 + vst1.16 {d23}, [r0], r1 + vst1.16 {d24}, [r12], r1 + vst1.16 {d25}, [r0], r1 + vst1.16 {d26}, [r12], r1 + vst1.16 {d27}, [r0], r1 + sub r12, r12, r1, lsl #2 + add r0, r12, #4 + bx lr +endfunc + +bpp_frontends vp9_loop_filter_h_8_8 + +bpp_frontends_mix2 4, 4 +bpp_frontends_mix2 4, 8 +bpp_frontends_mix2 8, 4 +bpp_frontends_mix2 8, 8 + +function vp9_loop_filter_v_16_4_16_neon + sub r12, r0, r1, lsl #3 + @ Read p7-p0 using r12 and q0-q7 using r0 + vld1.16 {d16}, [r12,:64], r1 @ p7 + vld1.16 {d24}, [r0, :64], r1 @ q0 + vld1.16 {d17}, [r12,:64], r1 @ p6 + vld1.16 {d25}, [r0, :64], r1 @ q1 + vld1.16 {d18}, [r12,:64], r1 @ p5 + vld1.16 {d26}, [r0, :64], r1 @ q2 + vld1.16 {d19}, [r12,:64], r1 @ p4 + vld1.16 {d27}, [r0, :64], r1 @ q3 + vld1.16 {d20}, [r12,:64], r1 @ p3 + vld1.16 {d28}, [r0, :64], r1 @ q4 + vld1.16 {d21}, [r12,:64], r1 @ p2 + vld1.16 {d29}, [r0, :64], r1 @ q5 + vld1.16 {d22}, [r12,:64], r1 @ p1 + vld1.16 {d30}, [r0, :64], r1 @ q6 + vld1.16 {d23}, [r12,:64], r1 @ p0 + vld1.16 {d31}, [r0, :64], r1 @ q7 + sub r12, r12, r1, lsl #3 + sub r0, r0, r1, lsl #3 + add r12, r12, r1 + + loop_filter_16 + + @ If we did the flat8out part, we get the output in + @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride, + @ store d2-d9 there, and d10-d17 into r0. 
+ vst1.16 {d2}, [r12,:64], r1 + vst1.16 {d10}, [r0, :64], r1 + vst1.16 {d3}, [r12,:64], r1 + vst1.16 {d11}, [r0, :64], r1 + vst1.16 {d4}, [r12,:64], r1 + vst1.16 {d12}, [r0, :64], r1 + vst1.16 {d5}, [r12,:64], r1 + vst1.16 {d13}, [r0, :64], r1 + vst1.16 {d6}, [r12,:64], r1 + vst1.16 {d14}, [r0, :64], r1 + vst1.16 {d8}, [r12,:64], r1 + vst1.16 {d15}, [r0, :64], r1 + vst1.16 {d9}, [r12,:64], r1 + vst1.16 {d17}, [r0, :64], r1 + sub r0, r0, r1, lsl #3 + add r0, r0, r1 + +9: + bx lr + +8: + add r12, r12, r1, lsl #2 + @ If we didn't do the flat8out part, the output is left in the + @ input registers. + vst1.16 {d21}, [r12,:64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d22}, [r12,:64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d23}, [r12,:64], r1 + vst1.16 {d26}, [r0, :64], r1 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx lr +7: + sub r12, r0, r1, lsl #1 + vst1.16 {d22}, [r12,:64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d23}, [r12,:64], r1 + vst1.16 {d25}, [r0, :64], r1 + sub r0, r0, r1, lsl #1 + bx lr +endfunc + +bpp_frontends_rep vp9_loop_filter_v_16, 8, 4, 2, v +bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v + +function vp9_loop_filter_h_16_4_16_neon + sub r12, r0, #16 + sub r0, r0, #8 + vld1.16 {d16}, [r12,:64], r1 + vld1.16 {d20}, [r0, :64], r1 + vld1.16 {d17}, [r12,:64], r1 + vld1.16 {d21}, [r0, :64], r1 + vld1.16 {d18}, [r12,:64], r1 + vld1.16 {d22}, [r0, :64], r1 + vld1.16 {d19}, [r12,:64], r1 + vld1.16 {d23}, [r0, :64], r1 + sub r12, r12, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r12, r12, #16 + add r0, r0, #16 + vld1.16 {d24}, [r12,:64], r1 + vld1.16 {d28}, [r0, :64], r1 + vld1.16 {d25}, [r12,:64], r1 + vld1.16 {d29}, [r0, :64], r1 + vld1.16 {d26}, [r12,:64], r1 + vld1.16 {d30}, [r0, :64], r1 + vld1.16 {d27}, [r12,:64], r1 + vld1.16 {d31}, [r0, :64], r1 + sub r0, r0, r1, lsl #2 + sub r12, r12, r1, lsl #2 + sub r12, r12, #16 + sub r0, r0, #16 + + @ The 16x4 pixels read above is in four 4x4 blocks + transpose16_q_4x4 q8, q9, d16, d17, d18, d19 + 
transpose16_q_4x4 q10, q11, d20, d21, d22, d23 + transpose16_q_4x4 q12, q13, d24, d25, d26, d27 + transpose16_q_4x4 q14, q15, d28, d29, d30, d31 + + loop_filter_16 + + @ Transpose back; this is the same transpose as above, but + @ we can't take advantage of q registers for the transpose, since + @ all d registers in the transpose aren't consecutive. + transpose16_4x4 d16, d2, d3, d4 + transpose16_4x4 d5, d6, d8, d9 + transpose16_4x4 d10, d11, d12, d13 + transpose16_4x4 d14, d15, d17, d31 + + vst1.16 {d16}, [r12,:64], r1 + vst1.16 {d5}, [r0, :64], r1 + + vst1.16 {d2}, [r12,:64], r1 + vst1.16 {d6}, [r0, :64], r1 + + vst1.16 {d3}, [r12,:64], r1 + vst1.16 {d8}, [r0, :64], r1 + + vst1.16 {d4}, [r12,:64], r1 + vst1.16 {d9}, [r0, :64], r1 + + sub r12, r12, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r12, r12, #16 + add r0, r0, #16 + + vst1.16 {d10}, [r12,:64], r1 + vst1.16 {d14}, [r0, :64], r1 + + vst1.16 {d11}, [r12,:64], r1 + vst1.16 {d15}, [r0, :64], r1 + + vst1.16 {d12}, [r12,:64], r1 + vst1.16 {d17}, [r0, :64], r1 + + vst1.16 {d13}, [r12,:64], r1 + vst1.16 {d31}, [r0, :64], r1 + sub r0, r0, r1, lsl #2 + sub r0, r0, #8 + bx lr +9: + add r0, r0, #8 + bx lr +8: + add r12, r12, #8 + add r0, r0, #8 + transpose16_q_4x4 q10, q11, d20, d21, d22, d23 + transpose16_q_4x4 q12, q13, d24, d25, d26, d27 + + vst1.16 {d20}, [r12,:64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d21}, [r12,:64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d22}, [r12,:64], r1 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d23}, [r12,:64], r1 + vst1.16 {d27}, [r0, :64], r1 + sub r0, r0, r1, lsl #2 + bx lr +7: + add r12, r12, #12 + add r0, r12, r1, lsl #1 + transpose16_q_4x4 q11, q12, d22, d23, d24, d25 + + vst1.16 {d22}, [r12], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r12], r1 + vst1.16 {d25}, [r0], r1 + sub r0, r0, r1, lsl #2 + add r0, r0, #4 + bx lr +endfunc + +bpp_frontends_rep vp9_loop_filter_h_16, 8, 4, 2, h +bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h diff --git 
a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index ae782b2..4b36080 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/arm/vp9mc_16bpp_neon.S b/libavcodec/arm/vp9mc_16bpp_neon.S new file mode 100644 index 0000000..f6ec037 --- /dev/null +++ b/libavcodec/arm/vp9mc_16bpp_neon.S @@ -0,0 +1,615 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +@ All public functions in this file have the following signature: +@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, +@ const uint8_t *ref, ptrdiff_t ref_stride, +@ int h, int mx, int my); + +function ff_vp9_copy128_neon, export=1 + ldr r12, [sp] + sub r1, r1, #96 + sub r3, r3, #96 +1: + subs r12, r12, #1 + vld1.16 {q0, q1}, [r2]! + vst1.16 {q0, q1}, [r0, :128]! + vld1.16 {q2, q3}, [r2]! + vst1.16 {q2, q3}, [r0, :128]! + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2], r3 + vst1.16 {q10, q11}, [r0, :128], r1 + bne 1b + bx lr +endfunc + +function ff_vp9_avg64_16_neon, export=1 + push {lr} + ldr r12, [sp, #4] + sub r1, r1, #96 + sub r3, r3, #96 + mov lr, r0 +1: + subs r12, r12, #1 + vld1.16 {q8, q9}, [r2]! + vld1.16 {q0, q1}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vrhadd.u16 q0, q0, q8 + vld1.16 {q2, q3}, [r0, :128]! + vrhadd.u16 q1, q1, q9 + vld1.16 {q12, q13}, [r2]! + vrhadd.u16 q2, q2, q10 + vst1.16 {q0, q1}, [lr, :128]! + vrhadd.u16 q3, q3, q11 + vld1.16 {q8, q9}, [r0, :128]! + vst1.16 {q2, q3}, [lr, :128]! + vrhadd.u16 q8, q8, q12 + vld1.16 {q14, q15}, [r2], r3 + vrhadd.u16 q9, q9, q13 + vld1.16 {q10, q11}, [r0, :128], r1 + vrhadd.u16 q10, q10, q14 + vst1.16 {q8, q9}, [lr, :128]! + vrhadd.u16 q11, q11, q15 + vst1.16 {q10, q11}, [lr, :128], r1 + bne 1b + pop {pc} +endfunc + +function ff_vp9_avg32_16_neon, export=1 + push {lr} + ldr r12, [sp, #4] + sub r1, r1, #32 + sub r3, r3, #32 + mov lr, r0 +1: + subs r12, r12, #1 + vld1.16 {q8, q9}, [r2]! + vld1.16 {q0, q1}, [r0, :128]! + vld1.16 {q10, q11}, [r2], r3 + vrhadd.u16 q0, q0, q8 + vld1.16 {q2, q3}, [r0, :128], r1 + vrhadd.u16 q1, q1, q9 + vrhadd.u16 q2, q2, q10 + vst1.16 {q0, q1}, [lr, :128]! 
+ vrhadd.u16 q3, q3, q11 + vst1.16 {q2, q3}, [lr, :128], r1 + bne 1b + pop {pc} +endfunc + +function ff_vp9_avg16_16_neon, export=1 + ldr r12, [sp] +1: + subs r12, r12, #1 + vld1.16 {q2, q3}, [r2], r3 + vld1.16 {q0, q1}, [r0, :128] + vrhadd.u16 q0, q0, q2 + vrhadd.u16 q1, q1, q3 + vst1.16 {q0, q1}, [r0, :128], r1 + bne 1b + bx lr +endfunc + +function ff_vp9_avg8_16_neon, export=1 + push {lr} + ldr r12, [sp, #4] + mov lr, r0 +1: + subs r12, r12, #2 + vld1.16 {q2}, [r2], r3 + vld1.16 {q0}, [r0, :128], r1 + vld1.16 {q3}, [r2], r3 + vrhadd.u16 q0, q0, q2 + vld1.16 {q1}, [r0, :128], r1 + vrhadd.u16 q1, q1, q3 + vst1.16 {q0}, [lr, :128], r1 + vst1.16 {q1}, [lr, :128], r1 + bne 1b + pop {pc} +endfunc + +function ff_vp9_avg4_16_neon, export=1 + ldr r12, [sp] +1: + subs r12, r12, #2 + vld1.16 {d2}, [r2], r3 + vld1.16 {d0}, [r0, :64], r1 + vld1.16 {d3}, [r2], r3 + vrhadd.u16 d0, d0, d2 + vld1.16 {d1}, [r0, :64] + sub r0, r0, r1 + vrhadd.u16 d1, d1, d3 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r0, :64], r1 + bne 1b + bx lr +endfunc + +@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index +.macro vmull_lane dst, src, idx +.if \idx < 4 + vmull.s16 \dst, \src, d0[\idx] +.else + vmull.s16 \dst, \src, d1[\idx - 4] +.endif +.endm +.macro vmlal_lane dst, src, idx +.if \idx < 4 + vmlal.s16 \dst, \src, d0[\idx] +.else + vmlal.s16 \dst, \src, d1[\idx - 4] +.endif +.endm + +@ Extract a vector from src1-src2 and src3-src4, and multiply-accumulate +@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8) +.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size + vext.8 q14, \src1, \src2, #(2*\offset) + vext.8 q15, \src3, \src4, #(2*\offset) + vmlal_lane \dst1, d28, \offset + vmlal_lane \dst3, d30, \offset +.if \size >= 8 + vmlal_lane \dst2, d29, \offset + vmlal_lane \dst4, d31, \offset +.endif +.endm + + +@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4 or 8 pixels in parallel; for larger +@ widths it will do 8 pixels at a time and loop horizontally. +@ The actual width (in bytes) is passed in r5, the height in r4 and +@ the filter coefficients in r12. +.macro do_8tap_h type, size +function \type\()_8tap_\size\()h + sub r2, r2, #6 + add r6, r0, r1 + add r7, r2, r3 + add r1, r1, r1 + add r3, r3, r3 + @ Only size >= 8 loops horizontally and needs + @ reduced dst stride +.if \size >= 8 + sub r1, r1, r5 +.endif + @ size >= 8 loads two qwords and increments r2, + @ for size 4 it's enough with three dwords and no + @ postincrement +.if \size >= 8 + sub r3, r3, r5 + sub r3, r3, #16 +.endif + @ Load the filter vector + vld1.16 {q0}, [r12,:128] +1: +.if \size >= 8 + mov r12, r5 +.endif + @ Load src +.if \size >= 8 + vld1.16 {q8, q9}, [r2]! + vld1.16 {q10, q11}, [r7]! +.else + vld1.16 {d16, d17, d18}, [r2] + vld1.16 {d20, d21, d22}, [r7] +.endif +2: + + vmull.s16 q1, d16, d0[0] + vmull.s16 q12, d20, d0[0] +.if \size >= 8 + vmull.s16 q2, d17, d0[0] + vmull.s16 q13, d21, d0[0] +.endif + extmlal q1, q2, q12, q13, q8, q9, q10, q11, 1, \size + extmlal q1, q2, q12, q13, q8, q9, q10, q11, 2, \size + extmlal q1, q2, q12, q13, q8, q9, q10, q11, 3, \size + extmlal q1, q2, q12, q13, q8, q9, q10, q11, 4, \size + extmlal q1, q2, q12, q13, q8, q9, q10, q11, 5, \size + extmlal q1, q2, q12, q13, q8, q9, q10, q11, 6, \size + extmlal q1, q2, q12, q13, q8, q9, q10, q11, 7, \size + + @ Round, shift and saturate. + @ The vqrshrun takes care of clamping negative values to zero, but + @ we manually need to do vmin with the max pixel value. 
+ vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d24, q12, #7 +.if \size >= 8 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d25, q13, #7 + vmin.u16 q1, q1, q3 + vmin.u16 q12, q12, q3 +.else + vmin.u16 d2, d2, d6 + vmin.u16 d24, d24, d6 +.endif + @ Average +.ifc \type,avg +.if \size >= 8 + vld1.16 {q14}, [r0,:128] + vld1.16 {q15}, [r6,:128] + vrhadd.u16 q1, q1, q14 + vrhadd.u16 q12, q12, q15 +.else + vld1.16 {d28}, [r0,:64] + vld1.16 {d30}, [r6,:64] + vrhadd.u16 d2, d2, d28 + vrhadd.u16 d24, d24, d30 +.endif +.endif + @ Store and loop horizontally (for size >= 8) +.if \size >= 8 + subs r12, r12, #16 + vst1.16 {q1}, [r0,:128]! + vst1.16 {q12}, [r6,:128]! + beq 3f + vmov q8, q9 + vmov q10, q11 + vld1.16 {q9}, [r2]! + vld1.16 {q11}, [r7]! + b 2b +.else @ \size == 4 + vst1.16 {d2}, [r0,:64] + vst1.16 {d24}, [r6,:64] +.endif +3: + @ Loop vertically + add r0, r0, r1 + add r6, r6, r1 + add r2, r2, r3 + add r7, r7, r3 + subs r4, r4, #2 + bne 1b + pop {r4-r7} + bx lr +endfunc +.endm + +.macro do_8tap_h_size size +do_8tap_h put, \size +do_8tap_h avg, \size +.endm + +do_8tap_h_size 4 +do_8tap_h_size 8 + +.macro do_8tap_h_func type, filter, offset, size, bpp +function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1 + push {r4-r7} + ldr r4, [sp, #16] + ldr r5, [sp, #20] + vmvn.u16 q3, #((0xffff << \bpp) & 0xffff) + movrelx r12, X(ff_vp9_subpel_filters), r6 + add r12, r12, 256*\offset + add r12, r12, r5, lsl #4 + mov r5, #2*\size +.if \size >= 8 + b \type\()_8tap_8h +.else + b \type\()_8tap_4h +.endif +endfunc +.endm + +.macro do_8tap_h_filters size, bpp +do_8tap_h_func put, regular, 1, \size, \bpp +do_8tap_h_func avg, regular, 1, \size, \bpp +do_8tap_h_func put, sharp, 2, \size, \bpp +do_8tap_h_func avg, sharp, 2, \size, \bpp +do_8tap_h_func put, smooth, 0, \size, \bpp +do_8tap_h_func avg, smooth, 0, \size, \bpp +.endm + +.macro do_8tap_h_filters_bpp bpp +do_8tap_h_filters 64, \bpp +do_8tap_h_filters 32, \bpp +do_8tap_h_filters 16, \bpp +do_8tap_h_filters 8, \bpp 
+do_8tap_h_filters 4, \bpp +.endm + +do_8tap_h_filters_bpp 10 +do_8tap_h_filters_bpp 12 + +.ltorg + +@ Vertical filters + +@ Round, shift and saturate and store qreg1-4 +.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type + vqrshrun.s32 \dreg1, \qreg1, #7 + vqrshrun.s32 \dreg2, \qreg2, #7 + vqrshrun.s32 \dreg3, \qreg3, #7 + vqrshrun.s32 \dreg4, \qreg4, #7 +.ifc \type,avg + vld1.16 {\tmp1}, [r6,:64], r1 + vld1.16 {\tmp2}, [r6,:64], r1 + vld1.16 {\tmp3}, [r6,:64], r1 + vld1.16 {\tmp4}, [r6,:64], r1 +.endif + vmin.u16 \dreg1, \dreg1, \minreg + vmin.u16 \dreg2, \dreg2, \minreg + vmin.u16 \dreg3, \dreg3, \minreg + vmin.u16 \dreg4, \dreg4, \minreg +.ifc \type,avg + vrhadd.u16 \dreg1, \dreg1, \tmp1 + vrhadd.u16 \dreg2, \dreg2, \tmp2 + vrhadd.u16 \dreg3, \dreg3, \tmp3 + vrhadd.u16 \dreg4, \dreg4, \tmp4 +.endif + vst1.16 {\dreg1}, [r0,:64], r1 + vst1.16 {\dreg2}, [r0,:64], r1 + vst1.16 {\dreg3}, [r0,:64], r1 + vst1.16 {\dreg4}, [r0,:64], r1 +.endm + +@ Round, shift and saturate and store qreg1-4 +@ qreg1-2 belong to one line and qreg3-4 to the second line. +@ dreg1-2 == qreg1, dreg3-4 == qreg2. +.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type + vqrshrun.s32 \dreg1, \qreg1, #7 + vqrshrun.s32 \dreg2, \qreg2, #7 + vqrshrun.s32 \dreg3, \qreg3, #7 + vqrshrun.s32 \dreg4, \qreg4, #7 +.ifc \type,avg + vld1.16 {\qreg3}, [r6,:128], r1 + vld1.16 {\qreg4}, [r6,:128], r1 +.endif + vmin.u16 \qreg1, \qreg1, \minreg + vmin.u16 \qreg2, \qreg2, \minreg +.ifc \type,avg + vrhadd.u16 \qreg1, \qreg1, \qreg3 + vrhadd.u16 \qreg2, \qreg2, \qreg4 +.endif + vst1.16 {\qreg1}, [r0,:128], r1 + vst1.16 {\qreg2}, [r0,:128], r1 +.endm + +@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2 +@ (src1-src8 into dst1, src2-src9 into dst2). 
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2 + vmull.s16 \dst1, \src1, d0[0] + vmull.s16 \dst2, \src2, d0[0] + vmull.s16 \tmp1, \src2, d0[1] + vmull.s16 \tmp2, \src3, d0[1] + vmlal.s16 \dst1, \src3, d0[2] + vmlal.s16 \dst2, \src4, d0[2] + vmlal.s16 \tmp1, \src4, d0[3] + vmlal.s16 \tmp2, \src5, d0[3] + vmlal.s16 \dst1, \src5, d1[0] + vmlal.s16 \dst2, \src6, d1[0] + vmlal.s16 \tmp1, \src6, d1[1] + vmlal.s16 \tmp2, \src7, d1[1] + vmlal.s16 \dst1, \src7, d1[2] + vmlal.s16 \dst2, \src8, d1[2] + vmlal.s16 \tmp1, \src8, d1[3] + vmlal.s16 \tmp2, \src9, d1[3] + vadd.s32 \dst1, \dst1, \tmp1 + vadd.s32 \dst2, \dst2, \tmp2 +.endm + +@ Evaluate the filter twice in parallel. This does the same as convolve4 above, +@ but with double width (two input/output registers per row). +.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18 + vmull.s16 \dst1, \src1, d0[0] + vmull.s16 \dst2, \src2, d0[0] + vmull.s16 \dst3, \src3, d0[0] + vmull.s16 \dst4, \src4, d0[0] + vmlal.s16 \dst1, \src3, d0[1] + vmlal.s16 \dst2, \src4, d0[1] + vmlal.s16 \dst3, \src5, d0[1] + vmlal.s16 \dst4, \src6, d0[1] + vmlal.s16 \dst1, \src5, d0[2] + vmlal.s16 \dst2, \src6, d0[2] + vmlal.s16 \dst3, \src7, d0[2] + vmlal.s16 \dst4, \src8, d0[2] + vmlal.s16 \dst1, \src7, d0[3] + vmlal.s16 \dst2, \src8, d0[3] + vmlal.s16 \dst3, \src9, d0[3] + vmlal.s16 \dst4, \src10, d0[3] + vmlal.s16 \dst1, \src9, d1[0] + vmlal.s16 \dst2, \src10, d1[0] + vmlal.s16 \dst3, \src11, d1[0] + vmlal.s16 \dst4, \src12, d1[0] + vmlal.s16 \dst1, \src11, d1[1] + vmlal.s16 \dst2, \src12, d1[1] + vmlal.s16 \dst3, \src13, d1[1] + vmlal.s16 \dst4, \src14, d1[1] + vmlal.s16 \dst1, \src13, d1[2] + vmlal.s16 \dst2, \src14, d1[2] + vmlal.s16 \dst3, \src15, d1[2] + vmlal.s16 \dst4, \src16, d1[2] + vmlal.s16 \dst1, \src15, d1[3] + vmlal.s16 \dst2, \src16, d1[3] + vmlal.s16 \dst3, \src17, d1[3] + vmlal.s16 
\dst4, \src18, d1[3] +.endm + +@ Instantiate a vertical filter function for filtering 8 pixels at a time. +@ The height is passed in r4, the width in r5 and the filter coefficients +@ in r12. +.macro do_8tap_8v type +function \type\()_8tap_8v + sub r2, r2, r3, lsl #1 + sub r2, r2, r3 + vld1.16 {q0}, [r12, :128] +1: +.ifc \type,avg + mov r6, r0 +.endif + mov r12, r4 + + vld1.16 {q5}, [r2], r3 + vld1.16 {q6}, [r2], r3 + vld1.16 {q7}, [r2], r3 + vld1.16 {q8}, [r2], r3 + vld1.16 {q9}, [r2], r3 + vld1.16 {q10}, [r2], r3 + vld1.16 {q11}, [r2], r3 +2: + vld1.16 {q12}, [r2], r3 + vld1.16 {q13}, [r2], r3 + vld1.16 {q14}, [r2], r3 + vld1.16 {q15}, [r2], r3 + convolve8 q2, q3, q4, q5, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27 + do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type + convolve8 q2, q3, q4, q5, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type + + subs r12, r12, #4 + beq 8f + + vld1.16 {q4}, [r2], r3 + vld1.16 {q5}, [r2], r3 + vld1.16 {q6}, [r2], r3 + vld1.16 {q7}, [r2], r3 + convolve8 q2, q3, q8, q9, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11 + do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type + convolve8 q2, q3, q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15 + do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type + + subs r12, r12, #4 + beq 8f + + vld1.16 {q8}, [r2], r3 + vld1.16 {q9}, [r2], r3 + vld1.16 {q10}, [r2], r3 + vld1.16 {q11}, [r2], r3 + convolve8 q2, q3, q12, q13, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19 + do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type + convolve8 q2, q3, q12, q13, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23 + do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type + + subs r12, r12, #4 + bne 2b + +8: + subs r5, 
r5, #8 + beq 9f + @ r0 -= h * dst_stride + mls r0, r1, r4, r0 + @ r2 -= h * src_stride + mls r2, r3, r4, r2 + @ r2 -= 8 * src_stride + sub r2, r2, r3, lsl #3 + @ r2 += 1 * src_stride + add r2, r2, r3 + add r2, r2, #16 + add r0, r0, #16 + b 1b +9: + vpop {q4-q7} + pop {r4-r6} + bx lr +endfunc +.endm + +do_8tap_8v put +do_8tap_8v avg + +@ Instantiate a vertical filter function for filtering a 4 pixels wide +@ slice. This only is designed to work for 4 or 8 output lines. +.macro do_8tap_4v type +function \type\()_8tap_4v + sub r2, r2, r3, lsl #1 + sub r2, r2, r3 + vld1.16 {q0}, [r12, :128] +.ifc \type,avg + mov r6, r0 +.endif + + vld1.16 {d16}, [r2], r3 + vld1.16 {d17}, [r2], r3 + vld1.16 {d18}, [r2], r3 + vld1.16 {d19}, [r2], r3 + vld1.16 {d20}, [r2], r3 + vld1.16 {d21}, [r2], r3 + vld1.16 {d22}, [r2], r3 + vld1.16 {d23}, [r2], r3 + vld1.16 {d24}, [r2], r3 + vld1.16 {d25}, [r2], r3 + vld1.16 {d26}, [r2], r3 + convolve4 q2, q3, d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15 + convolve4 q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8, q9 + do_store4 q2, d4, q3, d6, q14, d28, q15, d30, d5, d7, d29, d31, d2, \type + + subs r4, r4, #4 + beq 9f + + vld1.16 {d27}, [r2], r3 + vld1.16 {d28}, [r2], r3 + vld1.16 {d29}, [r2], r3 + vld1.16 {d30}, [r2], r3 + convolve4 q2, q3, d20, d21, d22, d23, d24, d25, d26, d27, d28, q8, q9 + convolve4 q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11 + do_store4 q2, d4, q3, d6, q8, d16, q9, d18, d5, d7, d17, d19, d2, \type + +9: + pop {r4-r6} + bx lr +endfunc +.endm + +do_8tap_4v put +do_8tap_4v avg + +.macro do_8tap_v_func type, filter, offset, size, bpp +function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1 + push {r4-r6} + ldr r4, [sp, #12] + ldr r5, [sp, #20] +.if \size >= 8 + vpush {q4-q7} +.endif + vmvn.u16 q1, #((0xffff << \bpp) & 0xffff) + movrelx r12, X(ff_vp9_subpel_filters), r6 + add r12, r12, 256*\offset + add r12, r12, r5, lsl #4 + mov r5, #\size +.if \size >= 8 + b \type\()_8tap_8v 
+.else + b \type\()_8tap_4v +.endif +endfunc +.endm + +.macro do_8tap_v_filters size, bpp +do_8tap_v_func put, regular, 1, \size, \bpp +do_8tap_v_func avg, regular, 1, \size, \bpp +do_8tap_v_func put, sharp, 2, \size, \bpp +do_8tap_v_func avg, sharp, 2, \size, \bpp +do_8tap_v_func put, smooth, 0, \size, \bpp +do_8tap_v_func avg, smooth, 0, \size, \bpp +.endm + +.macro do_8tap_v_filters_bpp bpp +do_8tap_v_filters 64, \bpp +do_8tap_v_filters 32, \bpp +do_8tap_v_filters 16, \bpp +do_8tap_v_filters 8, \bpp +do_8tap_v_filters 4, \bpp +.endm + +do_8tap_v_filters_bpp 10 +do_8tap_v_filters_bpp 12 diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S index 8d43ff1..bd8cda7 100644 --- a/libavcodec/arm/vp9mc_neon.S +++ b/libavcodec/arm/vp9mc_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -286,8 +286,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2 sub r3, r3, #8 .endif @ Load the filter vector - vld1.8 {d0}, [r12,:64] - vmovl.s8 q0, d0 + vld1.16 {q0}, [r12,:128] 1: .if \size >= 16 mov r12, r5 @@ -416,9 +415,9 @@ function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1 ldr r5, [sp, #20] .endif movrelx r12, X(ff_vp9_subpel_filters), r6 - add r12, r12, 120*\offset - 8 + add r12, r12, 256*\offset cmp r5, #8 - add r12, r12, r5, lsl #3 + add r12, r12, r5, lsl #4 mov r5, #\size .if \size >= 16 bge \type\()_8tap_16h_34 @@ -551,8 +550,7 @@ do_8tap_h_filters 4 function \type\()_8tap_8v_\idx1\idx2 sub r2, r2, r3, lsl #1 sub r2, r2, r3 - vld1.8 {d0}, [r12, :64] - vmovl.s8 q0, d0 + vld1.16 {q0}, [r12, :128] 1: mov r12, r4 @@ -622,8 +620,7 @@ do_8tap_8v avg, 4, 3 function \type\()_8tap_4v_\idx1\idx2 sub r2, r2, r3, lsl #1 sub r2, r2, r3 - vld1.8 {d0}, [r12, :64] - vmovl.s8 q0, d0 + vld1.16 {q0}, [r12, :128] vld1.32 {d2[]}, [r2], r3 vld1.32 {d3[]}, [r2], r3 @@ -693,8 +690,8 @@ function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1 ldr r4, [sp, #72] movrelx r12, X(ff_vp9_subpel_filters), r5 ldr r5, [sp, #80] - add r12, r12, 120*\offset - 8 - add r12, r12, r5, lsl #3 + add r12, r12, 256*\offset + add r12, r12, r5, lsl #4 cmp r5, #8 mov r5, #\size .if \size >= 8 |