diff options
Diffstat (limited to 'libswscale/arm/rgb2yuv_neon_common.S')
-rw-r--r-- | libswscale/arm/rgb2yuv_neon_common.S | 291 |
1 files changed, 291 insertions, 0 deletions
diff --git a/libswscale/arm/rgb2yuv_neon_common.S b/libswscale/arm/rgb2yuv_neon_common.S new file mode 100644 index 0000000..30bcecd --- /dev/null +++ b/libswscale/arm/rgb2yuv_neon_common.S @@ -0,0 +1,291 @@
/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Shared building blocks for NEON RGBX/BGRX -> NV12/NV21 converters.
 *
 * This file only defines macros and register aliases.  The symbols
 * r16x8/g16x8/b16x8, y8x16, c8x8x2_l/c8x8x2_h and the macros
 * compute_y_16x1 / compute_chroma_8x1 are referenced here but not defined
 * here; presumably the file that includes this one provides them.
 */

#include "libavutil/arm/asm.S"

/*
 * alias name, tgt, set=1
 *   set != 0 : make <name> a register alias of <tgt> (.req)
 *   set == 0 : drop the alias <name> again (.unreq); <tgt> is ignored
 */
.macro alias name, tgt, set=1
.if \set != 0
    \name   .req    \tgt
.else
    .unreq  \name
.endif
.endm

/* .altmacro is needed for the %(expr) argument evaluation used below. */
.altmacro

/*
 * Recursively define q<N>_l / q<N>_h as aliases of the two D registers
 * given by dw_l / dw_h, then recurse with N + 1 and both D indices + 2
 * until N reaches 15.
 */
.macro alias_dw_all qw, dw_l, dw_h
    alias   q\qw\()_l, d\dw_l
    alias   q\qw\()_h, d\dw_h
    .if \qw < 15
        alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
    .endif
.endm

/* Instantiate q0_l = d0, q0_h = d1, ..., q15_l = d30, q15_h = d31. */
alias_dw_all 0, 0, 1

.noaltmacro

/*
 * Alias a Q register together with its halves:
 * <name> -> <qw>, <name>_l -> <qw>_l, <name>_h -> <qw>_h.
 * With set == 0 the three aliases are removed instead.
 */
.macro alias_qw name, qw, set=1
    alias   \name\(),   \qw,       \set
    alias   \name\()_l, \qw\()_l,  \set
    alias   \name\()_h, \qw\()_h,  \set
.endm

/* Save r4-r12 and lr (10 words), then q4-q7 (4 x 16 bytes). */
.macro prologue
    push    {r4-r12, lr}
    vpush   {q4-q7}
.endm

/* Undo the prologue; popping the saved lr into pc returns to the caller. */
.macro epilogue
    vpop    {q4-q7}
    pop     {r4-r12, pc}
.endm

/*
 * Load stack-passed argument number <ix> (ix >= 4) into <reg>.
 * The constant part of the offset skips what the prologue pushed:
 * 10 core registers of 4 bytes plus 4 Q registers of 16 bytes.
 * Only valid while sp is unchanged since the prologue.
 */
.macro load_arg reg, ix
    ldr     \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm


/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
 *              int width, int height,
 *              int y_stride, int c_stride, int src_stride,
 *              int32_t coeff_table[9]);
 */
/*
 * Register aliases used by loop_420sp (set == 0 removes them again).
 * Several names deliberately share a register:
 *   src0 == src (r0), y0 == y (r1)
 *   header == width (r3)           width is dead once header is computed
 *   c_padding == c_stride (r6)     c_stride is dead once padding is computed
 *   coeff_table == y1 (r12)        coeff_table is consumed before y1 is set
 */
.macro alias_loop_420sp set=1
    alias   src,         r0,        \set
    alias   src0,        src,       \set
    alias   y,           r1,        \set
    alias   y0,          y,         \set
    alias   chroma,      r2,        \set
    alias   width,       r3,        \set
    alias   header,      width,     \set

    alias   height,      r4,        \set
    alias   y_stride,    r5,        \set
    alias   c_stride,    r6,        \set
    alias   c_padding,   c_stride,  \set
    alias   src_stride,  r7,        \set

    alias   y0_end,      r8,        \set

    alias   src_padding, r9,        \set
    alias   y_padding,   r10,       \set

    alias   src1,        r11,       \set
    alias   y1,          r12,       \set

    alias   coeff_table, r12,       \set
.endm


/*
 * loop_420sp s_fmt, d_fmt, init, kernel, precision
 *
 * Emits the exported function <s_fmt>_to_<d_fmt>_neon_<precision> with the
 * prototype shown above.  Two source rows are converted per outer iteration
 * (src0/y0 and src1/y1) and share one chroma row.
 *
 *   init    macro invoked once as "init coeff_table"
 *   kernel  macro invoked as
 *           "kernel s_fmt, d_fmt, src0, src1, y0, y1, chroma[, count]"
 *           Judging by kernel_420_16x2 in this file, without count it
 *           advances all pointers by a full 16 pixel step; with count it
 *           still touches 16 pixels but advances by count pixels only.
 *
 * Row handling: header = width & 15.  If non-zero, the first kernel call
 * advances by header pixels so that the remaining width is a multiple of
 * 16; the following full steps overlap the pixels already written.
 *
 * NOTE(review): this scheme reads and writes 16 pixels per kernel call, so
 * it appears to require width >= 16 -- confirm that callers guarantee this.
 * NOTE(review): height is decremented by 2 per iteration and chroma is
 * advanced by header bytes, so even width and height are presumably
 * expected -- confirm against callers.
 */
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision

function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
    prologue

    alias_loop_420sp

    /* Arguments 4..8 live on the stack above the prologue save area. */
    load_arg    height,      4
    load_arg    y_stride,    5
    load_arg    c_stride,    6
    load_arg    src_stride,  7
    load_arg    coeff_table, 8

    /* Must run before y1 is written: coeff_table and y1 share r12. */
    \init       coeff_table

    /* Bytes to skip at the end of each row (source pixels are 4 bytes). */
    sub         y_padding,   y_stride,   width
    sub         c_padding,   c_stride,   width
    sub         src_padding, src_stride, width, LSL #2

    add         y0_end,      y0,         width
    and         header,      width,      #15     /* overwrites width (r3) */

    add         y1,          y0,         y_stride
    add         src1,        src0,       src_stride

0:
    /* Optional partial first step that absorbs width % 16. */
    cmp         header,      #0
    beq         1f

    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header

1:
    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma

    /*
     * y0 and y0_end are addresses, so the comparison must be unsigned.
     * A signed branch (blt) would leave the loop early for a row that
     * straddles 0x80000000.
     */
    cmp         y0,          y0_end
    blo         1b
2:
    /* y1/src1 now point just past the second row; step to the next pair. */
    add         y0,          y1,         y_padding
    add         y0_end,      y1,         y_stride
    add         chroma,      chroma,     c_padding
    add         src0,        src1,       src_padding

    add         y1,          y0,         y_stride
    add         src1,        src0,       src_stride

    subs        height,      height,     #2

    bgt         0b

    epilogue

    alias_loop_420sp 0

endfunc
.endm

/*
 * First row of a 2x2 block: pairwise add horizontally adjacent 8 bit
 * samples into 16 bit sums (overwrites r16x8/g16x8/b16x8).
 */
.macro downsample
    vpaddl.u8   r16x8, r8x16
    vpaddl.u8   g16x8, g8x16
    vpaddl.u8   b16x8, b8x16
.endm


/* accumulate and right shift by 2 */
/*
 * Second row of a 2x2 block: add the pairwise sums of this row onto the
 * sums left by downsample, then do a rounding shift right by 2, which
 * yields the rounded average of the four samples.
 */
.macro downsample_ars2
    vpadal.u8   r16x8, r8x16
    vpadal.u8   g16x8, g8x16
    vpadal.u8   b16x8, b8x16

    vrshr.u16   r16x8, r16x8, #2
    vrshr.u16   g16x8, g16x8, #2
    vrshr.u16   b16x8, b16x8, #2
.endm

/*
 * Store the 16 luma bytes in y8x16 at dst.
 *   count omitted : dst is advanced by the full store (writeback)
 *   count given   : 16 bytes are still stored, dst advances by count
 */
.macro store_y8_16x1 dst, count
.ifc "\count",""
    vstmia      \dst!, {y8x16}
.else
    vstmia      \dst, {y8x16}
    add         \dst, \dst, \count
.endif
.endm

/*
 * Store 8 interleaved chroma pairs in U,V order.
 *   count omitted : dst is advanced by the full store (writeback)
 *   count given   : dst is post-incremented by the register count
 */
.macro store_chroma_nv12_8x1 dst, count
.ifc "\count",""
    vst2.i8     {u8x8, v8x8}, [\dst]!
.else
    vst2.i8     {u8x8, v8x8}, [\dst], \count
.endif
.endm

/* Same as store_chroma_nv12_8x1 but with V,U byte order. */
.macro store_chroma_nv21_8x1 dst, count
.ifc "\count",""
    vst2.i8     {v8x8, u8x8}, [\dst]!
.else
    vst2.i8     {v8x8, u8x8}, [\dst], \count
.endif
.endm

/*
 * Load 16 four-byte pixels from src and de-interleave them into the
 * registers <a>8x16 .. <d>8x16 (low halves first, then high halves).
 *   count omitted : src advances by the 64 bytes that were read
 *   count given   : 64 bytes are still read, but src ends up advanced by
 *                   count * 4 bytes (the +32 of the first load is undone)
 */
.macro load_8888_16x1 a, b, c, d, src, count
.ifc "\count",""
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
    sub         \src, \src, #32
    add         \src, \src, \count, LSL #2
.endif
.endm

/* Byte order R,G,B,X in memory. */
.macro load_rgbx_16x1 src, count
    load_8888_16x1 r, g, b, x, \src, \count
.endm

/* Byte order B,G,R,X in memory. */
.macro load_bgrx_16x1 src, count
    load_8888_16x1 b, g, r, x, \src, \count
.endm

/* Name q3..q6 after the channels in R,G,B,X order (see alias_src_8888). */
.macro alias_src_rgbx set=1
    alias_src_8888 r, g, b, x, \set
.endm

/* Name q3..q6 after the channels in B,G,R,X order (see alias_src_8888). */
.macro alias_src_bgrx set=1
    alias_src_8888 b, g, r, x, \set
.endm

/* NV12: U is the low half of c8x8x2, V the high half. */
.macro alias_dst_nv12 set=1
    alias   u8x8, c8x8x2_l, \set
    alias   v8x8, c8x8x2_h, \set
.endm

/* NV21: V is the low half of c8x8x2, U the high half. */
.macro alias_dst_nv21 set=1
    alias   v8x8, c8x8x2_l, \set
    alias   u8x8, c8x8x2_h, \set
.endm


// common aliases

/* One D register per colour channel; lanes 0..2 are named Y, U, V. */
alias CO_R d0
CO_RY   .dn d0.s16[0]
CO_RU   .dn d0.s16[1]
CO_RV   .dn d0.s16[2]

alias CO_G d1
CO_GY   .dn d1.s16[0]
CO_GU   .dn d1.s16[1]
CO_GV   .dn d1.s16[2]

alias CO_B d2
CO_BY   .dn d2.s16[0]
CO_BU   .dn d2.s16[1]
CO_BV   .dn d2.s16[2]

/* U and V share a single bias register. */
alias BIAS_U, d3
alias BIAS_V, BIAS_U

alias BIAS_Y, q2


/* q3-q6 R8G8B8X8 x16 */

/*
 * Bind the four channel names <a>,<b>,<c>,<d> (each suffixed 8x16, plus
 * the _l/_h halves) to q3, q4, q5, q6 in that order; set == 0 unbinds.
 */
.macro alias_src_8888 a, b, c, d, set
    alias_qw    \a\()8x16, q3, \set
    alias_qw    \b\()8x16, q4, \set
    alias_qw    \c\()8x16, q5, \set
    alias_qw    \d\()8x16, q6, \set
.endm

/*
 * Convert a 16 pixel wide, 2 row high tile.
 *
 *   rgb_fmt, yuv_fmt  select the load/store/alias macros by name
 *   rgb0, rgb1        source pointers for the two rows
 *   y0, y1            luma destination pointers for the two rows
 *   chroma            interleaved chroma destination pointer
 *   count             optional; when given, pointers advance by count
 *                     pixels instead of a full 16 pixel step
 *
 * Row 0 seeds the 2x2 sums (downsample); row 1 completes and averages them
 * (downsample_ars2), so chroma is computed once after both rows.  The
 * format specific aliases exist only for the duration of the expansion.
 */
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
    alias_src_\rgb_fmt
    alias_dst_\yuv_fmt

    load_\rgb_fmt\()_16x1 \rgb0, \count

    downsample
    compute_y_16x1
    store_y8_16x1 \y0, \count


    load_\rgb_fmt\()_16x1 \rgb1, \count
    downsample_ars2
    compute_y_16x1
    store_y8_16x1 \y1, \count

    compute_chroma_8x1 u, U
    compute_chroma_8x1 v, V

    store_chroma_\yuv_fmt\()_8x1 \chroma, \count

    alias_dst_\yuv_fmt 0
    alias_src_\rgb_fmt 0
.endm