From 89d9869d2491d4209d707a8e7f29c58227ae5a4e Mon Sep 17 00:00:00 2001
From: Alexandra Hájková
Date: Wed, 12 Apr 2017 09:29:44 +0200
Subject: hevc: Add NEON 16x16 IDCT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The speedup vs C code is around 6-13x.

Signed-off-by: Martin Storsjö
---
 libavcodec/arm/hevc_idct.S        | 197 ++++++++++++++++++++++++++++++++++++++
 libavcodec/arm/hevcdsp_init_arm.c |   4 +
 2 files changed, 201 insertions(+)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..156d476 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -222,7 +222,204 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 endfunc
 .endm
 
+.macro butterfly e, o, tmp_p, tmp_m
+        vadd.s32        \tmp_p, \e, \o
+        vsub.s32        \tmp_m, \e, \o
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7
+        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15
+
+        vmull.s16       q12, \in1, \in0[0]
+        vmull.s16       q13, \in1, \in0[1]
+        vmull.s16       q14, \in1, \in0[2]
+        vmull.s16       q15, \in1, \in0[3]
+        sum_sub         q12, \in3, \in0[1], +
+        sum_sub         q13, \in3, \in0[3], -
+        sum_sub         q14, \in3, \in0[0], -
+        sum_sub         q15, \in3, \in0[2], -
+
+        sum_sub         q12, \in5, \in0[2], +
+        sum_sub         q13, \in5, \in0[0], -
+        sum_sub         q14, \in5, \in0[3], +
+        sum_sub         q15, \in5, \in0[1], +
+
+        sum_sub         q12, \in7, \in0[3], +
+        sum_sub         q13, \in7, \in0[2], -
+        sum_sub         q14, \in7, \in0[1], +
+        sum_sub         q15, \in7, \in0[0], -
+
+        butterfly       q8,  q12, q0, q7
+        butterfly       q9,  q13, q1, q6
+        butterfly       q10, q14, q2, q5
+        butterfly       q11, q15, q3, q4
+        add             r4, sp, #512
+        vst1.s16        {q0-q1}, [r4, :128]!
+        vst1.s16        {q2-q3}, [r4, :128]!
+        vst1.s16        {q4-q5}, [r4, :128]!
+        vst1.s16        {q6-q7}, [r4, :128]
+.endm
+
+.macro load16 in0, in1, in2, in3, in4, in5, in6, in7
+        vld1.s16        {\in0}, [r1, :64], r2
+        vld1.s16        {\in1}, [r3, :64], r2
+        vld1.s16        {\in2}, [r1, :64], r2
+        vld1.s16        {\in3}, [r3, :64], r2
+        vld1.s16        {\in4}, [r1, :64], r2
+        vld1.s16        {\in5}, [r3, :64], r2
+        vld1.s16        {\in6}, [r1, :64], r2
+        vld1.s16        {\in7}, [r3, :64], r2
+.endm
+
+.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7
+        sum_sub         q5,  \in, \t0, \op0
+        sum_sub         q6,  \in, \t1, \op1
+        sum_sub         q7,  \in, \t2, \op2
+        sum_sub         q8,  \in, \t3, \op3
+        sum_sub         q9,  \in, \t4, \op4
+        sum_sub         q10, \in, \t5, \op5
+        sum_sub         q11, \in, \t6, \op6
+        sum_sub         q12, \in, \t7, \op7
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        vadd.s32        q4, \in0, \in1
+        vsub.s32        \in0, \in0, \in1
+        vadd.s32        \in1, \in2, \in3
+        vsub.s32        \in2, \in2, \in3
+        vadd.s32        \in3, \in4, \in5
+        vsub.s32        \in4, \in4, \in5
+        vadd.s32        \in5, \in6, \in7
+        vsub.s32        \in6, \in6, \in7
+.endm
+
+.macro store16 in0, in1, in2, in3, in4, in5, in6, in7
+        vst1.s16        \in0, [r1, :64], r2
+        vst1.s16        \in1, [r3, :64], r4
+        vst1.s16        \in2, [r1, :64], r2
+        vst1.s16        \in3, [r3, :64], r4
+        vst1.s16        \in4, [r1, :64], r2
+        vst1.s16        \in5, [r3, :64], r4
+        vst1.s16        \in6, [r1, :64], r2
+        vst1.s16        \in7, [r3, :64], r4
+.endm
+
+.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, in3, in4, in5, in6, in7, shift
+        vqrshrn.s32     \out0, \in0, \shift
+        vqrshrn.s32     \out1, \in1, \shift
+        vqrshrn.s32     \out2, \in2, \shift
+        vqrshrn.s32     \out3, \in3, \shift
+        vqrshrn.s32     \out4, \in4, \shift
+        vqrshrn.s32     \out5, \in5, \shift
+        vqrshrn.s32     \out6, \in6, \shift
+        vqrshrn.s32     \out7, \in7, \shift
+.endm
+
+.macro tr_16x4 name, shift
+function func_tr_16x4_\name
+        mov             r1, r5
+        add             r3, r5, #64
+        mov             r2, #128
+        load16          d0, d1, d2, d3, d4, d5, d6, d7
+        movrel          r1, trans
+
+        tr16_8x4        d0, d1, d2, d3, d4, d5, d6, d7
+
+        add             r1, r5, #32
+        add             r3, r5, #(64 + 32)
+        mov             r2, #128
+        load16          d8, d9, d2, d3, d4, d5, d6, d7
+        movrel          r1, trans + 16
+        vld1.s16        {q0}, [r1, :128]
+        vmull.s16       q5,  d8, d0[0]
+        vmull.s16       q6,  d8, d0[1]
+        vmull.s16       q7,  d8, d0[2]
+        vmull.s16       q8,  d8, d0[3]
+        vmull.s16       q9,  d8, d1[0]
+        vmull.s16       q10, d8, d1[1]
+        vmull.s16       q11, d8, d1[2]
+        vmull.s16       q12, d8, d1[3]
+
+        add_member      d9, d0[1], d1[0], d1[3], d1[1], d0[2], d0[0], d0[3], d1[2], +, +, +, -, -, -, -, -
+        add_member      d2, d0[2], d1[3], d0[3], d0[1], d1[2], d1[0], d0[0], d1[1], +, +, -, -, -, +, +, +
+        add_member      d3, d0[3], d1[1], d0[1], d1[3], d0[0], d1[2], d0[2], d1[0], +, -, -, +, +, +, -, -
+        add_member      d4, d1[0], d0[2], d1[2], d0[0], d1[3], d0[1], d1[1], d0[3], +, -, -, +, -, -, +, +
+        add_member      d5, d1[1], d0[0], d1[0], d1[2], d0[1], d0[3], d1[3], d0[2], +, -, +, +, -, +, +, -
+        add_member      d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0], d0[1], +, -, +, -, +, +, -, +
+        add_member      d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1], d0[0], +, -, +, -, +, -, +, -
+
+        add             r4, sp, #512
+        vld1.s16        {q0-q1}, [r4, :128]!
+        vld1.s16        {q2-q3}, [r4, :128]!
+
+        butterfly16     q0, q5, q1, q6, q2, q7, q3, q8
+        scale           d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1, q6, q2, q7, q3, \shift
+        transpose8_4x4  d26, d28, d30, d16
+        transpose8_4x4  d17, d31, d29, d27
+        mov             r1, r6
+        add             r3, r6, #(24 + 3 * 32)
+        mov             r2, #32
+        mov             r4, #-32
+        store16         d26, d27, d28, d29, d30, d31, d16, d17
+
+        add             r4, sp, #576
+        vld1.s16        {q0-q1}, [r4, :128]!
+        vld1.s16        {q2-q3}, [r4, :128]
+        butterfly16     q0, q9, q1, q10, q2, q11, q3, q12
+        scale           d26, d27, d28, d29, d30, d31, d8, d9, q4, q0, q9, q1, q10, q2, q11, q3, \shift
+        transpose8_4x4  d26, d28, d30, d8
+        transpose8_4x4  d9, d31, d29, d27
+
+        add             r1, r6, #8
+        add             r3, r6, #(16 + 3 * 32)
+        mov             r2, #32
+        mov             r4, #-32
+        store16         d26, d27, d28, d29, d30, d31, d8, d9
+
+        bx              lr
+endfunc
+.endm
+
+.macro idct_16x16 bitdepth
+function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
+@r0 - coeffs
+        push            {r4-r7, lr}
+        vpush           {q4-q7}
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7, sp
+T       and             r7, r7, #15
+A       and             r7, sp, #15
+        add             r7, r7, #640
+        sub             sp, sp, r7
+
+.irp i, 0, 1, 2, 3
+        add             r5, r0, #(8 * \i)
+        add             r6, sp, #(8 * \i * 16)
+        bl              func_tr_16x4_firstpass
+.endr
+
+.irp i, 0, 1, 2, 3
+        add             r5, sp, #(8 * \i)
+        add             r6, r0, #(8 * \i * 16)
+        bl              func_tr_16x4_secondpass_\bitdepth
+.endr
+
+        add             sp, sp, r7
+
+        vpop            {q4-q7}
+        pop             {r4-r7, pc}
+endfunc
+.endm
+
+tr_16x4 firstpass, 7
+tr_16x4 secondpass_8, 20 - 8
+tr_16x4 secondpass_10, 20 - 10
+.ltorg
+
 idct_4x4 8
 idct_4x4 10
 idct_8x8 8
 idct_8x8 10
+idct_16x16 8
+idct_16x16 10
diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c
index 1e984e6..e61587f 100644
--- a/libavcodec/arm/hevcdsp_init_arm.c
+++ b/libavcodec/arm/hevcdsp_init_arm.c
@@ -27,8 +27,10 @@
 
 void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
 
 av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, int bit_depth)
 {
@@ -38,10 +40,12 @@ av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, int bit_depth)
         if (bit_depth == 8) {
             c->idct[0] = ff_hevc_idct_4x4_8_neon;
             c->idct[1] = ff_hevc_idct_8x8_8_neon;
+            c->idct[2] = ff_hevc_idct_16x16_8_neon;
         }
         if (bit_depth == 10) {
             c->idct[0] = ff_hevc_idct_4x4_10_neon;
             c->idct[1] = ff_hevc_idct_8x8_10_neon;
+            c->idct[2] = ff_hevc_idct_16x16_10_neon;
         }
     }
 }
--
cgit v1.1
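
As a readable cross-check on the constants and sign patterns above, the
following scalar C sketch spells out the 16-point 1-D inverse transform
that func_tr_16x4 vectorizes. It is illustrative only: idct16_col, o16,
o8 and clip16 are invented names, not part of the patch or of
libavcodec. The tables repeat the values the assembly loads from trans
and trans + 16, with the signs of the add_member invocations, and the
saturating rounding shift mirrors vqrshrn.s32.

#include <stdint.h>

/* Odd-row basis of the HEVC 16-point transform: row k holds the
 * factors applied to input row 2*k + 1.  These are the constants the
 * patch loads from trans + 16 ({90, 87, 80, 70, 57, 43, 25, 9}),
 * signed per the add_member invocations. */
static const int o16[8][8] = {
    { 90,  87,  80,  70,  57,  43,  25,   9 },
    { 87,  57,   9, -43, -80, -90, -70, -25 },
    { 80,   9, -70, -87, -25,  57,  90,  43 },
    { 70, -43, -87,   9,  90,  25, -80, -57 },
    { 57, -80, -25,  90,  -9, -87,  43,  70 },
    { 43, -90,  57,  25, -87,  70,   9, -80 },
    { 25, -70,  90, -80,  43,   9, -57,  87 },
    {  9, -25,  43, -57,  70, -80,  87, -90 },
};

/* Odd-row basis of the embedded 8-point transform (rows 2, 6, 10, 14),
 * i.e. the {89, 75, 50, 18} part of the trans table. */
static const int o8[4][4] = {
    { 89,  75,  50,  18 },
    { 75, -18, -89, -50 },
    { 50, -89,  18,  75 },
    { 18, -50,  75, -89 },
};

/* Saturate to int16_t, as vqrshrn.s32 does when narrowing. */
static int16_t clip16(int v)
{
    return v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
}

/* One 16-element column; stride is in elements.  shift is 7 for the
 * first pass and 20 - bitdepth for the second, matching the tr_16x4
 * instantiations at the end of the .S file. */
static void idct16_col(int16_t *dst, const int16_t *src,
                       int stride, int shift)
{
    int e[8], o[8];
    int rnd = 1 << (shift - 1);

    /* Even half, rows 0/4/8/12: the 4-point transform computed by
     * tr_4x4_8 inside tr16_8x4. */
    int ee0 = 64 * src[0]          + 64 * src[8 * stride];
    int ee1 = 64 * src[0]          - 64 * src[8 * stride];
    int eo0 = 83 * src[4 * stride] + 36 * src[12 * stride];
    int eo1 = 36 * src[4 * stride] - 83 * src[12 * stride];
    int e4[4] = { ee0 + eo0, ee1 + eo1, ee1 - eo1, ee0 - eo0 };

    /* ... combined with rows 2/6/10/14 into the 8-point even half. */
    for (int j = 0; j < 4; j++) {
        int o4 = 0;
        for (int k = 0; k < 4; k++)
            o4 += o8[k][j] * src[(4 * k + 2) * stride];
        e[j]     = e4[j] + o4;
        e[7 - j] = e4[j] - o4;
    }

    /* Odd half, rows 1/3/.../15: what the vmull.s16/add_member
     * sequence accumulates in q5-q12. */
    for (int j = 0; j < 8; j++) {
        o[j] = 0;
        for (int k = 0; k < 8; k++)
            o[j] += o16[k][j] * src[(2 * k + 1) * stride];
    }

    /* Final butterfly plus rounding shift and saturating narrowing:
     * the butterfly16/scale steps. */
    for (int j = 0; j < 8; j++) {
        dst[j * stride]        = clip16((e[j] + o[j] + rnd) >> shift);
        dst[(15 - j) * stride] = clip16((e[j] - o[j] + rnd) >> shift);
    }
}

A full 16x16 inverse transform would run this over all 16 columns with
shift 7, then over the rows of the intermediate with shift
20 - bitdepth. The NEON version instead works on 16x4 strips and folds
the transpose into transpose8_4x4 and the interleaved +32/-32 store
strides of store16.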