diff options
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- | libavcodec/arm/vp9itxfm_neon.S | 75 |
1 files changed, 66 insertions, 9 deletions
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 2049241..5abe435 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -659,9 +659,8 @@ endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ transpose into a horizontal 16x4 slice and store. @ r0 = dst (temp buffer) -@ r1 = unused +@ r1 = slice offset @ r2 = src -@ r3 = slice offset function \txfm\()16_1d_4x16_pass1_neon mov r12, #32 vmov.s16 q2, #0 @@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 @ Store the transposed 4x4 blocks horizontally. - cmp r3, #12 + cmp r1, #12 beq 1f .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! .endr bx lr 1: - @ Special case: For the last input column (r3 == 12), + @ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @ don't store the first 4x4 block, but keep it in registers @ for the first slice of the second pass (where it is the @@ -781,15 +780,22 @@ endfunc itxfm16_1d_funcs idct itxfm16_1d_funcs iadst +@ This is the minimum eob value for each subpartition, in increments of 4 +const min_eob_idct_idct_16, align=4 + .short 0, 10, 38, 89 +endconst + .macro itxfm_func16x16 txfm1, txfm2 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct cmp r3, #1 beq idct16x16_dc_add_neon .endif - push {r4-r7,lr} + push {r4-r8,lr} .ifnc \txfm1\()_\txfm2,idct_idct vpush {q4-q7} +.else + movrel r8, min_eob_idct_idct_16 + 2 .endif @ Align the stack, allocate a temp buffer @@ -810,10 +816,36 @@ A and r7, sp, #15 .irp i, 0, 4, 8, 12 add r0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i > 0 + ldrh_post r1, r8, #2 + cmp r3, r1 + it le + movle r1, #(16 - \i)/4 + ble 1f +.endif +.endif + mov r1, #\i add r2, r6, #(\i*2) - mov r3, #\i bl \txfm1\()16_1d_4x16_pass1_neon .endr + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + @ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register + @ passthrough of coefficients to pass 2 and clear the end of the temp buffer + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + subs r1, r1, #1 +.rept 4 + vst1.16 {q14-q15}, [r0,:128]! +.endr + bne 2b +3: +.endif + .ifc \txfm1\()_\txfm2,iadst_idct movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -830,7 +862,7 @@ A and r7, sp, #15 .ifnc \txfm1\()_\txfm2,idct_idct vpop {q4-q7} .endif - pop {r4-r7,pc} + pop {r4-r8,pc} endfunc .endm @@ -1110,11 +1142,16 @@ function idct32_1d_4x32_pass2_neon bx lr endfunc +const min_eob_idct_idct_32, align=4 + .short 0, 9, 34, 70, 135, 240, 336, 448 +endconst + function ff_vp9_idct_idct_32x32_add_neon, export=1 cmp r3, #1 beq idct32x32_dc_add_neon - push {r4-r7,lr} + push {r4-r8,lr} vpush {q4-q7} + movrel r8, min_eob_idct_idct_32 + 2 @ Align the stack, allocate a temp buffer T mov r7, sp @@ -1129,9 +1166,29 @@ A and r7, sp, #15 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r0, sp, #(\i*64) +.if \i > 0 + ldrh_post r1, r8, #2 + cmp r3, r1 + it le + movle r1, #(32 - \i)/2 + ble 1f +.endif add r2, r6, #(\i*2) bl idct32_1d_4x32_pass1_neon .endr + b 3f + +1: + @ Write zeros to the temp buffer for pass 2 + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + subs r1, r1, #1 +.rept 4 + vst1.16 {q14-q15}, [r0,:128]! +.endr + bne 2b +3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r0, r4, #(\i) mov r1, r5 @@ -1141,5 +1198,5 @@ A and r7, sp, #15 add sp, sp, r7 vpop {q4-q7} - pop {r4-r7,pc} + pop {r4-r8,pc} endfunc |