summaryrefslogtreecommitdiffstats
path: root/libavcodec/arm
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/arm')
-rw-r--r--libavcodec/arm/vp9itxfm_neon.S75
1 files changed, 66 insertions, 9 deletions
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 2049241..5abe435 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -659,9 +659,8 @@ endfunc
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x4 slice and store.
@ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = slice offset
@ r2 = src
-@ r3 = slice offset
function \txfm\()16_1d_4x16_pass1_neon
mov r12, #32
vmov.s16 q2, #0
@@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon
transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
@ Store the transposed 4x4 blocks horizontally.
- cmp r3, #12
+ cmp r1, #12
beq 1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
vst1.16 {d\i}, [r0,:64]!
.endr
bx lr
1:
- @ Special case: For the last input column (r3 == 12),
+ @ Special case: For the last input column (r1 == 12),
@ which would be stored as the last row in the temp buffer,
@ don't store the first 4x4 block, but keep it in registers
@ for the first slice of the second pass (where it is the
@@ -781,15 +780,22 @@ endfunc
itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
+@ Per-slice eob thresholds for pass 1 (4-column slices at offsets 0, 4, 8, 12): when eob <= an entry, that slice and all later ones are treated as all-zero and skipped
+const min_eob_idct_idct_16, align=4
+ .short 0, 10, 38, 89
+endconst
+
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
cmp r3, #1
beq idct16x16_dc_add_neon
.endif
- push {r4-r7,lr}
+ push {r4-r8,lr}
.ifnc \txfm1\()_\txfm2,idct_idct
vpush {q4-q7}
+.else
+ movrel r8, min_eob_idct_idct_16 + 2
.endif
@ Align the stack, allocate a temp buffer
@@ -810,10 +816,36 @@ A and r7, sp, #15
.irp i, 0, 4, 8, 12
add r0, sp, #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(16 - \i)/4
+ ble 1f
+.endif
+.endif
+ mov r1, #\i
add r2, r6, #(\i*2)
- mov r3, #\i
bl \txfm1\()16_1d_4x16_pass1_neon
.endr
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ @ For all-zero slices in pass 1, zero d28-d31 (used for the in-register
+ @ passthrough of coefficients to pass 2) and clear the rest of the temp buffer.
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 4
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.endif
+
.ifc \txfm1\()_\txfm2,iadst_idct
movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128]
@@ -830,7 +862,7 @@ A and r7, sp, #15
.ifnc \txfm1\()_\txfm2,idct_idct
vpop {q4-q7}
.endif
- pop {r4-r7,pc}
+ pop {r4-r8,pc}
endfunc
.endm
@@ -1110,11 +1142,16 @@ function idct32_1d_4x32_pass2_neon
bx lr
endfunc
+const min_eob_idct_idct_32, align=4
+ .short 0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp r3, #1
beq idct32x32_dc_add_neon
- push {r4-r7,lr}
+ push {r4-r8,lr}
vpush {q4-q7}
+ movrel r8, min_eob_idct_idct_32 + 2
@ Align the stack, allocate a temp buffer
T mov r7, sp
@@ -1129,9 +1166,29 @@ A and r7, sp, #15
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add r0, sp, #(\i*64)
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(32 - \i)/2
+ ble 1f
+.endif
add r2, r6, #(\i*2)
bl idct32_1d_4x32_pass1_neon
.endr
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 4
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add r0, r4, #(\i)
mov r1, r5
@@ -1141,5 +1198,5 @@ A and r7, sp, #15
add sp, sp, r7
vpop {q4-q7}
- pop {r4-r7,pc}
+ pop {r4-r8,pc}
endfunc
OpenPOWER on IntegriCloud