summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x configure 3
-rw-r--r-- libavcodec/arm/aac.h 3
-rw-r--r-- libavcodec/arm/ac3dsp_arm.S 1
-rw-r--r-- libavcodec/arm/ac3dsp_armv6.S 2
-rw-r--r-- libavcodec/arm/ac3dsp_neon.S 1
-rw-r--r-- libavcodec/arm/asm.S 93
-rw-r--r-- libavcodec/arm/dcadsp_neon.S 1
-rw-r--r-- libavcodec/arm/dsputil_arm.S 10
-rw-r--r-- libavcodec/arm/dsputil_armv6.S 76
-rw-r--r-- libavcodec/arm/dsputil_neon.S 3
-rw-r--r-- libavcodec/arm/dsputil_vfp.S 21
-rw-r--r-- libavcodec/arm/fmtconvert_neon.S 4
-rw-r--r-- libavcodec/arm/fmtconvert_vfp.S 3
-rw-r--r-- libavcodec/arm/h264dsp_neon.S 98
-rw-r--r-- libavcodec/arm/h264idct_neon.S 23
-rw-r--r-- libavcodec/arm/mathops.h 3
-rw-r--r-- libavcodec/arm/mdct_neon.S 4
-rw-r--r-- libavcodec/arm/mpegaudiodsp_fixed_armv6.S 6
-rw-r--r-- libavcodec/arm/mpegvideo_armv5te_s.S 12
-rw-r--r-- libavcodec/arm/mpegvideo_neon.S 4
-rw-r--r-- libavcodec/arm/rdft_neon.S 1
-rw-r--r-- libavcodec/arm/simple_idct_arm.S 32
-rw-r--r-- libavcodec/arm/simple_idct_armv5te.S 39
-rw-r--r-- libavcodec/arm/simple_idct_armv6.S 33
-rw-r--r-- libavcodec/arm/simple_idct_neon.S 6
-rw-r--r-- libavcodec/arm/synth_filter_neon.S 2
-rw-r--r-- libavcodec/arm/vp56_arith.h 27
-rw-r--r-- libavcodec/arm/vp8_armv6.S 36
-rw-r--r-- libavcodec/arm/vp8dsp_neon.S 16
-rw-r--r-- libavutil/arm/intmath.h 2
30 files changed, 419 insertions, 146 deletions
diff --git a/configure b/configure
index 4032e68..672d452 100755
--- a/configure
+++ b/configure
@@ -967,6 +967,7 @@ CONFIG_LIST="
static
swscale
swscale_alpha
+ thumb
vaapi
vdpau
version3
@@ -2607,7 +2608,7 @@ if enabled alpha; then
elif enabled arm; then
- check_cflags -marm
+ enabled thumb && check_cflags -mthumb || check_cflags -marm
nogas=die
if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then
diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h
index 6d5df49..83b5aef 100644
--- a/libavcodec/arm/aac.h
+++ b/libavcodec/arm/aac.h
@@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
"vmov d1, %2, %3 \n\t"
"lsls %6, %6, #1 \n\t"
"and %0, %5, #1<<31 \n\t"
+ "it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %1, %5, #1<<31 \n\t"
+ "it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %2, %5, #1<<31 \n\t"
+ "it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"vmov d4, %0, %1 \n\t"
"and %3, %5, #1<<31 \n\t"
diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S
index 545714c..9a7d20e 100644
--- a/libavcodec/arm/ac3dsp_arm.S
+++ b/libavcodec/arm/ac3dsp_arm.S
@@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1
lsl r3, lr, #1
ldrh r12, [r0, r3]
subs r2, r2, #1
+ it gt
ldrbgt lr, [r1], #1
add r12, r12, #1
strh r12, [r0, r3]
diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S
index 8026cb7..d3058ff 100644
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
mov r11, r10
ldrb r10, [r4], #1 @ band_start_tab[band++]
subs r9, r9, r5 @ - floor
+ it lt
movlt r9, #0
cmp r10, r3 @ - end
and r9, r9, r8 @ & 0x1fe0
+ ite gt
subgt r8, r3, r11
suble r8, r10, r11
add r9, r9, r5 @ + floor => m
diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index 946b39f..fdf1dea 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -41,6 +41,7 @@ endfunc
function ff_ac3_exponent_min_neon, export=1
cmp r1, #0
+ it eq
bxeq lr
push {lr}
mov r12, #256
diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S
index 8d7fe98..a7d3ace 100644
--- a/libavcodec/arm/asm.S
+++ b/libavcodec/arm/asm.S
@@ -26,7 +26,16 @@
# define ELF @
#endif
+#if CONFIG_THUMB /* Thumb-2 build: ARM-only lines are dropped, Thumb-only lines are kept */
+# define A @ /* prefix becomes the assembler comment character, so the rest of the line is ignored */
+# define T /* prefix expands to nothing, so the rest of the line is assembled */
+#else /* ARM build: the two prefixes swap roles */
+# define A /* ARM-only lines are assembled */
+# define T @ /* Thumb-only lines are commented out */
+#endif
+
.syntax unified
+T .thumb
.macro require8 val=1
ELF .eabi_attribute 24, \val
@@ -82,6 +91,90 @@ ELF .size \name, . - \name
#endif
.endm
+.macro ldr_pre rt, rn, rm:vararg
+A ldr \rt, [\rn, \rm]!    @ ARM build: word load, pre-indexed by a register offset, base written back
+T add \rn, \rn, \rm    @ Thumb-2 build: no register-offset writeback form, so advance the base first
+T ldr \rt, [\rn]    @ Thumb-2 build: then load from the updated base
+.endm
+
+.macro ldr_post rt, rn, rm:vararg
+A ldr \rt, [\rn], \rm    @ ARM build: word load, then base += register offset (post-indexed)
+T ldr \rt, [\rn]    @ Thumb-2 build: load from the current base
+T add \rn, \rn, \rm    @ Thumb-2 build: then advance the base by the offset
+.endm
+
+.macro ldrd_reg rt, rt2, rn, rm
+A ldrd \rt, \rt2, [\rn, \rm]    @ ARM build: doubleword load from base + register offset, base unchanged
+T add \rt, \rn, \rm    @ Thumb-2 build: form the address in the first destination register (about to be overwritten anyway)
+T ldrd \rt, \rt2, [\rt]    @ Thumb-2 build: load the pair through that address; base register stays untouched
+.endm
+
+.macro ldrd_post rt, rt2, rn, rm
+A ldrd \rt, \rt2, [\rn], \rm    @ ARM build: doubleword load, then base += register offset (post-indexed)
+T ldrd \rt, \rt2, [\rn]    @ Thumb-2 build: load the pair from the current base
+T add \rn, \rn, \rm    @ Thumb-2 build: then advance the base by the offset
+.endm
+
+.macro ldrh_pre rt, rn, rm
+A ldrh \rt, [\rn, \rm]!    @ ARM build: halfword load, pre-indexed by a register offset, base written back
+T add \rn, \rn, \rm    @ Thumb-2 build: advance the base first
+T ldrh \rt, [\rn]    @ Thumb-2 build: then load the halfword from the updated base
+.endm
+
+.macro ldrh_dpre rt, rn, rm
+A ldrh \rt, [\rn, -\rm]!    @ ARM build: halfword load, base decremented by the register offset before the access, written back
+T sub \rn, \rn, \rm    @ Thumb-2 build: step the base down first
+T ldrh \rt, [\rn]    @ Thumb-2 build: then load the halfword from the updated base
+.endm
+
+.macro ldrh_post rt, rn, rm
+A ldrh \rt, [\rn], \rm    @ ARM build: halfword load, then base += register offset (post-indexed)
+T ldrh \rt, [\rn]    @ Thumb-2 build: load the halfword from the current base
+T add \rn, \rn, \rm    @ Thumb-2 build: then advance the base by the offset
+.endm
+
+.macro str_post rt, rn, rm:vararg
+A str \rt, [\rn], \rm    @ ARM build: word store, then base += register offset (post-indexed)
+T str \rt, [\rn]    @ Thumb-2 build: store to the current base
+T add \rn, \rn, \rm    @ Thumb-2 build: then advance the base by the offset
+.endm
+
+.macro strb_post rt, rn, rm:vararg
+A strb \rt, [\rn], \rm    @ ARM build: byte store, then base += register offset (post-indexed)
+T strb \rt, [\rn]    @ Thumb-2 build: store the byte to the current base
+T add \rn, \rn, \rm    @ Thumb-2 build: then advance the base by the offset
+.endm
+
+.macro strd_post rt, rt2, rn, rm
+A strd \rt, \rt2, [\rn], \rm    @ ARM build: doubleword store, then base += register offset (post-indexed)
+T strd \rt, \rt2, [\rn]    @ Thumb-2 build: store the pair to the current base
+T add \rn, \rn, \rm    @ Thumb-2 build: then advance the base by the offset
+.endm
+
+.macro strh_pre rt, rn, rm
+A strh \rt, [\rn, \rm]!    @ ARM build: halfword store, pre-indexed by a register offset, base written back
+T add \rn, \rn, \rm    @ Thumb-2 build: advance the base first
+T strh \rt, [\rn]    @ Thumb-2 build: then store the halfword to the updated base
+.endm
+
+.macro strh_dpre rt, rn, rm
+A strh \rt, [\rn, -\rm]!    @ ARM build: halfword store, base decremented by the register offset before the access, written back
+T sub \rn, \rn, \rm    @ Thumb-2 build: step the base down first
+T strh \rt, [\rn]    @ Thumb-2 build: then store the halfword to the updated base
+.endm
+
+.macro strh_post rt, rn, rm
+A strh \rt, [\rn], \rm    @ ARM build: halfword store, then base += register offset (post-indexed)
+T strh \rt, [\rn]    @ Thumb-2 build: store the halfword to the current base
+T add \rn, \rn, \rm    @ Thumb-2 build: then advance the base by the offset
+.endm
+
+.macro strh_dpost rt, rn, rm
+A strh \rt, [\rn], -\rm    @ ARM build: halfword store, then base -= register offset (post-decrement)
+T strh \rt, [\rn]    @ Thumb-2 build: store the halfword to the current base
+T sub \rn, \rn, \rm    @ Thumb-2 build: then step the base down by the offset
+.endm
+
#if HAVE_VFP_ARGS
.eabi_attribute 28, 1
# define VFP
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
index 6120836..71f5dd8 100644
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1
add r5, r2, #256*4-16 @ cf1
sub r1, r1, #12
cmp r3, #32
+ ite eq
moveq r6, #256/32
movne r6, #256/64
NOVFP vldr s0, [sp, #16] @ scale
diff --git a/libavcodec/arm/dsputil_arm.S b/libavcodec/arm/dsputil_arm.S
index 2117628..eb20ad6 100644
--- a/libavcodec/arm/dsputil_arm.S
+++ b/libavcodec/arm/dsputil_arm.S
@@ -554,10 +554,12 @@ endfunc
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
+ it eq
andeq r14, r14, r14, \rnd #1
add r8, r8, r10
add r9, r9, r11
ldr r12, =0xfcfcfcfc >> 2
+ itt eq
addeq r8, r8, r14
addeq r9, r9, r14
and r4, r12, r4, lsr #2
@@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #4] /* moved from [A] */
@@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
ldr r4, [r1, #4] /* moved from [B] */
@@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #12] /* moved from [D] */
@@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
add r0, r0, #16 /* moved from [E] */
diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S
index 8acb96d..b846105 100644
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1
ldr r5, [r1, #4]
ldr r6, [r1, #8]
ldr r7, [r1, #12]
- ldr r4, [r1], r2
+ ldr_post r4, r1, r2
strd r6, r7, [r0, #8]
ldr r9, [r1, #4]
- strd r4, r5, [r0], r2
+ strd_post r4, r5, r0, r2
ldr r10, [r1, #8]
ldr r11, [r1, #12]
- ldr r8, [r1], r2
+ ldr_post r8, r1, r2
strd r10, r11, [r0, #8]
subs r3, r3, #2
- strd r8, r9, [r0], r2
+ strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11}
@@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1
push {r4-r7}
1:
ldr r5, [r1, #4]
- ldr r4, [r1], r2
+ ldr_post r4, r1, r2
ldr r7, [r1, #4]
- strd r4, r5, [r0], r2
- ldr r6, [r1], r2
+ strd_post r4, r5, r0, r2
+ ldr_post r6, r1, r2
subs r3, r3, #2
- strd r6, r7, [r0], r2
+ strd_post r6, r7, r0, r2
bne 1b
pop {r4-r7}
@@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1
ldr r5, [r1, #4]
ldr r7, [r1, #5]
lsr r6, r4, #8
- ldr r8, [r1, r2]!
+ ldr_pre r8, r1, r2
orr r6, r6, r5, lsl #24
ldr r9, [r1, #4]
ldr r11, [r1, #5]
@@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1
uhadd8 r9, r9, r11
and r6, r6, r12
uadd8 r8, r8, r14
- strd r4, r5, [r0], r2
+ strd_post r4, r5, r0, r2
uadd8 r9, r9, r6
- strd r8, r9, [r0], r2
+ strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11, pc}
@@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1
orr r12, r12, r12, lsl #16
ldr r4, [r1]
ldr r5, [r1, #4]
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
@@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1
uhadd8 r9, r5, r7
eor r11, r5, r7
and r10, r10, r12
- ldr r4, [r1, r2]!
+ ldr_pre r4, r1, r2
uadd8 r8, r8, r10
and r11, r11, r12
uadd8 r9, r9, r11
@@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1
eor r7, r5, r7
uadd8 r10, r10, r6
and r7, r7, r12
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
uadd8 r11, r11, r7
- strd r8, r9, [r0], r2
+ strd_post r8, r9, r0, r2
ldr r7, [r1, #4]
- strd r10, r11, [r0], r2
+ strd_post r10, r11, r0, r2
bne 1b
pop {r4-r11}
@@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r7, [r1, #5]
- ldr r8, [r1, r2]!
+ ldr_pre r8, r1, r2
ldr r9, [r1, #4]
ldr r14, [r1, #5]
add r1, r1, r2
@@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1
push {r4-r9, lr}
ldr r4, [r1]
ldr r5, [r1, #4]
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
- ldr r4, [r1, r2]!
+ ldr_pre r4, r1, r2
uhadd8 r9, r5, r7
ldr r5, [r1, #4]
uhadd8 r12, r4, r6
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
uhadd8 r14, r5, r7
ldr r7, [r1, #4]
stm r0, {r8,r9}
@@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1
orr lr, lr, lr, lsl #16
ldrd r4, r5, [r0]
ldr r10, [r1, #4]
- ldr r9, [r1], r2
+ ldr_post r9, r1, r2
subs r3, r3, #2
1:
pld [r1, r2]
eor r8, r4, r9
uhadd8 r4, r4, r9
eor r12, r5, r10
- ldrd r6, r7, [r0, r2]
+ ldrd_reg r6, r7, r0, r2
uhadd8 r5, r5, r10
and r8, r8, lr
ldr r10, [r1, #4]
and r12, r12, lr
uadd8 r4, r4, r8
- ldr r9, [r1], r2
+ ldr_post r9, r1, r2
eor r8, r6, r9
uadd8 r5, r5, r12
pld [r1, r2, lsl #1]
eor r12, r7, r10
uhadd8 r6, r6, r9
- strd r4, r5, [r0], r2
+ strd_post r4, r5, r0, r2
uhadd8 r7, r7, r10
beq 2f
and r8, r8, lr
- ldrd r4, r5, [r0, r2]
+ ldrd_reg r4, r5, r0, r2
uadd8 r6, r6, r8
ldr r10, [r1, #4]
and r12, r12, lr
subs r3, r3, #2
uadd8 r7, r7, r12
- ldr r9, [r1], r2
- strd r6, r7, [r0], r2
+ ldr_post r9, r1, r2
+ strd_post r6, r7, r0, r2
b 1b
2:
and r8, r8, lr
and r12, r12, lr
uadd8 r6, r6, r8
uadd8 r7, r7, r12
- strd r6, r7, [r0], r2
+ strd_post r6, r7, r0, r2
pop {r4-r10, pc}
endfunc
@@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1
orr r6, r8, r5, lsl #8
orr r7, r4, lr, lsl #8
subs r3, r3, #1
- strd r6, r7, [r1], r2
+ strd_post r6, r7, r1, r2
bgt 1b
pop {r4-r8,pc}
endfunc
@@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1
push {r4-r8, lr}
mov lr, #8
1:
- ldrd r4, r5, [r1], r2
+ ldrd_post r4, r5, r1, r2
subs lr, lr, #1
uxtb16 r6, r4
uxtb16 r4, r4, ror #8
@@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1
push {r4-r9, lr}
mov lr, #8
1:
- ldrd r4, r5, [r1], r3
- ldrd r6, r7, [r2], r3
+ ldrd_post r4, r5, r1, r3
+ ldrd_post r6, r7, r2, r3
uxtb16 r8, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r6
@@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1
push {r4-r9, lr}
mov r0, #0
mov lr, #0
- ldrd r4, r5, [r1], r3
+ ldrd_post r4, r5, r1, r3
1:
subs r12, r12, #2
ldr r7, [r2, #4]
- ldr r6, [r2], r3
- ldrd r8, r9, [r1], r3
+ ldr_post r6, r2, r3
+ ldrd_post r8, r9, r1, r3
usada8 r0, r4, r6, r0
pld [r2, r3]
usada8 lr, r5, r7, lr
ldr r7, [r2, #4]
- ldr r6, [r2], r3
+ ldr_post r6, r2, r3
beq 2f
- ldrd r4, r5, [r1], r3
+ ldrd_post r4, r5, r1, r3
usada8 r0, r8, r6, r0
pld [r2, r3]
usada8 lr, r9, r7, lr
@@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
- ldr r4, [r0, r1]!
+ ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3
bgt 1b
2:
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 5b80e40..5e3bf27 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1
2: vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
+ it lt
bxlt lr
3: vld1.32 {d2-d3}, [r1,:128]
@@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2
2: vst1.32 {q2},[r0,:128]!
vst1.32 {q3},[r0,:128]!
ands len, len, #15
+ it eq
bxeq lr
3: vld1.32 {q0},[r1,:128]!
vmul.f32 q0, q0, q8
@@ -638,6 +640,7 @@ NOVFP ldr r3, [sp]
2: vst1.32 {q8},[r0,:128]!
vst1.32 {q9},[r0,:128]!
ands r3, r3, #7
+ it eq
popeq {pc}
3: vld1.32 {q0},[r1,:128]!
ldr r12, [r2], #4
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
index 16ea25a..cbc4bd6 100644
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1
1:
subs r3, r3, #16
vmul.f32 s12, s4, s12
+ itttt ge
vldmiage r1!, {s16-s19}
vldmiage r2!, {s24-s27}
vldmiage r1!, {s20-s23}
vldmiage r2!, {s28-s31}
+ it ge
vmulge.f32 s24, s16, s24
vstmia r0!, {s8-s11}
vstmia r0!, {s12-s15}
+ it ge
vmulge.f32 s28, s20, s28
+ itttt gt
vldmiagt r1!, {s0-s3}
vldmiagt r2!, {s8-s11}
vldmiagt r1!, {s4-s7}
vldmiagt r2!, {s12-s15}
+ ittt ge
vmulge.f32 s8, s0, s8
vstmiage r0!, {s24-s27}
vstmiage r0!, {s28-s31}
@@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1
vmul.f32 s11, s0, s11
1:
subs r3, r3, #16
+ it ge
vldmdbge r2!, {s16-s19}
vmul.f32 s12, s7, s12
+ it ge
vldmiage r1!, {s24-s27}
vmul.f32 s13, s6, s13
+ it ge
vldmdbge r2!, {s20-s23}
vmul.f32 s14, s5, s14
+ it ge
vldmiage r1!, {s28-s31}
vmul.f32 s15, s4, s15
+ it ge
vmulge.f32 s24, s19, s24
+ it gt
vldmdbgt r2!, {s0-s3}
+ it ge
vmulge.f32 s25, s18, s25
vstmia r0!, {s8-s13}
+ it ge
vmulge.f32 s26, s17, s26
+ it gt
vldmiagt r1!, {s8-s11}
+ itt ge
vmulge.f32 s27, s16, s27
vmulge.f32 s28, s23, s28
+ it gt
vldmdbgt r2!, {s4-s7}
+ it ge
vmulge.f32 s29, s22, s29
vstmia r0!, {s14-s15}
+ ittt ge
vmulge.f32 s30, s21, s30
vmulge.f32 s31, s20, s31
vmulge.f32 s8, s3, s8
+ it gt
vldmiagt r1!, {s12-s15}
+ itttt ge
vmulge.f32 s9, s2, s9
vmulge.f32 s10, s1, s10
vstmiage r0!, {s24-s27}
vmulge.f32 s11, s0, s11
+ it ge
vstmiage r0!, {s28-s31}
bgt 1b
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
index 4b0e9a2..45cc84b 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -71,6 +71,7 @@ endfunc
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
+ itt lt
ldrlt r1, [r1]
blt ff_float_to_int16_neon
bne 4f
@@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
subs r3, r3, #4
+ it eq
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8
@@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.32 {d23[1]}, [r8], ip
8: subs r3, r3, #2
add r0, r0, #4
+ it eq
popeq {r4-r8,pc}
@ 1 channel
@@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
+ it eq
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 1bb7f49..f7b0e3d 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
+ it gt
vldmiagt r1!, {s16-s23}
ssat r4, #16, r4
ssat r3, #16, r3
@@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16
+ itttt gt
vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
+ itttt gt
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index b76e447..0fa4a6b 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
pld [r1]
pld [r1, r2]
- muls r7, r4, r5
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb ip, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
@@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
pld [r1]
pld [r1, r2]
- muls r7, r4, r5
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb ip, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
@@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
pop {r4-r6, pc}
2:
.ifc \type,put
- ldrh r5, [r1], r2
- strh r5, [r0], r2
- ldrh r6, [r1], r2
- strh r6, [r0], r2
+ ldrh_post r5, r1, r2
+ strh_post r5, r0, r2
+ ldrh_post r6, r1, r2
+ strh_post r6, r0, r2
.else
vld1.16 {d16[0]}, [r1], r2
vld1.16 {d16[1]}, [r1], r2
@@ -404,28 +408,17 @@ endfunc
ldr ip, [sp]
tst r2, r2
ldr ip, [ip]
+ it ne
tstne r3, r3
vmov.32 d24[0], ip
and ip, ip, ip, lsl #16
+ it eq
bxeq lr
ands ip, ip, ip, lsl #8
+ it lt
bxlt lr
.endm
- .macro align_push_regs
- and ip, sp, #15
- add ip, ip, #32
- sub sp, sp, ip
- vst1.64 {d12-d15}, [sp,:128]
- sub sp, sp, #32
- vst1.64 {d8-d11}, [sp,:128]
- .endm
-
- .macro align_pop_regs
- vld1.64 {d8-d11}, [sp,:128]!
- vld1.64 {d12-d15}, [sp,:128], ip
- .endm
-
.macro h264_loop_filter_luma
vdup.8 q11, r2 @ alpha
vmovl.u8 q12, d24
@@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
vld1.64 {d18,d19}, [r0,:128], r1
vld1.64 {d16,d17}, [r0,:128], r1
- align_push_regs
+ vpush {d8-d15}
h264_loop_filter_luma
@@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
vst1.64 {d0, d1}, [r0,:128], r1
vst1.64 {d10,d11}, [r0,:128]
- align_pop_regs
+ vpop {d8-d15}
bx lr
endfunc
@@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
- align_push_regs
+ vpush {d8-d15}
h264_loop_filter_luma
@@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
vst1.32 {d1[1]}, [r0], r1
vst1.32 {d11[1]}, [r0], r1
- align_pop_regs
+ vpop {d8-d15}
bx lr
endfunc
@@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon
vrhadd.u8 d11, d11, d7
sub r0, r0, r2, lsl #3
.endif
+
vst1.64 {d12}, [r0,:64], r2
vst1.64 {d13}, [r0,:64], r2
vst1.64 {d14}, [r0,:64], r2
@@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
\type\()_h264_qpel8_mc11:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #64
mov r0, sp
sub r1, r1, #2
@@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
mov ip, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r11, pc}
endfunc
@@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
\type\()_h264_qpel8_mc21:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, #2
mov r3, #8
@@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4, r10, r11, pc}
endfunc
@@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
\type\()_h264_qpel8_mc12:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, r2, lsl #1
mov r3, r2
@@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
vpush {d8-d15}
bl put_h264_qpel8_v_lowpass_neon
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc22_neon, export=1
push {r4, r10, r11, lr}
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r4, r11, #15
+T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
@@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1
\type\()_h264_qpel16_mc11:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #256
mov r0, sp
sub r1, r1, #2
mov r3, #16
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #16
bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4, r11, pc}
endfunc
@@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1
\type\()_h264_qpel16_mc21:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, #2
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon_packed
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
@@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
\type\()_h264_qpel16_mc12:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, r2, lsl #1
mov r0, sp
@@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
vpush {d8-d15}
bl put_h264_qpel16_v_lowpass_neon_packed
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
mov r2, r3
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
@@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1
push {r4, r9-r11, lr}
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r4, r11, #15
+T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index 3c743e1..eadf2e7 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
+ it ne
movne lr, #0
cmp lr, #0
- adrne lr, ff_h264_idct_dc_add_neon
- adreq lr, ff_h264_idct_add_neon
+ ite ne
+ adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
blx lr
2: subs ip, ip, #1
add r1, r1, #32
@@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1
add r0, r0, r4
cmp r8, #0
ldrsh r8, [r1]
- adrne lr, ff_h264_idct_add_neon
- adreq lr, ff_h264_idct_dc_add_neon
+ iteet ne
+ adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
subs ip, ip, #1
@@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1
add r1, r3, r12, lsl #5
cmp r8, #0
ldrsh r8, [r1]
- adrne lr, ff_h264_idct_add_neon
- adreq lr, ff_h264_idct_dc_add_neon
+ iteet ne
+ adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
add r12, r12, #1
cmp r12, #4
+ itt eq
moveq r12, #16
moveq r4, r9
cmp r12, #20
@@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
+ it ne
movne lr, #0
cmp lr, #0
- adrne lr, ff_h264_idct8_dc_add_neon
- adreq lr, ff_h264_idct8_add_neon
+ ite ne
+ adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
blx lr
2: subs r12, r12, #4
add r1, r1, #128
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
index b27b18f..3803fcd 100644
--- a/libavcodec/arm/mathops.h
+++ b/libavcodec/arm/mathops.h
@@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c)
__asm__ (
"mov %0, %2 \n\t"
"cmp %1, %2 \n\t"
+ "itt gt \n\t"
"movgt %0, %1 \n\t"
"movgt %1, %2 \n\t"
"cmp %1, %3 \n\t"
+ "it le \n\t"
"movle %1, %3 \n\t"
"cmp %0, %1 \n\t"
+ "it gt \n\t"
"movgt %0, %1 \n\t"
: "=&r"(m), "+r"(a)
: "r"(b), "r"(c)
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
index c375f4c..1ba3067 100644
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1
vadd.f32 d17, d17, d3 @ in2u+in1d -I
1:
vmul.f32 d7, d0, d21 @ I*s
- ldr r10, [r3, lr, lsr #1]
+A ldr r10, [r3, lr, lsr #1]
+T lsr r10, lr, #1
+T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
index 9ec7314..b517b97 100644
--- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
sum8 r8, r9, r1, r0, r10, r11, r12, lr
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
round r10, r8, r9
- strh r10, [r3], r4
+ strh_post r10, r3, r4
mov lr, #15
1:
@@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
round r10, r8, r9
adds r8, r8, r4
adc r9, r9, r7
- strh r10, [r3], r12
+ strh_post r10, r3, r12
round r11, r8, r9
subs lr, lr, #1
- strh r11, [r5], -r12
+ strh_dpost r11, r5, r12
bgt 1b
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
index e346160..952c8d7 100644
--- a/libavcodec/arm/mpegvideo_armv5te_s.S
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -38,15 +38,21 @@
.macro dequant_t dst, src, mul, add, tmp
rsbs \tmp, ip, \src, asr #16    @ tmp = (src asr 16) - ip; the flags from this result drive all three conditionals below
+ it gt    @ Thumb-2 needs an IT block in front of each conditional instruction
addgt \tmp, \add, #0    @ result > 0: tmp = add (flags are not touched)
+ it lt
rsblt \tmp, \add, #0    @ result < 0: tmp = -add (flags are not touched)
+ it ne
smlatbne \dst, \src, \mul, \tmp    @ result != 0: dst = top half of src * bottom half of mul + tmp
.endm
.macro dequant_b dst, src, mul, add, tmp
rsbs \tmp, ip, \src, lsl #16    @ tmp = (src lsl 16) - ip; the flags from this result drive all three conditionals below
+ it gt    @ Thumb-2 needs an IT block in front of each conditional instruction
addgt \tmp, \add, #0    @ result > 0: tmp = add (flags are not touched)
+ it lt
rsblt \tmp, \add, #0    @ result < 0: tmp = -add (flags are not touched)
+ it ne
smlabbne \dst, \src, \mul, \tmp    @ result != 0: dst = bottom half of src * bottom half of mul + tmp
.endm
@@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1
strh lr, [r0], #2
subs r3, r3, #8
+ it gt
ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
bgt 1b
adds r3, r3, #2
+ it le
pople {r4-r9,pc}
2:
ldrsh r9, [r0, #0]
ldrsh lr, [r0, #2]
mov r8, r2
cmp r9, #0
+ it lt
rsblt r8, r2, #0
+ it ne
smlabbne r9, r9, r1, r8
mov r8, r2
cmp lr, #0
+ it lt
rsblt r8, r2, #0
+ it ne
smlabbne lr, lr, r1, r8
strh r9, [r0], #2
strh lr, [r0], #2
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 365dcf6..206a71a 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1
subs r3, r3, #16
vst1.16 {q0}, [r1,:128]!
vst1.16 {q8}, [r1,:128]!
+ it le
bxle lr
cmp r3, #8
bgt 1b
@@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldr r6, [r0, #AC_PRED]
add lr, r0, #INTER_SCANTAB_RASTER_END
cmp r6, #0
+ it ne
movne r12, #63
bne 1f
ldr r12, [r12, r2, lsl #2]
@@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldrsh r4, [r1]
cmp r5, #0
mov r5, r1
+ it ne
movne r2, #0
bne 2f
cmp r2, #4
+ it ge
addge r0, r0, #4
sub r2, r3, #1
ldr r6, [r0, #Y_DC_SCALE]
diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S
index 8aafdc9..fba275e 100644
--- a/libavcodec/arm/rdft_neon.S
+++ b/libavcodec/arm/rdft_neon.S
@@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1
vst1.32 {d22}, [r5,:64]
cmp r6, #0
+ it eq
popeq {r4-r8,pc}
vmul.f32 d22, d22, d18
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
index 4e6dfa4..717b12c 100644
--- a/libavcodec/arm/simple_idct_arm.S
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -121,11 +121,13 @@ __b_evaluation:
ldr r11, [r12, #offW7] @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- teq r2, #0 @ if null avoid muls
- mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if null avoid muls
+ itttt ne
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ -148,19 +150,23 @@ __b_evaluation:
@@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5]
- teq r3, #0 @ if null avoid muls
+ teq r3, #0 @ if null avoid muls
+ it ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7]
+ itttt ne
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
@@ R3 is free now
- teq r4, #0 @ if null avoid muls
+ teq r4, #0 @ if null avoid muls
+ itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
+ it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
@@ R4 is free now
__end_b_evaluation:
@@ -204,16 +210,19 @@ __a_evaluation:
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
- teq r11, #0 @ if null avoid muls
+ teq r11, #0 @ if null avoid muls
+ it ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
+ itttt ne
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
- teq r9, #0 @ if null avoid muls
+ teq r9, #0 @ if null avoid muls
+ itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ -222,6 +231,7 @@ __a_evaluation:
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
@@ -323,10 +333,12 @@ __b_evaluation2:
ldrsh r2, [r14, #48]
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if 0, then avoid muls
+ itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@@ -342,18 +354,22 @@ __b_evaluation2:
@@ MAC16(b1, -W5, col[7x8]);
ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
teq r3, #0 @ if 0 then avoid muls
+ itttt ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
+ it ne
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
@@ R3 is free now
teq r4, #0 @ if 0 then avoid muls
+ itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
+ it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
@@ R4 is free now
__end_b_evaluation2:
@@ -390,15 +406,18 @@ __a_evaluation2:
@@ a3 += W4*row[4]
ldrsh r11, [r14, #64] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
+ itttt ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
ldrsh r9, [r14, #96] @ R9=ROWr16[6]
+ it ne
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
+ itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ -407,6 +426,7 @@ __a_evaluation2:
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation2:
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
index 29ebf5c..24641e4 100644
--- a/libavcodec/arm/simple_idct_armv5te.S
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -49,6 +49,7 @@ function idct_row_armv5te
ldrd v1, [a1, #8]
ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
orrs v1, v1, v2
+ itt eq
cmpeq v1, a4
cmpeq v1, a3, lsr #16
beq row_dc_only
@@ -269,6 +270,7 @@ function idct_col_armv5te
ldmfd sp!, {a3, a4}
adds a2, a3, v1
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add ip, a4, v2
mov ip, ip, asr #20
@@ -276,6 +278,7 @@ function idct_col_armv5te
str a2, [a1]
subs a3, a3, v1
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub a4, a4, v2
mov a4, a4, asr #20
@@ -285,6 +288,7 @@ function idct_col_armv5te
subs a2, a3, v3
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub ip, a4, v4
mov ip, ip, asr #20
@@ -292,6 +296,7 @@ function idct_col_armv5te
str a2, [a1, #(16*1)]
adds a3, a3, v3
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add a4, a4, v4
mov a4, a4, asr #20
@@ -301,6 +306,7 @@ function idct_col_armv5te
adds a2, a3, v5
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add ip, a4, v6
mov ip, ip, asr #20
@@ -308,6 +314,7 @@ function idct_col_armv5te
str a2, [a1, #(16*2)]
subs a3, a3, v5
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub a4, a4, v6
mov a4, a4, asr #20
@@ -317,6 +324,7 @@ function idct_col_armv5te
adds a2, a3, v7
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add ip, a4, fp
mov ip, ip, asr #20
@@ -324,6 +332,7 @@ function idct_col_armv5te
str a2, [a1, #(16*3)]
subs a3, a3, v7
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub a4, a4, fp
mov a4, a4, asr #20
@@ -335,15 +344,19 @@ endfunc
.macro clip dst, src:vararg /* dst = src operand, clamped to the range 0..255 */
movs \dst, \src /* copy the operand into dst and update the flags */
+ it mi /* IT prefix so the conditional move below is valid in Thumb-2 */
movmi \dst, #0 /* negative result: clamp to 0 */
cmp \dst, #255
+ it gt /* IT prefix so the conditional move below is valid in Thumb-2 */
movgt \dst, #255 /* result above 255: clamp to 255 */
.endm
.macro aclip dst, src:vararg /* dst = sum of the src operands, clamped to the range 0..255 */
adds \dst, \src /* add the operands given in src into dst and update the flags */
+ it mi /* IT prefix so the conditional move below is valid in Thumb-2 */
movmi \dst, #0 /* negative sum: clamp to 0 */
cmp \dst, #255
+ it gt /* IT prefix so the conditional move below is valid in Thumb-2 */
movgt \dst, #255 /* sum above 255: clamp to 255 */
.endm
@@ -370,35 +383,35 @@ function idct_col_put_armv5te
orr a2, a3, a4, lsl #8
rsb v2, lr, lr, lsl #3
ldmfd sp!, {a3, a4}
- strh a2, [v2, v1]!
+ strh_pre a2, v2, v1
sub a2, a3, v3
clip a2, a2, asr #20
sub ip, a4, v4
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
- strh a2, [v1, lr]!
+ strh_pre a2, v1, lr
add a3, a3, v3
clip a2, a3, asr #20
add a4, a4, v4
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
- strh a2, [v2, -lr]!
+ strh_dpre a2, v2, lr
add a2, a3, v5
clip a2, a2, asr #20
add ip, a4, v6
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
- strh a2, [v1, lr]!
+ strh_pre a2, v1, lr
sub a3, a3, v5
clip a2, a3, asr #20
sub a4, a4, v6
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
- strh a2, [v2, -lr]!
+ strh_dpre a2, v2, lr
add a2, a3, v7
clip a2, a2, asr #20
@@ -411,7 +424,7 @@ function idct_col_put_armv5te
sub a4, a4, fp
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
- strh a2, [v2, -lr]
+ strh_dpre a2, v2, lr
ldr pc, [sp], #4
endfunc
@@ -436,7 +449,7 @@ function idct_col_add_armv5te
ldr v1, [sp, #32]
sub a4, a4, v2
rsb v2, v1, v1, lsl #3
- ldrh ip, [v2, lr]!
+ ldrh_pre ip, v2, lr
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
@@ -448,7 +461,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
- ldrh ip, [lr, v1]!
+ ldrh_pre ip, lr, v1
sub a2, a3, v3
add a3, a3, v3
and v3, ip, #255
@@ -458,7 +471,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
add a4, a4, v4
- ldrh ip, [v2, -v1]!
+ ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
@@ -468,7 +481,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
- ldrh ip, [lr, v1]!
+ ldrh_pre ip, lr, v1
add a2, a3, v5
sub a3, a3, v5
and v3, ip, #255
@@ -478,7 +491,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, v6
- ldrh ip, [v2, -v1]!
+ ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
@@ -488,7 +501,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
- ldrh ip, [lr, v1]!
+ ldrh_pre ip, lr, v1
add a2, a3, v7
sub a3, a3, v7
and v3, ip, #255
@@ -498,7 +511,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, fp
- ldrh ip, [v2, -v1]!
+ ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index b2eb525..284eb1f 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -200,6 +200,7 @@ function idct_row_armv6
ldr r3, [r0, #8] /* r3 = row[3,1] */
ldr r2, [r0] /* r2 = row[2,0] */
orrs lr, lr, ip
+ itt eq
cmpeq lr, r3
cmpeq lr, r2, lsr #16
beq 1f
@@ -282,14 +283,14 @@ function idct_col_put_armv6
pop {r1, r2}
idct_finish_shift_sat COL_SHIFT
- strb r4, [r1], r2
- strb r5, [r1], r2
- strb r6, [r1], r2
- strb r7, [r1], r2
- strb r11,[r1], r2
- strb r10,[r1], r2
- strb r9, [r1], r2
- strb r8, [r1], r2
+ strb_post r4, r1, r2
+ strb_post r5, r1, r2
+ strb_post r6, r1, r2
+ strb_post r7, r1, r2
+ strb_post r11,r1, r2
+ strb_post r10,r1, r2
+ strb_post r9, r1, r2
+ strb_post r8, r1, r2
sub r1, r1, r2, lsl #3
@@ -318,16 +319,16 @@ function idct_col_add_armv6
add ip, r3, ip, asr #COL_SHIFT
usat ip, #8, ip
add r4, r7, r4, asr #COL_SHIFT
- strb ip, [r1], r2
+ strb_post ip, r1, r2
ldrb ip, [r1, r2]
usat r4, #8, r4
ldrb r11,[r1, r2, lsl #2]
add r5, ip, r5, asr #COL_SHIFT
usat r5, #8, r5
- strb r4, [r1], r2
+ strb_post r4, r1, r2
ldrb r3, [r1, r2]
ldrb ip, [r1, r2, lsl #2]
- strb r5, [r1], r2
+ strb_post r5, r1, r2
ldrb r7, [r1, r2]
ldrb r4, [r1, r2, lsl #2]
add r6, r3, r6, asr #COL_SHIFT
@@ -340,11 +341,11 @@ function idct_col_add_armv6
usat r8, #8, r8
add lr, r4, lr, asr #COL_SHIFT
usat lr, #8, lr
- strb r6, [r1], r2
- strb r10,[r1], r2
- strb r9, [r1], r2
- strb r8, [r1], r2
- strb lr, [r1], r2
+ strb_post r6, r1, r2
+ strb_post r10,r1, r2
+ strb_post r9, r1, r2
+ strb_post r8, r1, r2
+ strb_post lr, r1, r2
sub r1, r1, r2, lsl #3
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index 0628b96..cbed9ee 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -71,7 +71,7 @@ function idct_row4_pld_neon
add r3, r0, r1, lsl #2
pld [r0, r1]
pld [r0, r1, lsl #1]
- pld [r3, -r1]
+A pld [r3, -r1]
pld [r3]
pld [r3, r1]
add r3, r3, r1, lsl #1
@@ -164,6 +164,7 @@ function idct_col4_neon
orrs r4, r4, r5
idct_col4_top
+ it eq
addeq r2, r2, #16
beq 1f
@@ -176,6 +177,7 @@ function idct_col4_neon
1: orrs r6, r6, r7
ldrd r4, [r2, #16]
+ it eq
addeq r2, r2, #16
beq 2f
@@ -187,6 +189,7 @@ function idct_col4_neon
2: orrs r4, r4, r5
ldrd r4, [r2, #16]
+ it eq
addeq r2, r2, #16
beq 3f
@@ -199,6 +202,7 @@ function idct_col4_neon
vadd.i32 q13, q13, q8
3: orrs r4, r4, r5
+ it eq
addeq r2, r2, #16
beq 4f
diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S
index d4f67b7..1d6e5b2 100644
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale
vst1.32 {q9}, [r2,:128]
subs r1, r1, #1
+ it eq
popeq {r4-r11,pc}
cmp r4, #0
+ itt eq
subeq r8, r8, #512*4
subeq r9, r9, #512*4
sub r5, r5, #512*4
diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index 9ce3fd0..ef30ffe 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -21,6 +21,14 @@
#ifndef AVCODEC_ARM_VP56_ARITH_H
#define AVCODEC_ARM_VP56_ARITH_H
+#if CONFIG_THUMB /* A(x)/T(x) select inline-asm lines by instruction set */
+# define A(x) /* Thumb build: ARM-only lines are dropped */
+# define T(x) x /* Thumb build: Thumb-only lines are kept */
+#else
+# define A(x) x /* ARM build: ARM-only lines are kept */
+# define T(x) /* ARM build: Thumb-only lines are dropped */
+#endif
+
#if HAVE_ARMV6 && HAVE_INLINE_ASM
#define vp56_rac_get_prob vp56_rac_get_prob_armv6
@@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
unsigned bit;
__asm__ ("adds %3, %3, %0 \n"
+ "itt cs \n"
"cmpcs %7, %4 \n"
- "ldrcsh %2, [%4], #2 \n"
+ A("ldrcsh %2, [%4], #2 \n")
+ T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
+ T("itttt cs \n")
"rev16cs %2, %2 \n"
- "orrcs %1, %1, %2, lsl %3 \n"
+ T("lslcs %2, %2, %3 \n")
+ T("orrcs %1, %1, %2 \n")
+ A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"cmp %1, %0, lsl #16 \n"
+ "ittte ge \n"
"subge %1, %1, %0, lsl #16 \n"
"subge %0, %5, %0 \n"
"movge %2, #1 \n"
@@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
unsigned tmp;
__asm__ ("adds %3, %3, %0 \n"
+ "itt cs \n"
"cmpcs %7, %4 \n"
- "ldrcsh %2, [%4], #2 \n"
+ A("ldrcsh %2, [%4], #2 \n")
+ T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
+ T("itttt cs \n")
"rev16cs %2, %2 \n"
- "orrcs %1, %1, %2, lsl %3 \n"
+ T("lslcs %2, %2, %3 \n")
+ T("orrcs %1, %1, %2 \n")
+ A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"lsl %2, %0, #16 \n"
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
index 602c8a5..1d89c68 100644
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@@ -25,13 +25,18 @@
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
rsb \h, \pr, #256
+ it cs
ldrhcs \t1, [\buf], #2
smlabb \h, \t0, \pr, \h
+T itttt cs
rev16cs \t1, \t1
- orrcs \cw, \cw, \t1, lsl \bs
+A orrcs \cw, \cw, \t1, lsl \bs
+T lslcs \t1, \t1, \bs
+T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
+ itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
@@ -40,14 +45,20 @@
adds \bs, \bs, \t0
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
+ it cs
ldrhcs \t1, [\buf], #2
mov \h, #128
+ it cs
rev16cs \t1, \t1
add \h, \h, \t0, lsl #7
- orrcs \cw, \cw, \t1, lsl \bs
+A orrcs \cw, \cw, \t1, lsl \bs
+T ittt cs
+T lslcs \t1, \t1, \bs
+T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
+ itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
@@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1
cmp r3, #0
ldr r11, [r5]
ldm r0, {r5-r7} @ high, bits, buf
+ it ne
pkhtbne r11, r11, r11, asr #16
ldr r8, [r0, #16] @ code_word
0:
@@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1
adds r6, r6, r9
add r4, r4, #11
lsl r8, r8, r9
+ it cs
ldrhcs r10, [r7], #2
lsl r9, r5, r9
mov r5, #128
+ it cs
rev16cs r10, r10
add r5, r5, r9, lsl #7
- orrcs r8, r8, r10, lsl r6
+T ittt cs
+T lslcs r10, r10, r6
+T orrcs r8, r8, r10
+A orrcs r8, r8, r10, lsl r6
subcs r6, r6, #16
lsr r5, r5, #8
cmp r8, r5, lsl #16
movrel r10, zigzag_scan-1
+ itt ge
subge r8, r8, r5, lsl #16
subge r5, r9, r5
ldrb r10, [r10, r3]
+ it ge
rsbge r12, r12, #0
cmp r3, #16
strh r12, [r1, r10]
@@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1
ldr r0, [sp]
ldr r9, [r0, #12]
cmp r7, r9
+ it hi
movhi r7, r9
stm r0, {r5-r7} @ high, bits, buf
str r8, [r0, #16] @ code_word
@@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #2
ldrb r0, [r4, #4]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, #1
ldrb r9, [lr, r5]
blt 4f
ldrb r0, [r4, #5]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, #1
ldrb r9, [lr, r5]
b 4f
@@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #5
mov r0, #159
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
@@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #7
mov r0, #165
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, r12, #2
ldrb r9, [lr, r5]
mov r0, #145
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
3:
ldrb r0, [r4, #8]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r4, r4, #1
ldrb r9, [lr, r5]
+ ite ge
movge r12, #2
movlt r12, #0
ldrb r0, [r4, #9]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
mov r9, #8
+ it ge
addge r12, r12, #1
movrel r4, X(ff_vp8_dct_cat_prob)
lsl r9, r9, r12
@@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1
lsl r1, r1, #1
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r0, [r4], #1
+ it ge
addge r1, r1, #1
cmp r0, #0
bne 1b
@@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1
add r4, r2, r4
add r4, r4, #22
rac_get_128 r5, r6, r7, r8, r9, r10
+ it ge
rsbge r12, r12, #0
smulbb r12, r12, r11
movrel r9, zigzag_scan-1
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index 92b1ef4..1b9f24e 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1
push {r4-r6,lr}
1:
subs r12, r12, #4
- ldr r4, [r2], r3
- ldr r5, [r2], r3
- ldr r6, [r2], r3
- ldr lr, [r2], r3
- str r4, [r0], r1
- str r5, [r0], r1
- str r6, [r0], r1
- str lr, [r0], r1
+ ldr_post r4, r2, r3
+ ldr_post r5, r2, r3
+ ldr_post r6, r2, r3
+ ldr_post lr, r2, r3
+ str_post r4, r0, r1
+ str_post r5, r0, r1
+ str_post r6, r0, r1
+ str_post lr, r0, r1
bgt 1b
pop {r4-r6,pc}
endfunc
diff --git a/libavutil/arm/intmath.h b/libavutil/arm/intmath.h
index 4340b59..b6a45c1 100644
--- a/libavutil/arm/intmath.h
+++ b/libavutil/arm/intmath.h
@@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b)
int r;
__asm__ ("cmp %2, #2 \n\t"
"ldr %0, [%3, %2, lsl #2] \n\t"
+ "ite le \n\t"
"lsrle %0, %1, #1 \n\t"
"smmulgt %0, %0, %1 \n\t"
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
@@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a)
{
int x, y;
__asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t"
+ "itet ne \n\t"
"mvnne %1, #1<<31 \n\t"
"moveq %0, %Q2 \n\t"
"eorne %0, %1, %R2, asr #31 \n\t"
OpenPOWER on IntegriCloud