diff options
author | Martin Storsjö <martin@martin.st> | 2016-12-17 13:14:38 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2017-02-11 00:08:50 +0200 |
commit | 388e0d2515bc6bbc9d0c9af1d230bd16cf945fe7 (patch) | |
tree | 361f70c8830c1d01601fd30d4ac28337df6a397b /libavcodec | |
parent | fea92a4b57d1c328b1de226a5f213a629ee63754 (diff) | |
download | ffmpeg-streaming-388e0d2515bc6bbc9d0c9af1d230bd16cf945fe7.zip ffmpeg-streaming-388e0d2515bc6bbc9d0c9af1d230bd16cf945fe7.tar.gz |
aarch64: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter
No measured speedup on a Cortex A53, but other cores might benefit.
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/aarch64/vp9mc_neon.S | 15 |
1 file changed, 13 insertions, 2 deletions
```diff
diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 99f1809..95ed26c 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -202,9 +202,12 @@ endfunc
         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
         mla             \dst2\().8h, v21.8h, v0.h[\offset]
         mla             \dst4\().8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
         mla             \dst1\().8h, v20.8h, v0.h[\offset]
         mla             \dst3\().8h, v22.8h, v0.h[\offset]
+.else
+        mla             \dst1\().4h, v20.4h, v0.h[\offset]
+        mla             \dst3\().4h, v22.4h, v0.h[\offset]
 .endif
 .endm
 // The same as above, but don't accumulate straight into the
@@ -219,16 +222,24 @@ endfunc
         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
         mul             v21.8h, v21.8h, v0.h[\offset]
         mul             v23.8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
         mul             v20.8h, v20.8h, v0.h[\offset]
         mul             v22.8h, v22.8h, v0.h[\offset]
+.else
+        mul             v20.4h, v20.4h, v0.h[\offset]
+        mul             v22.4h, v22.4h, v0.h[\offset]
 .endif
+.if \size == 4
+        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
+        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
+.else
         sqadd           \dst1\().8h, \dst1\().8h, v20.8h
         sqadd           \dst3\().8h, \dst3\().8h, v22.8h
 .if \size >= 16
         sqadd           \dst2\().8h, \dst2\().8h, v21.8h
         sqadd           \dst4\().8h, \dst4\().8h, v23.8h
 .endif
+.endif
 .endm
 
 
```