diff options
author | Martin Vignali <martin.vignali@gmail.com> | 2018-04-22 19:14:23 +0200 |
---|---|---|
committer | Martin Vignali <martin.vignali@gmail.com> | 2018-04-22 19:15:32 +0200 |
commit | 07a566e7d6fc4dbb4250da55d925b5591b3d03f9 (patch) | |
tree | a6d235e72da69b4a01dfb6c195972f9d6b131e4e /libswscale/x86 | |
parent | e6e4625862818043f1cb50d947c014fc4b8fb064 (diff) | |
download | ffmpeg-streaming-07a566e7d6fc4dbb4250da55d925b5591b3d03f9.zip ffmpeg-streaming-07a566e7d6fc4dbb4250da55d925b5591b3d03f9.tar.gz |
swscale/swscale_unscaled: add x86_64 (SSE2 and AVX) versions of uyvytoyuv422
and a checkasm test
Diffstat (limited to 'libswscale/x86')
-rw-r--r-- | libswscale/x86/rgb2rgb.c | 19 | ||||
-rw-r--r-- | libswscale/x86/rgb_2_rgb.asm | 150 |
2 files changed, 169 insertions, 0 deletions
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index e5f318a..1191081 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -150,6 +150,15 @@ void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
 void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);

+#if ARCH_X86_64 /* both kernels are emitted only under %if ARCH_X86_64 in rgb_2_rgb.asm */
+void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                         const uint8_t *src, int width, int height,
+                         int lumStride, int chromStride, int srcStride);
+#endif
+
 av_cold void rgb2rgb_init_x86(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -167,6 +176,11 @@ av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_avx();
 #endif /* HAVE_INLINE_ASM */

+    if (EXTERNAL_SSE2(cpu_flags)) { /* body is empty when ARCH_X86_64 is not set */
+#if ARCH_X86_64
+        uyvytoyuv422 = ff_uyvytoyuv422_sse2;
+#endif
+    }
     if (EXTERNAL_SSSE3(cpu_flags)) {
         shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
         shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
@@ -174,4 +188,9 @@ av_cold void rgb2rgb_init_x86(void)
         shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
         shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
     }
+    if (EXTERNAL_AVX(cpu_flags)) { /* checked after SSE2, so this assignment replaces the SSE2 pointer */
+#if ARCH_X86_64
+        uyvytoyuv422 = ff_uyvytoyuv422_avx;
+#endif
+    }
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index db45e31..156b4d2 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -32,6 +32,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

 SECTION .text

+%macro RSHIFT_COPY 3 ; copy %2 into %1 shifted right, leaving %2 untouched
+; %1 dst ; %2 src ; %3 shift (byte count for psrldq)
+%if cpuflag(avx)
+    psrldq  %1, %2, %3 ; three-operand form: shift and copy in one instruction
+%else
+    mova    %1, %2
+    RSHIFT  %1, %3 ; RSHIFT is defined outside this patch - presumably the same byte shift, TODO confirm
+%endif
+%endmacro
+
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
 ;------------------------------------------------------------------------------
@@ -84,3 +94,143 @@ SHUFFLE_BYTES 0, 3, 2, 1
 SHUFFLE_BYTES 1, 2, 3, 0
 SHUFFLE_BYTES 3, 0, 1, 2
 SHUFFLE_BYTES 3, 2, 1, 0
+
+;-----------------------------------------------------------------------------------------------
+; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+;              const uint8_t *src, int width, int height,
+;              int lumStride, int chromStride, int srcStride)
+;-----------------------------------------------------------------------------------------------
+%macro UYVY_TO_YUV422 0 ; splits packed U Y V Y bytes into separate Y, U and V planes, line by line
+cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+    pxor         m0, m0 ; NOTE(review): m0 is not read anywhere else in this macro
+    pcmpeqw      m1, m1 ; m1 = all ones
+    psrlw        m1, 8 ; m1 = 0x00FF in every word: keeps the low byte of each byte pair
+
+    movsxdifnidn            wq, wd
+    movsxdifnidn   lum_strideq, lum_strided
+    movsxdifnidn chrom_strideq, chrom_strided
+    movsxdifnidn   src_strideq, src_strided
+
+    mov     back_wq, wq ; saved so wq can be restored after each line
+    mov      whalfq, wq
+    shr      whalfq, 1     ; whalf = width / 2
+
+    lea srcq, [srcq + wq * 2] ; all four pointers are moved to the end of their line ...
+    add    ydstq, wq
+    add    udstq, whalfq
+    add    vdstq, whalfq
+
+.loop_line:
+    mov          xq, wq
+    mov       wtwoq, wq
+    add       wtwoq, wtwoq ; wtwo = width * 2
+
+    neg       wq ; ... and indexed with negative offsets that count up towards zero
+    neg    wtwoq
+    neg   whalfq
+
+    ;calc scalar loop count
+    and       xq, mmsize * 2 - 1 ; x = width % (mmsize * 2): pixels handled by the scalar loop
+    je .loop_simd ; NOTE(review): width == 0 also takes this jump and runs one SIMD iteration
+
+    .loop_scalar: ; one U Y V Y group (2 pixels, 4 source bytes) per iteration
+        mov             tmpb, [srcq + wtwoq + 0]
+        mov [udstq + whalfq], tmpb
+
+        mov             tmpb, [srcq + wtwoq + 1]
+        mov     [ydstq + wq], tmpb
+
+        mov             tmpb, [srcq + wtwoq + 2]
+        mov [vdstq + whalfq], tmpb
+
+        mov             tmpb, [srcq + wtwoq + 3]
+        mov [ydstq + wq + 1], tmpb
+
+        add      wq, 2
+        add   wtwoq, 4
+        add  whalfq, 1
+        sub      xq, 2
+        jg .loop_scalar
+
+    ; check if the simd loop is needed
+    cmp      wq, 0
+    jge .end_line ; offset reached zero: the scalar loop already covered the whole line
+
+    .loop_simd: ; 4 * mmsize source bytes = mmsize * 2 pixels per iteration
+        movu    m2, [srcq + wtwoq             ]
+        movu    m3, [srcq + wtwoq + mmsize    ]
+        movu    m4, [srcq + wtwoq + mmsize * 2]
+        movu    m5, [srcq + wtwoq + mmsize * 3]
+
+        ; extract y part 1
+        RSHIFT_COPY    m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m6, m1; YxYx YxYx...
+
+        RSHIFT_COPY    m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m7, m1 ; YxYx YxYx...
+
+        packuswb       m6, m7 ; YYYY YYYY...
+        movu [ydstq + wq], m6
+
+        ; extract y part 2
+        RSHIFT_COPY    m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m6, m1; YxYx YxYx...
+
+        RSHIFT_COPY    m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
+        pand           m7, m1 ; YxYx YxYx...
+
+        packuswb                m6, m7 ; YYYY YYYY...
+        movu [ydstq + wq + mmsize], m6
+
+        ; extract uv
+        pand       m2, m1   ; UxVx...
+        pand       m3, m1   ; UxVx...
+        pand       m4, m1   ; UxVx...
+        pand       m5, m1   ; UxVx...
+
+        packuswb   m2, m3   ; UVUV...
+        packuswb   m4, m5   ; UVUV...
+
+        ; U
+        pand       m6, m2, m1 ; UxUx...
+        pand       m7, m4, m1 ; UxUx...
+
+        packuswb m6, m7 ; UUUU
+        movu   [udstq + whalfq], m6
+
+
+        ; V
+        psrlw      m2, 8  ; VxVx...
+        psrlw      m4, 8  ; VxVx...
+        packuswb   m2, m4 ; VVVV
+        movu   [vdstq + whalfq], m2
+
+        add   whalfq, mmsize
+        add    wtwoq, mmsize * 4
+        add       wq, mmsize * 2
+        jl .loop_simd
+
+    .end_line:
+        add        srcq, src_strideq
+        add        ydstq, lum_strideq
+        add        udstq, chrom_strideq
+        add        vdstq, chrom_strideq
+
+        ;restore initial state of line variable
+        mov           wq, back_wq
+        mov          xq, wq ; NOTE(review): redundant - .loop_line reloads xq from wq
+        mov      whalfq, wq
+        shr      whalfq, 1     ; whalf = width / 2
+        sub              hd, 1
+        jg .loop_line ; repeat while rows remain
+
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse2
+UYVY_TO_YUV422
+
+INIT_XMM avx
+UYVY_TO_YUV422
+%endif
|