Diffstat (limited to 'libswscale/x86/input.asm')
-rw-r--r-- | libswscale/x86/input.asm | 122
1 file changed, 62 insertions, 60 deletions
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 66d8845..0124c66 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -36,8 +36,8 @@ SECTION_RODATA
 %define GV 0xD0E3
 %define BV 0xF6E4
 
-rgb_Yrnd: times 4 dd 0x84000 ; 16.5 << 15
-rgb_UVrnd: times 4 dd 0x404000 ; 128.5 << 15
+rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15
+rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15
 bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
 bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
 rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
@@ -83,7 +83,7 @@ SECTION .text
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
 %if mmsize == 8
     mova m5, [%2_Ycoeff_12x4]
     mova m6, [%2_Ycoeff_3x56]
@@ -115,6 +115,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 %if ARCH_X86_64
     movsxd wq, wd
 %endif
+    add wq, wq
     add dstq, wq
     neg wq
 %if notcpuflag(ssse3)
@@ -158,12 +159,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
     paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
     paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
     paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
-    psrad m0, 15
-    psrad m2, 15
+    psrad m0, 9
+    psrad m2, 9
     packssdw m0, m2 ; (word) { Y[0-7] }
-    packuswb m0, m0 ; (byte) { Y[0-7] }
-    movh [dstq+wq], m0
-    add wq, mmsize / 2
+    mova [dstq+wq], m0
+    add wq, mmsize
     jl .loop
     REP_RET
 %endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
@@ -172,7 +172,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova m8, [%2_Ucoeff_12x4]
     mova m9, [%2_Ucoeff_3x56]
@@ -203,10 +203,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
 %endif ; x86-32/64
 %endif ; cpuflag(ssse3)
 %if ARCH_X86_64
-    movsxd wq, dword r4m
+    movsxd wq, dword r5m
 %else ; x86-32
-    mov wq, r4m
+    mov wq, r5m
 %endif
+    add wq, wq
     add dstUq, wq
     add dstVq, wq
     neg wq
@@ -264,23 +265,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
     paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
     paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
     paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
-    psrad m0, 15
-    psrad m2, 15
-    psrad m1, 15
-    psrad m4, 15
+    psrad m0, 9
+    psrad m2, 9
+    psrad m1, 9
+    psrad m4, 9
     packssdw m0, m1 ; (word) { U[0-7] }
     packssdw m2, m4 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb m0, m0 ; (byte) { U[0-3] }
-    packuswb m2, m2 ; (byte) { V[0-3] }
-    movh [dstUq+wq], m0
-    movh [dstVq+wq], m2
+    mova [dstUq+wq], m0
+    mova [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
-    movh [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova [dstUq+wq], m0
+    mova [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add wq, mmsize / 2
+    add wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -306,13 +304,15 @@ RGB24_FUNCS 10, 12
 INIT_XMM ssse3
 RGB24_FUNCS 11, 13
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 RGB24_FUNCS 11, 13
+%endif
 
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
     mova m5, [rgba_Ycoeff_%2%4]
     mova m6, [rgba_Ycoeff_%3%5]
 %if %0 == 6
@@ -323,6 +323,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     movsxd wq, wd
 %endif
     lea srcq, [srcq+wq*4]
+    add wq, wq
     add dstq, wq
     neg wq
     mova m4, [rgb_Yrnd]
@@ -330,8 +331,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     psrlw m7, 8 ; (word) { 0x00ff } x4
 .loop:
     ; FIXME check alignment and use mova
-    movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
     pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
     pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
@@ -341,12 +342,11 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     paddd m2, m4 ; += rgb_Yrnd
     paddd m0, m1 ; (dword) { Y[0-3] }
     paddd m2, m3 ; (dword) { Y[4-7] }
-    psrad m0, 15
-    psrad m2, 15
+    psrad m0, 9
+    psrad m2, 9
     packssdw m0, m2 ; (word) { Y[0-7] }
-    packuswb m0, m0 ; (byte) { Y[0-7] }
-    movh [dstq+wq], m0
-    add wq, mmsize / 2
+    mova [dstq+wq], m0
+    add wq, mmsize
     jl .loop
     REP_RET
 %endif ; %0 == 3
@@ -355,7 +355,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova m8, [rgba_Ucoeff_%2%4]
     mova m9, [rgba_Ucoeff_%3%5]
@@ -376,21 +376,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
 %else ; ARCH_X86_64 && %0 == 6
 .body:
 %if ARCH_X86_64
-    movsxd wq, dword r4m
+    movsxd wq, dword r5m
 %else ; x86-32
-    mov wq, r4m
+    mov wq, r5m
 %endif
+    add wq, wq
     add dstUq, wq
     add dstVq, wq
-    lea srcq, [srcq+wq*4]
+    lea srcq, [srcq+wq*2]
     neg wq
     pcmpeqb m7, m7
     psrlw m7, 8 ; (word) { 0x00ff } x4
     mova m6, [rgb_UVrnd]
 .loop:
     ; FIXME check alignment and use mova
-    movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
     pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
     pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
@@ -406,25 +407,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
     pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
     paddd m3, m6 ; += rgb_UVrnd
     paddd m5, m6 ; += rgb_UVrnd
-    psrad m0, 15
+    psrad m0, 9
     paddd m1, m3 ; (dword) { V[4-7] }
     paddd m4, m5 ; (dword) { U[4-7] }
-    psrad m2, 15
-    psrad m4, 15
-    psrad m1, 15
+    psrad m2, 9
+    psrad m4, 9
+    psrad m1, 9
     packssdw m0, m4 ; (word) { U[0-7] }
     packssdw m2, m1 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb m0, m0 ; (byte) { U[0-7] }
-    packuswb m2, m2 ; (byte) { V[0-7] }
-    movh [dstUq+wq], m0
-    movh [dstVq+wq], m2
+    mova [dstUq+wq], m0
+    mova [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
-    movh [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova [dstUq+wq], m0
+    mova [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add wq, mmsize / 2
+    add wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -452,8 +450,10 @@ RGB32_FUNCS 0, 0
 INIT_XMM sse2
 RGB32_FUNCS 8, 12
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 RGB32_FUNCS 8, 12
+%endif
 
 ;-----------------------------------------------------------------------------
 ; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
@@ -490,7 +490,7 @@ RGB32_FUNCS 8, 12
 ; will be the same (i.e. YUYV+AVX), and thus we don't need to
 ; split the loop in an aligned and unaligned case
 %macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 3, 3, %1, dst, src, w
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
 %if ARCH_X86_64
     movsxd wq, wd
 %endif
@@ -560,11 +560,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
 ; will be the same (i.e. UYVY+AVX), and thus we don't need to
 ; split the loop in an aligned and unaligned case
 %macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd wq, dword r4m
+    movsxd wq, dword r5m
 %else ; x86-32
-    mov wq, r4m
+    mov wq, r5m
 %endif
     add dstUq, wq
     add dstVq, wq
@@ -594,8 +594,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 .loop_%1:
     mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
     mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
-    pand m2, m0, m4 ; (word) { U0, U1, ..., U7 }
-    pand m3, m1, m4 ; (word) { U8, U9, ..., U15 }
+    pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
+    pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
     psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
     psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
     packuswb m2, m3 ; (byte) { U0, ..., U15 }
@@ -615,11 +615,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 ; %1 = nr. of XMM registers
 ; %2 = nv12 or nv21
 %macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd wq, dword r4m
+    movsxd wq, dword r5m
 %else ; x86-32
-    mov wq, r4m
+    mov wq, r5m
 %endif
     add dstUq, wq
     add dstVq, wq
@@ -627,8 +627,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
     test srcq, 15
 %endif
     lea srcq, [srcq+wq*2]
-    pcmpeqb m4, m4 ; (byte) { 0xff } x 16
-    psrlw m4, 8 ; (word) { 0x00ff } x 8
+    pcmpeqb m5, m5 ; (byte) { 0xff } x 16
+    psrlw m5, 8 ; (word) { 0x00ff } x 8
 %if mmsize == 16
     jnz .loop_u_start
     neg wq
@@ -660,6 +660,7 @@ YUYV_TO_UV_FN 3, uyvy
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
 ; that's not faster in practice
@@ -667,3 +668,4 @@ YUYV_TO_UV_FN 3, yuyv
 YUYV_TO_UV_FN 3, uyvy, 1
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
+%endif
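The heart of the change above: the RGB-to-Y/UV converters no longer pack their results down to bytes (packuswb + movh) but store full 16-bit words (mova), which is why w is doubled up front (add wq, wq) and the loop advances by a whole mmsize per iteration. The right shift drops from 15 to 9 so that six extra fractional bits survive into the stored word, and the rounding constants change to match: despite the unchanged "16.5 << 15" / "128.5 << 15" comments, the new values decompose as 0x80100 = (16 << 15) + (1 << 8) and 0x400100 = (128 << 15) + (1 << 8), i.e. the old offsets plus half an LSB of the new shift. The extra u1/u2/u3/unused parameters, and w moving from r4m to r5m, mirror a correspondingly widened prototype for these input functions on the C side. As a minimal scalar sketch of the new luma path, assuming the 15-bit coefficient values from the %define table at the top of input.asm (the helper name is illustrative, not an FFmpeg API):

#include <stdint.h>

/* Luma coefficients as 15-bit fixed point, assumed from the %define
 * table at the top of input.asm (BT.601, limited range). */
#define RY 0x20DE
#define GY 0x4087
#define BY 0x0C88

/* Hypothetical scalar model of the converted SIMD loop: one RGB24
 * pixel to one 16-bit Y sample. */
static uint16_t rgb24_to_y_scalar(uint8_t r, uint8_t g, uint8_t b)
{
    /* 0x80100 = (16 << 15) + (1 << 8): the luma offset of 16 plus
     * half an output LSB, so the >> 9 below rounds to nearest. */
    int32_t y = r * RY + g * GY + b * BY + 0x80100;

    /* Shifting by 9 instead of 15 keeps 6 fractional bits: a word in
     * roughly [16 << 6, 235 << 6] instead of a byte in [16, 235]. */
    return (uint16_t)(y >> 9);
}

Note that the saturation packuswb used to provide is no longer needed: after the >> 9 the values peak around 235 << 6, well inside the signed word range that packssdw clamps to.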
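The chroma path is the same sketch with a bias of 128 instead of 16 and signed coefficients that sum to roughly zero, which is what pins gray input to the 128 << 6 midpoint. Again hedged: the U coefficients below are assumed from the same table (GV 0xD0E3 and BV 0xF6E4, visible in the context above, are such entries reinterpreted as signed words).

#include <stdint.h>

/* Chroma-U coefficients as signed 15-bit fixed point, assumed from
 * the same %define table (0xECFF, 0xDAC8, 0x3838 as signed words). */
#define RU (-4865)   /* 0xECFF */
#define GU (-9528)   /* 0xDAC8 */
#define BU  14392    /* 0x3838 */

/* Hypothetical scalar model: one RGB24 pixel to one 16-bit U sample. */
static uint16_t rgb24_to_u_scalar(uint8_t r, uint8_t g, uint8_t b)
{
    /* 0x400100 = (128 << 15) + (1 << 8): the chroma bias of 128 plus
     * half an output LSB for round-to-nearest at the >> 9 below. */
    int32_t u = r * RU + g * GU + b * BU + 0x400100;

    return (uint16_t)(u >> 9); /* word in roughly [16 << 6, 240 << 6] */
}

For gray input (r == g == b) the coefficient sum of -1 perturbs the numerator by at most 255, less than one output LSB before the shift, so the result stays pinned at the midpoint.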