diff options
-rw-r--r-- | libswscale/x86/input.asm | 20 |
1 files changed, 19 insertions, 1 deletions
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 1f0f1ef..af9afca 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -347,8 +347,9 @@ cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table %if ARCH_X86_64 movsxd wq, wd %endif - lea srcq, [srcq+wq*4] add wq, wq + sub wq, mmsize - 1 + lea srcq, [srcq+wq*2] add dstq, wq neg wq mova m4, [rgb_Yrnd] @@ -373,6 +374,23 @@ cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table mova [dstq+wq], m0 add wq, mmsize jl .loop + sub wq, mmsize - 1 + jz .end + add srcq, 2*mmsize - 2 + add dstq, mmsize - 1 +.loop2: + movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] + pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] + pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] + paddd m0, m4 ; += rgb_Yrnd + paddd m0, m1 ; (dword) { Y[0-3] } + psrad m0, 9 + packssdw m0, m0 ; (word) { Y[0-7] } + movd [dstq+wq], m0 + add wq, 2 + jl .loop2 +.end: REP_RET %endif ; %0 == 3 %endmacro |