Diffstat (limited to 'libavcodec/x86/mpegvideoencdsp.asm')
-rw-r--r-- | libavcodec/x86/mpegvideoencdsp.asm | 151
1 file changed, 105 insertions, 46 deletions
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee7..aec73f8 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION .text
+SECTION_RODATA
 
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
 
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
     movsxdifnidn r1, r1d
-    mov          r2, r1
-    neg          r2
-    shl          r2, 4
-    sub          r0, r2
-    pxor         m7, m7
-    pxor         m6, m6
+    mov          r2, %1
+%if mmsize == 16
+    lea          r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+    pxor         m5, m5
+%endif
+    pxor         m4, m4
 .loop:
-    mova         m0, [r0+r2+0]
-    mova         m1, [r0+r2+0]
-    mova         m2, [r0+r2+8]
-    mova         m3, [r0+r2+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+%if cpuflag(xop)
+    vphaddubq    m0, [r0]
+    vphaddubq    m1, [r0+r1]
+    vphaddubq    m2, [r0+r1*2]
+    vphaddubq    m3, [r0+r3]
+%else
+    mova         m0, [r0]
+%if mmsize == 8
+    mova         m1, [r0+8]
+%if cpuflag(mmxext)
+    mova         m2, [r0+r1]
+    mova         m3, [r0+r1+8]
+%endif
+%else ; sse2
+    mova         m1, [r0+r1]
+    mova         m2, [r0+r1*2]
+    mova         m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+    psadbw       m0, m5
+    psadbw       m1, m5
+    psadbw       m2, m5
+    psadbw       m3, m5
+%else ; mmx
+    punpckhbw    m2, m0, m5
+    punpcklbw    m0, m5
+    punpckhbw    m3, m1, m5
+    punpcklbw    m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
     paddw        m1, m0
     paddw        m3, m2
     paddw        m3, m1
-    paddw        m6, m3
-    add          r2, r1
-    js .loop
-    mova         m5, m6
-    psrlq        m6, 32
-    paddw        m6, m5
-    mova         m5, m6
-    psrlq        m6, 16
-    paddw        m6, m5
-    movd        eax, m6
-    and         eax, 0xffff
+    paddw        m4, m3
+%if cpuflag(mmxext)
+    lea          r0, [r0+r1*%3]
+%else
+    add          r0, r1
+%endif
+    dec          r2
+    jne .loop
+%if mmsize == 16
+    pshufd       m0, m4, q0032
+    paddd        m4, m0
+%elif notcpuflag(mmxext)
+    HADDW        m4, m5
+%endif
+    movd        eax, m4
     RET
+%endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16  8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16  4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16  4, 4, 4
+%endif
+
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov          r2, 16
+    mov          r2, %2
    pxor         m0, m0
-    pxor         m7, m7
+    pxor         m5, m5
 .loop:
     mova         m2, [r0+0]
+%if mmsize == 8
     mova         m3, [r0+8]
-    mova         m1, m2
-    punpckhbw    m1, m0
+%else
+    mova         m3, [r0+r1]
+%endif
+    punpckhbw    m1, m2, m0
     punpcklbw    m2, m0
-    mova         m4, m3
-    punpckhbw    m3, m0
-    punpcklbw    m4, m0
+    punpckhbw    m4, m3, m0
+    punpcklbw    m3, m0
     pmaddwd      m1, m1
     pmaddwd      m2, m2
     pmaddwd      m3, m3
     pmaddwd      m4, m4
     paddd        m2, m1
     paddd        m4, m3
-    paddd        m7, m2
+    paddd        m5, m2
+    paddd        m5, m4
+%if mmsize == 8
     add          r0, r1
-    paddd        m7, m4
+%else
+    lea          r0, [r0+r1*2]
+%endif
     dec          r2
     jne .loop
-    mova         m1, m7
-    psrlq        m7, 32
-    paddd        m1, m7
-    movd        eax, m1
+    HADDD        m5, m1
+    movd        eax, m5
     RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
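For context, the two routines rewritten by this patch only accumulate per-block statistics over a 16x16 block of 8-bit pixels: pix_sum16 returns the sum of all 256 samples and pix_norm1 the sum of their squares. A minimal plain-C sketch of the expected results follows (reference only; the names pix_sum16_ref and pix_norm1_ref are illustrative, not FFmpeg's actual C fallbacks):

    #include <stdint.h>

    /* Sum of all 256 samples of a 16x16 block. */
    static int pix_sum16_ref(const uint8_t *pix, int line_size)
    {
        int sum = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++)
                sum += pix[x];
            pix += line_size;           /* step to the next row */
        }
        return sum;
    }

    /* Sum of the squares of all 256 samples of a 16x16 block. */
    static int pix_norm1_ref(const uint8_t *pix, int line_size)
    {
        int sum = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++)
                sum += pix[x] * pix[x];
            pix += line_size;
        }
        return sum;
    }

In the patch, the mmxext and sse2 paths of pix_sum16 obtain the per-row sums with psadbw against a zeroed register instead of widening with punpcklbw/punpckhbw, and the xop path uses vphaddubq, which is why the final horizontal reduction differs between instruction sets.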