From 38cfdc83f014323e5b37c72b0d2c1793d32c6d82 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer
Date: Fri, 3 Nov 2006 14:28:30 +0000
Subject: move luma tc0 related init into asm 5% faster filter_mb_fast() on P3

Originally committed as revision 6884 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/i386/h264dsp_mmx.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index 185989f..bb52d75 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -377,10 +377,7 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
-    uint64_t tmp0;
-    uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101;
-    // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask
-    uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff };
+    uint64_t tmp0[2];

     asm volatile(
         "movq (%1,%3), %%mm0 \n\t" //p1
@@ -388,8 +385,16 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph
         "movq (%2), %%mm2 \n\t" //q0
         "movq (%2,%3), %%mm3 \n\t" //q1
         H264_DEBLOCK_MASK(%6, %7)
-        "pand %5, %%mm7 \n\t"
-        "movq %%mm7, %0 \n\t"
+        "movd %5, %%mm4 \n\t"
+        "punpcklbw %%mm4, %%mm4 \n\t"
+        "punpcklwd %%mm4, %%mm4 \n\t"
+        "pcmpeqb %%mm3, %%mm3 \n\t"
+        "movq %%mm4, %%mm6 \n\t"
+        "pcmpgtb %%mm3, %%mm4 \n\t"
+        "movq %%mm6, 8+%0 \n\t"
+        "pand %%mm4, %%mm7 \n\t"
+        "movq %%mm7, %0 \n\t"

         /* filter p1 */
         "movq (%1), %%mm3 \n\t" //p2
@@ -397,7 +402,7 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph
         "pandn %%mm7, %%mm6 \n\t"
         "pcmpeqb %%mm7, %%mm6 \n\t"
         "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|