From 38cfdc83f014323e5b37c72b0d2c1793d32c6d82 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Fri, 3 Nov 2006 14:28:30 +0000
Subject: move luma tc0-related init into asm; 5% faster filter_mb_fast() on P3

Originally committed as revision 6884 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/i386/h264dsp_mmx.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index 185989f..bb52d75 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -377,10 +377,7 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 
 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
-    uint64_t tmp0;
-    uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101;
-    // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask
-    uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff };
+    uint64_t tmp0[2];
 
     asm volatile(
         "movq    (%1,%3), %%mm0    \n\t" //p1
@@ -388,8 +385,16 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph
         "movq    (%2),    %%mm2    \n\t" //q0
         "movq    (%2,%3), %%mm3    \n\t" //q1
         H264_DEBLOCK_MASK(%6, %7)
-        "pand     %5,     %%mm7    \n\t"
-        "movq     %%mm7,  %0       \n\t"
+
+        "movd      %5,    %%mm4    \n\t"
+        "punpcklbw %%mm4, %%mm4    \n\t"
+        "punpcklwd %%mm4, %%mm4    \n\t"
+        "pcmpeqb   %%mm3, %%mm3    \n\t"
+        "movq      %%mm4, %%mm6    \n\t"
+        "pcmpgtb   %%mm3, %%mm4    \n\t"
+        "movq      %%mm6, 8+%0     \n\t"
+        "pand      %%mm4, %%mm7    \n\t"
+        "movq      %%mm7, %0       \n\t"
 
         /* filter p1 */
         "movq     (%1),   %%mm3    \n\t" //p2
@@ -397,7 +402,7 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph
         "pandn    %%mm7,  %%mm6    \n\t"
         "pcmpeqb  %%mm7,  %%mm6    \n\t"
         "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
-        "pshufw  $80, %4, %%mm4    \n\t"
+        "movq     8+%0,   %%mm4    \n\t"
         "pand     %%mm7,  %%mm4    \n\t" // mask & tc0
         "movq     %%mm4,  %%mm7    \n\t"
         "psubb    %%mm6,  %%mm7    \n\t"
@@ -410,21 +415,21 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph
         "pandn    %0,     %%mm6    \n\t"
         "pcmpeqb  %0,     %%mm6    \n\t"
         "pand     %0,     %%mm6    \n\t"
-        "pshufw  $80, %4, %%mm5    \n\t"
+        "movq     8+%0,   %%mm5    \n\t"
         "pand     %%mm6,  %%mm5    \n\t"
         "psubb    %%mm6,  %%mm7    \n\t"
         "movq    (%2,%3), %%mm3    \n\t"
         H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
 
         /* filter p0, q0 */
-        H264_DEBLOCK_P0_Q0(%8, %9)
+        H264_DEBLOCK_P0_Q0(%8, unused)
         "movq      %%mm1, (%1,%3,2) \n\t"
         "movq      %%mm2, (%2)      \n\t"
 
-        : "=m"(tmp0)
+        : "=m"(*tmp0)
         : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
-          "m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1),
-          "m"(mm_bone), "m"(ff_pb_3F)
+          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
+          "m"(mm_bone)
     );
 }
 
-- 
cgit v1.1