From 4a26fdd8520d5ad7ea6458854610521bbda880d5 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 27 Jul 2012 15:17:27 -0700
Subject: vp3: port x86 SIMD to cpuflags.

---
 libavcodec/x86/vp3dsp.asm | 94 +++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index af2f60c..5877520 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -102,8 +102,8 @@ SECTION .text
     mov  [r0+r3  -1], r2w
 %endmacro
 
-INIT_MMX
-cglobal vp3_v_loop_filter_mmx2, 3, 4
+INIT_MMX mmx2
+cglobal vp3_v_loop_filter, 3, 4
 %if ARCH_X86_64
     movsxd        r1, r1d
 %endif
@@ -120,7 +120,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4
     movq     [r0   ], m3
     RET
 
-cglobal vp3_h_loop_filter_mmx2, 3, 4
+cglobal vp3_h_loop_filter, 3, 4
 %if ARCH_X86_64
     movsxd        r1, r1d
 %endif
@@ -354,38 +354,6 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
     movq        I(2), m2
 %endmacro
 
-%macro VP3_IDCT_mmx 1
-    ; eax = quantized input
-    ; ebx = dequantizer matrix
-    ; ecx = IDCT constants
-    ;  M(I) = ecx + MaskOffset(0) + I * 8
-    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
-    ; edx = output
-    ; r0..r7 = mm0..mm7
-%define OC_8 [pw_8]
-%define C(x) [vp3_idct_data+16*(x-1)]
-
-    ; at this point, function has completed dequantization + dezigzag +
-    ; partial transposition; now do the idct itself
-%define I(x) [%1+16* x     ]
-%define J(x) [%1+16*(x-4)+8]
-    RowIDCT
-    Transpose
-
-%define I(x) [%1+16* x   +64]
-%define J(x) [%1+16*(x-4)+72]
-    RowIDCT
-    Transpose
-
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
-    ColumnIDCT
-
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
-    ColumnIDCT
-%endmacro
-
 %macro VP3_1D_IDCT_SSE2 0
     movdqa        m2, I(3)      ; xmm2 = i3
     movdqa        m6, C(3)      ; xmm6 = c3
@@ -501,7 +469,8 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
     movdqa      O(7), m%8
 %endmacro
 
-%macro VP3_IDCT_sse2 1
+%macro VP3_IDCT 1
+%if mmsize == 16
 %define I(x) [%1+16*x]
 %define O(x) [%1+16*x]
 %define C(x) [vp3_idct_data+16*(x-1)]
@@ -519,11 +488,42 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
 %define ADD(x)   paddsw x, [pw_8]
         VP3_1D_IDCT_SSE2
         PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
+%else ; mmsize == 8
+    ; eax = quantized input
+    ; ebx = dequantizer matrix
+    ; ecx = IDCT constants
+    ;  M(I) = ecx + MaskOffset(0) + I * 8
+    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
+    ; edx = output
+    ; r0..r7 = mm0..mm7
+%define OC_8 [pw_8]
+%define C(x) [vp3_idct_data+16*(x-1)]
+
+    ; at this point, function has completed dequantization + dezigzag +
+    ; partial transposition; now do the idct itself
+%define I(x) [%1+16* x     ]
+%define J(x) [%1+16*(x-4)+8]
+    RowIDCT
+    Transpose
+
+%define I(x) [%1+16* x   +64]
+%define J(x) [%1+16*(x-4)+72]
+    RowIDCT
+    Transpose
+
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
+    ColumnIDCT
+
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
+    ColumnIDCT
+%endif ; mmsize == 16/8
 %endmacro
 
-%macro vp3_idct_funcs 1
-cglobal vp3_idct_put_%1, 3, 4, 9
-    VP3_IDCT_%1   r2
+%macro vp3_idct_funcs 0
+cglobal vp3_idct_put, 3, 4, 9
+    VP3_IDCT      r2
 
     movsxdifnidn  r1, r1d
     mova          m4, [pb_80]
@@ -565,8 +565,8 @@ cglobal vp3_idct_put_%1, 3, 4, 9
 %endrep
     RET
 
-cglobal vp3_idct_add_%1, 3, 4, 9
-    VP3_IDCT_%1   r2
+cglobal vp3_idct_add, 3, 4, 9
+    VP3_IDCT      r2
 
     mov           r3, 4
     pxor          m4, m4
@@ -607,10 +607,10 @@ cglobal vp3_idct_add_%1, 3, 4, 9
     RET
 %endmacro
 
-INIT_MMX
-vp3_idct_funcs mmx
-INIT_XMM
-vp3_idct_funcs sse2
+INIT_MMX mmx
+vp3_idct_funcs
+INIT_XMM sse2
+vp3_idct_funcs
 
 %macro DC_ADD 0
     movq          m2, [r0     ]
@@ -631,8 +631,8 @@ vp3_idct_funcs sse2
     movq   [r0+r3  ], m5
 %endmacro
 
-INIT_MMX
-cglobal vp3_idct_dc_add_mmx2, 3, 4
+INIT_MMX mmx2
+cglobal vp3_idct_dc_add, 3, 4
 %if ARCH_X86_64
     movsxd        r1, r1d
 %endif
-- 
cgit v1.1