From aeae5d537b35356a783e156fb218eb161d7eb93e Mon Sep 17 00:00:00 2001 From: Michael Niedermayer Date: Tue, 18 Feb 2003 19:22:34 +0000 Subject: optimize Originally committed as revision 9455 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc --- postproc/rgb2rgb.c | 5 +++ postproc/rgb2rgb_template.c | 74 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 2 deletions(-) (limited to 'postproc') diff --git a/postproc/rgb2rgb.c b/postproc/rgb2rgb.c index be21af0..c07301c 100644 --- a/postproc/rgb2rgb.c +++ b/postproc/rgb2rgb.c @@ -28,6 +28,11 @@ static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFU static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL; static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL; static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL; +static const uint64_t mask3216br __attribute__((aligned(8)))=0x00F800F800F800F8ULL; +static const uint64_t mask3216g __attribute__((aligned(8)))=0x0000FC000000FC00ULL; +static const uint64_t mask3215g __attribute__((aligned(8)))=0x0000F8000000F800ULL; +static const uint64_t mul3216 __attribute__((aligned(8))) = 0x2000000420000004ULL; +static const uint64_t mul3215 __attribute__((aligned(8))) = 0x2000000820000008ULL; static const uint64_t mask24b __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL; static const uint64_t mask24g __attribute__((aligned(8))) = 0xFF0000FF0000FF00ULL; static const uint64_t mask24r __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL; diff --git a/postproc/rgb2rgb_template.c b/postproc/rgb2rgb_template.c index e299b0c..01ba6ed 100644 --- a/postproc/rgb2rgb_template.c +++ b/postproc/rgb2rgb_template.c @@ -318,12 +318,46 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned uint16_t *d = (uint16_t *)dst; end = s + src_size; #ifdef HAVE_MMX + mm_end = end - 15; +#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) + asm volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + ".balign 16 \n\t" + "1: \n\t" + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $5, %%mm0 \n\t" + "pslld $11, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "addl $16, %1 \n\t" + "addl $8, %0 \n\t" + "cmpl %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) + ); +#else __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 15; while(s < mm_end) { __asm __volatile( @@ -359,6 +393,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 16; } +#endif __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); #endif @@ -441,12 +476,46 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned uint16_t *d = (uint16_t *)dst; end = s + src_size; #ifdef HAVE_MMX + mm_end = end - 15; +#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster) + asm volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + ".balign 16 \n\t" + "1: \n\t" + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $6, %%mm0 \n\t" + "pslld $10, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "addl $16, %1 \n\t" + "addl $8, %0 \n\t" + "cmpl %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) + ); +#else __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 15; while(s < mm_end) { __asm __volatile( @@ -482,6 +551,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 16; } +#endif __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); #endif -- cgit v1.1