diff options
Diffstat (limited to 'libswscale/x86/swscale_template.c')
-rw-r--r-- | libswscale/x86/swscale_template.c | 142 |
1 files changed, 67 insertions, 75 deletions
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 50e4e4a..c934c78 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -118,7 +118,7 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; if (uDest) { - x86_reg uv_off = c->uv_off_byte >> 1; + x86_reg uv_off = c->uv_offx2 >> 1; dither_8to16(c, chrDither, 0); YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) dither_8to16(c, chrDither, 1); @@ -166,17 +166,12 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, "paddd %%mm2, %%mm6 \n\t"\ "paddd %%mm0, %%mm7 \n\t"\ " jnz 1b \n\t"\ - "psrad $16, %%mm4 \n\t"\ - "psrad $16, %%mm5 \n\t"\ - "psrad $16, %%mm6 \n\t"\ - "psrad $16, %%mm7 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ + "psrad $19, %%mm4 \n\t"\ + "psrad $19, %%mm5 \n\t"\ + "psrad $19, %%mm6 \n\t"\ + "psrad $19, %%mm7 \n\t"\ "packssdw %%mm5, %%mm4 \n\t"\ "packssdw %%mm7, %%mm6 \n\t"\ - "paddw %%mm0, %%mm4 \n\t"\ - "paddw %%mm0, %%mm6 \n\t"\ - "psraw $3, %%mm4 \n\t"\ - "psraw $3, %%mm6 \n\t"\ "packuswb %%mm6, %%mm4 \n\t"\ MOVNTQ(%%mm4, (%1, %3))\ "add $8, %3 \n\t"\ @@ -261,7 +256,7 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; if (uDest) { - x86_reg uv_off = c->uv_off_byte >> 1; + x86_reg uv_off = c->uv_offx2 >> 1; dither_8to32(c, chrDither, 0); YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) dither_8to32(c, chrDither, 1); @@ -324,7 +319,8 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, while (p--) { if (dst[p]) { - dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2); + int i; + for(i=0; i<8; i++) c->dither16[i] = (p == 2 || p == 3) ? lumDither[i] : chrDither[i]; __asm__ volatile( "mov %2, %%"REG_a" \n\t" "movq "DITHER16"+0(%3), %%mm6 \n\t" @@ -574,7 +570,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE @@ -607,7 +603,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX @@ -664,7 +660,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -688,7 +684,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -741,7 +737,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -765,7 +761,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -898,7 +894,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -922,7 +918,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -963,7 +959,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -984,7 +980,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -1002,10 +998,10 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_PX"("#c"), "#index" \n\t" \ + "add "UV_OFFx2"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ + "sub "UV_OFFx2"("#c"), "#index" \n\t" \ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ @@ -1100,8 +1096,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], : "%r8" ); #else - *(const uint16_t **)(&c->u_temp)=abuf0; - *(const uint16_t **)(&c->v_temp)=abuf1; + c->u_temp=(intptr_t)abuf0; + c->v_temp=(intptr_t)abuf1; __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -1233,10 +1229,10 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_PX"("#c"), "#index" \n\t" \ + "add "UV_OFFx2"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ + "sub "UV_OFFx2"("#c"), "#index" \n\t" \ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ @@ -1288,9 +1284,9 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], ".p2align 4 \n\t"\ "1: \n\t"\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFF_PX"("#c"), "#index" \n\t" \ + "add "UV_OFFx2"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ + "sub "UV_OFFx2"("#c"), "#index" \n\t" \ "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ @@ -1341,10 +1337,10 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_PX"("#c"), "#index" \n\t" \ + "add "UV_OFFx2"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ + "sub "UV_OFFx2"("#c"), "#index" \n\t" \ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ @@ -1608,9 +1604,9 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, ".p2align 4 \n\t"\ "1: \n\t"\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFF_PX"("#c"), "#index" \n\t" \ + "add "UV_OFFx2"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ + "sub "UV_OFFx2"("#c"), "#index" \n\t" \ "psraw $7, %%mm3 \n\t" \ "psraw $7, %%mm4 \n\t" \ "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ @@ -1626,10 +1622,10 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "1: \n\t"\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_PX"("#c"), "#index" \n\t" \ + "add "UV_OFFx2"("#c"), "#index" \n\t" \ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_PX"("#c"), "#index" \n\t" \ + "sub "UV_OFFx2"("#c"), "#index" \n\t" \ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "psrlw $8, %%mm3 \n\t" \ @@ -1678,7 +1674,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, #if !COMPILE_TEMPLATE_MMX2 //FIXME yuy2* can read up to 7 samples too much -static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, +static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { __asm__ volatile( @@ -1699,7 +1695,7 @@ static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, } static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, + const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { __asm__ volatile( @@ -1728,7 +1724,7 @@ static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, /* This is almost identical to the previous, end exists only because * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */ -static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, +static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { __asm__ volatile( @@ -1748,7 +1744,7 @@ static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, } static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, + const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { __asm__ volatile( @@ -1802,21 +1798,21 @@ static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2, } static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, + const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { RENAME(nvXXtoUV)(dstU, dstV, src1, width); } static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, + const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { RENAME(nvXXtoUV)(dstV, dstU, src1, width); } #endif /* !COMPILE_TEMPLATE_MMX2 */ -static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, +static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src, int width, enum PixelFormat srcFormat) { @@ -1857,32 +1853,31 @@ static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *s "paddd %%mm3, %%mm2 \n\t" "paddd %%mm4, %%mm0 \n\t" "paddd %%mm4, %%mm2 \n\t" - "psrad $15, %%mm0 \n\t" - "psrad $15, %%mm2 \n\t" + "psrad $9, %%mm0 \n\t" + "psrad $9, %%mm2 \n\t" "packssdw %%mm2, %%mm0 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "movd %%mm0, (%1, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" + "movq %%mm0, (%1, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" " js 1b \n\t" : "+r" (src) - : "r" (dst+width), "g" ((x86_reg)-width) + : "r" (dst+width), "g" ((x86_reg)-2*width) : "%"REG_a ); } -static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, +static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24); } -static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, +static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24); } -static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, +static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV, const uint8_t *src, int width, enum PixelFormat srcFormat) { @@ -1924,34 +1919,32 @@ static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, "paddd %%mm3, %%mm2 \n\t" "paddd %%mm3, %%mm1 \n\t" "paddd %%mm3, %%mm4 \n\t" - "psrad $15, %%mm0 \n\t" - "psrad $15, %%mm2 \n\t" - "psrad $15, %%mm1 \n\t" - "psrad $15, %%mm4 \n\t" + "psrad $9, %%mm0 \n\t" + "psrad $9, %%mm2 \n\t" + "psrad $9, %%mm1 \n\t" + "psrad $9, %%mm4 \n\t" "packssdw %%mm1, %%mm0 \n\t" "packssdw %%mm4, %%mm2 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm2, %%mm2 \n\t" - "movd %%mm0, (%1, %%"REG_a") \n\t" - "movd %%mm2, (%2, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" + "movq %%mm0, (%1, %%"REG_a") \n\t" + "movq %%mm2, (%2, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" " js 1b \n\t" : "+r" (src) - : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]) + : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]) : "%"REG_a ); } -static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, +static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV, + const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24); assert(src1 == src2); } -static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, +static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV, + const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { assert(src1==src2); @@ -2099,9 +2092,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) enum PixelFormat srcFormat = c->srcFormat, dstFormat = c->dstFormat; - if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && - dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) { - if (!(c->flags & SWS_BITEXACT)) { + if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12 + && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { if (c->flags & SWS_ACCURATE_RND) { c->yuv2yuv1 = RENAME(yuv2yuv1_ar ); c->yuv2yuvX = RENAME(yuv2yuvX_ar ); @@ -2116,7 +2108,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } } else { - c->yuv2yuv1 = RENAME(yuv2yuv1 ); + int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat); + c->yuv2yuv1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 ); c->yuv2yuvX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { @@ -2129,7 +2122,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } } - } if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case PIX_FMT_RGB32: |