diff options
Diffstat (limited to 'libswscale')
40 files changed, 5729 insertions, 2214 deletions
diff --git a/libswscale/Makefile b/libswscale/Makefile index 3e8614d..067e2b9 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -1,9 +1,12 @@ +include $(SUBDIR)../config.mak + NAME = swscale HEADERS = swscale.h \ version.h \ -OBJS = input.o \ +OBJS = hscale_fast_bilinear.o \ + input.o \ options.o \ output.o \ rgb2rgb.o \ @@ -12,5 +15,8 @@ OBJS = input.o \ utils.o \ yuv2rgb.o \ +# Windows resource file +SLIBOBJS-$(HAVE_GNU_WINDRES) += swscaleres.o + TESTPROGS = colorspace \ swscale \ diff --git a/libswscale/arm/Makefile b/libswscale/arm/Makefile new file mode 100644 index 0000000..8b5a97b --- /dev/null +++ b/libswscale/arm/Makefile @@ -0,0 +1,4 @@ +# OBJS += arm/swscale_unscaled.o + +# NEON-OBJS += arm/rgb2yuv_neon_32.o +# NEON-OBJS += arm/rgb2yuv_neon_16.o diff --git a/libswscale/arm/rgb2yuv_neon_16.S b/libswscale/arm/rgb2yuv_neon_16.S new file mode 100644 index 0000000..601bc9a --- /dev/null +++ b/libswscale/arm/rgb2yuv_neon_16.S @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "rgb2yuv_neon_common.S" + +/* downsampled R16G16B16 x8 */ +alias_qw r16x8, q7 +alias_qw g16x8, q8 +alias_qw b16x8, q9 + +alias n16x16_l, q11 +alias n16x16_h, q12 + +alias y16x16_l, q13 +alias y16x16_h, q14 + +alias_qw y8x16, q15 + +.macro init src + vld3.i32 {q13_l, q14_l, q15_l}, [\src]! + vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] + vrshrn.i32 CO_R, q13, #7 + vrshrn.i32 CO_G, q14, #7 + vrshrn.i32 CO_B, q15, #7 + + vmov.u8 BIAS_Y, #16 + vmov.u8 BIAS_U, #128 +.endm + + +.macro compute_y_16x1_step action, s8x16, coeff + vmovl.u8 n16x16_l, \s8x16\()_l + vmovl.u8 n16x16_h, \s8x16\()_h + + \action y16x16_l, n16x16_l, \coeff + \action y16x16_h, n16x16_h, \coeff +.endm + +.macro compute_y_16x1 + compute_y_16x1_step vmul, r8x16, CO_RY + compute_y_16x1_step vmla, g8x16, CO_GY + compute_y_16x1_step vmla, b8x16, CO_BY + + vrshrn.i16 y8x16_l, y16x16_l, #8 + vrshrn.i16 y8x16_h, y16x16_h, #8 + + vadd.u8 y8x16, y8x16, BIAS_Y +.endm + +alias c16x8, q15 +alias_qw c8x8x2, q10 + + +.macro compute_chroma_8x1 c, C + vmul c16x8, r16x8, CO_R\C + vmla c16x8, g16x8, CO_G\C + vmla c16x8, b16x8, CO_B\C + + vrshrn.i16 \c\()8x8, c16x8, #8 + vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C +.endm + + loop_420sp rgbx, nv12, init, kernel_420_16x2, 16 diff --git a/libswscale/arm/rgb2yuv_neon_32.S b/libswscale/arm/rgb2yuv_neon_32.S new file mode 100644 index 0000000..f51a5f1 --- /dev/null +++ b/libswscale/arm/rgb2yuv_neon_32.S @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "rgb2yuv_neon_common.S" + +/* downsampled R16G16B16 x8 */ +alias_qw r16x8, q7 +alias_qw g16x8, q8 +alias_qw b16x8, q9 + +alias n16x16_o, q11 +alias n16x16_ol, q11_l +alias n16x16_oh, q11_h + +alias y32x16_el, q12 +alias y32x16_eh, q13 +alias y32x16_ol, q14 +alias y32x16_oh, q15 + +alias y16x16_e, q12 +alias y16x16_el, q12_l +alias y16x16_eh, q12_h +alias y16x16_o, q13 +alias y16x16_ol, q13_l +alias y16x16_oh, q13_h + + +alias y8x16, y16x16_e + + +.macro init src + // load s32x3x3, narrow to s16x3x3 + vld3.i32 {q13_l, q14_l, q15_l}, [\src]! + vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] + + vmovn.i32 CO_R, q13 + vmovn.i32 CO_G, q14 + vmovn.i32 CO_B, q15 + + vmov.u8 BIAS_Y, #16 + vmov.u8 BIAS_U, #128 +.endm + + +.macro compute_y_16x1_step action, s8x16, coeff + vmov.u8 n16x16_o, #0 + vtrn.u8 \s8x16, n16x16_o + + \action y32x16_el, \s8x16\()_l, \coeff + \action y32x16_eh, \s8x16\()_h, \coeff + \action y32x16_ol, n16x16_ol, \coeff + \action y32x16_oh, n16x16_oh, \coeff +.endm + +/* + * in: r8x16, g8x16, b8x16 + * out: y8x16 + * clobber: q11-q15, r8x16, g8x16, b8x16 + */ +.macro compute_y_16x1 + compute_y_16x1_step vmull, r8x16, CO_RY + compute_y_16x1_step vmlal, g8x16, CO_GY + compute_y_16x1_step vmlal, b8x16, CO_BY + + vrshrn.i32 y16x16_el, y32x16_el, #15 + vrshrn.i32 y16x16_eh, y32x16_eh, #15 + vrshrn.i32 y16x16_ol, y32x16_ol, #15 + vrshrn.i32 y16x16_oh, y32x16_oh, #15 + + vtrn.8 y16x16_e, y16x16_o + vadd.u8 y8x16, y8x16, BIAS_Y +.endm + +alias c32x8_l, q14 +alias c32x8_h, q15 + +alias_qw c16x8, q13 +alias_qw c8x8x2, q10 + +.macro compute_chroma_8x1_step action, s16x8, coeff + \action c32x8_l, \s16x8\()_l, \coeff + \action c32x8_h, \s16x8\()_h, \coeff +.endm + +/* + * in: r16x8, g16x8, b16x8 + * out: c8x8 + * clobber: q14-q15 + */ +.macro compute_chroma_8x1 c, C + compute_chroma_8x1_step vmull, r16x8, CO_R\C + compute_chroma_8x1_step vmlal, g16x8, CO_G\C + compute_chroma_8x1_step vmlal, b16x8, CO_B\C + + vrshrn.i32 c16x8_l, c32x8_l, #15 + vrshrn.i32 c16x8_h, c32x8_h, #15 + vmovn.i16 \c\()8x8, c16x8 + vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C +.endm + + + loop_420sp rgbx, nv12, init, kernel_420_16x2, 32 diff --git a/libswscale/arm/rgb2yuv_neon_common.S b/libswscale/arm/rgb2yuv_neon_common.S new file mode 100644 index 0000000..30bcecd --- /dev/null +++ b/libswscale/arm/rgb2yuv_neon_common.S @@ -0,0 +1,291 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro alias name, tgt, set=1 +.if \set != 0 + \name .req \tgt +.else + .unreq \name +.endif +.endm + +.altmacro + +.macro alias_dw_all qw, dw_l, dw_h + alias q\qw\()_l, d\dw_l + alias q\qw\()_h, d\dw_h + .if \qw < 15 + alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2) + .endif +.endm + +alias_dw_all 0, 0, 1 + +.noaltmacro + +.macro alias_qw name, qw, set=1 + alias \name\(), \qw, \set + alias \name\()_l, \qw\()_l, \set + alias \name\()_h, \qw\()_h, \set +.endm + +.macro prologue + push {r4-r12, lr} + vpush {q4-q7} +.endm + +.macro epilogue + vpop {q4-q7} + pop {r4-r12, pc} +.endm + +.macro load_arg reg, ix + ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)] +.endm + + +/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma + * int width, int height, + * int y_stride, int c_stride, int src_stride, + * int32_t coeff_table[9]); + */ +.macro alias_loop_420sp set=1 + alias src, r0, \set + alias src0, src, \set + alias y, r1, \set + alias y0, y, \set + alias chroma, r2, \set + alias width, r3, \set + alias header, width, \set + + alias height, r4, \set + alias y_stride, r5, \set + alias c_stride, r6, \set + alias c_padding, c_stride, \set + alias src_stride, r7, \set + + alias y0_end, r8, \set + + alias src_padding,r9, \set + alias y_padding, r10, \set + + alias src1, r11, \set + alias y1, r12, \set + + alias coeff_table,r12, \set +.endm + + +.macro loop_420sp s_fmt, d_fmt, init, kernel, precision + +function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1 + prologue + + alias_loop_420sp + + load_arg height, 4 + load_arg y_stride, 5 + load_arg c_stride, 6 + load_arg src_stride, 7 + load_arg coeff_table, 8 + + \init coeff_table + + sub y_padding, y_stride, width + sub c_padding, c_stride, width + sub src_padding, src_stride, width, LSL #2 + + add y0_end, y0, width + and header, width, #15 + + add y1, y0, y_stride + add src1, src0, src_stride + +0: + cmp header, #0 + beq 1f + + \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header + +1: + \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma + + cmp y0, y0_end + blt 1b +2: + add y0, y1, y_padding + add y0_end, y1, y_stride + add chroma, chroma, c_padding + add src0, src1, src_padding + + add y1, y0, y_stride + add src1, src0, src_stride + + subs height, height, #2 + + bgt 0b + + epilogue + + alias_loop_420sp 0 + +endfunc +.endm + +.macro downsample + vpaddl.u8 r16x8, r8x16 + vpaddl.u8 g16x8, g8x16 + vpaddl.u8 b16x8, b8x16 +.endm + + +/* acculumate and right shift by 2 */ +.macro downsample_ars2 + vpadal.u8 r16x8, r8x16 + vpadal.u8 g16x8, g8x16 + vpadal.u8 b16x8, b8x16 + + vrshr.u16 r16x8, r16x8, #2 + vrshr.u16 g16x8, g16x8, #2 + vrshr.u16 b16x8, b16x8, #2 +.endm + +.macro store_y8_16x1 dst, count +.ifc "\count","" + vstmia \dst!, {y8x16} +.else + vstmia \dst, {y8x16} + add \dst, \dst, \count +.endif +.endm + +.macro store_chroma_nv12_8x1 dst, count +.ifc "\count","" + vst2.i8 {u8x8, v8x8}, [\dst]! +.else + vst2.i8 {u8x8, v8x8}, [\dst], \count +.endif +.endm + +.macro store_chroma_nv21_8x1 dst, count +.ifc "\count","" + vst2.i8 {v8x8, u8x8}, [\dst]! +.else + vst2.i8 {v8x8, u8x8}, [\dst], \count +.endif +.endm + +.macro load_8888_16x1 a, b, c, d, src, count +.ifc "\count","" + vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! + vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]! +.else + vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! + vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src] + sub \src, \src, #32 + add \src, \src, \count, LSL #2 +.endif +.endm + +.macro load_rgbx_16x1 src, count + load_8888_16x1 r, g, b, x, \src, \count +.endm + +.macro load_bgrx_16x1 src, count + load_8888_16x1 b, g, r, x, \src, \count +.endm + +.macro alias_src_rgbx set=1 + alias_src_8888 r, g, b, x, \set +.endm + +.macro alias_src_bgrx set=1 + alias_src_8888 b, g, r, x, \set +.endm + +.macro alias_dst_nv12 set=1 + alias u8x8, c8x8x2_l, \set + alias v8x8, c8x8x2_h, \set +.endm + +.macro alias_dst_nv21 set=1 + alias v8x8, c8x8x2_l, \set + alias u8x8, c8x8x2_h, \set +.endm + + +// common aliases + +alias CO_R d0 +CO_RY .dn d0.s16[0] +CO_RU .dn d0.s16[1] +CO_RV .dn d0.s16[2] + +alias CO_G d1 +CO_GY .dn d1.s16[0] +CO_GU .dn d1.s16[1] +CO_GV .dn d1.s16[2] + +alias CO_B d2 +CO_BY .dn d2.s16[0] +CO_BU .dn d2.s16[1] +CO_BV .dn d2.s16[2] + +alias BIAS_U, d3 +alias BIAS_V, BIAS_U + +alias BIAS_Y, q2 + + +/* q3-q6 R8G8B8X8 x16 */ + +.macro alias_src_8888 a, b, c, d, set + alias_qw \a\()8x16, q3, \set + alias_qw \b\()8x16, q4, \set + alias_qw \c\()8x16, q5, \set + alias_qw \d\()8x16, q6, \set +.endm + +.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count + alias_src_\rgb_fmt + alias_dst_\yuv_fmt + + load_\rgb_fmt\()_16x1 \rgb0, \count + + downsample + compute_y_16x1 + store_y8_16x1 \y0, \count + + + load_\rgb_fmt\()_16x1 \rgb1, \count + downsample_ars2 + compute_y_16x1 + store_y8_16x1 \y1, \count + + compute_chroma_8x1 u, U + compute_chroma_8x1 v, V + + store_chroma_\yuv_fmt\()_8x1 \chroma, \count + + alias_dst_\yuv_fmt 0 + alias_src_\rgb_fmt 0 +.endm diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c new file mode 100644 index 0000000..04be762 --- /dev/null +++ b/libswscale/arm/swscale_unscaled.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" +#include "libavutil/arm/cpu.h" + +extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma, + int width, int height, + int y_stride, int c_stride, int src_stride, + int32_t coeff_tbl[9]); + +extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma, + int width, int height, + int y_stride, int c_stride, int src_stride, + int32_t coeff_tbl[9]); + +static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) { + + rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY / 2) * dstStride[1], + context->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + context->input_rgb2yuv_table); + + return 0; +} + +static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) { + + rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY / 2) * dstStride[1], + context->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + context->input_rgb2yuv_table); + + return 0; +} + +static void get_unscaled_swscale_neon(SwsContext *c) { + int accurate_rnd = c->flags & SWS_ACCURATE_RND; + if (c->srcFormat == AV_PIX_FMT_RGBA + && c->dstFormat == AV_PIX_FMT_NV12 + && (c->srcW >= 16)) { + c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper + : rgbx_to_nv12_neon_16_wrapper; + } +} + +void ff_get_unscaled_swscale_arm(SwsContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + if (have_neon(cpu_flags)) + get_unscaled_swscale_neon(c); +} diff --git a/libswscale/bayer_template.c b/libswscale/bayer_template.c new file mode 100644 index 0000000..67ab2ae --- /dev/null +++ b/libswscale/bayer_template.c @@ -0,0 +1,236 @@ +/* + * Bayer-to-RGB/YV12 template + * Copyright (c) 2011-2014 Peter Ross <pross@xvid.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#if defined(BAYER_BGGR) || defined(BAYER_GBRG) +#define BAYER_R 0 +#define BAYER_G 1 +#define BAYER_B 2 +#endif +#if defined(BAYER_RGGB) || defined(BAYER_GRBG) +#define BAYER_R 2 +#define BAYER_G 1 +#define BAYER_B 0 +#endif + +#if defined(BAYER_8) +#define BAYER_READ(x) (x) +#define BAYER_SIZEOF 1 +#define BAYER_SHIFT 0 +#endif +#if defined(BAYER_16LE) +#define BAYER_READ(x) AV_RL16(&x) +#define BAYER_SIZEOF 2 +#define BAYER_SHIFT 8 +#endif +#if defined(BAYER_16BE) +#define BAYER_READ(x) AV_RB16(&x) +#define BAYER_SIZEOF 2 +#define BAYER_SHIFT 8 +#endif + +#define S(y, x) BAYER_READ(src[(y)*src_stride + BAYER_SIZEOF*(x)]) +#define T(y, x) (unsigned int)S(y, x) +#define R(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_R] +#define G(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_G] +#define B(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_B] + +#if defined(BAYER_BGGR) || defined(BAYER_RGGB) +#define BAYER_TO_RGB24_COPY \ + R(0, 0) = \ + R(0, 1) = \ + R(1, 1) = \ + R(1, 0) = S(1, 1) >> BAYER_SHIFT; \ + \ + G(0, 1) = S(0, 1) >> BAYER_SHIFT; \ + G(0, 0) = \ + G(1, 1) = (T(0, 1) + T(1, 0)) >> (1 + BAYER_SHIFT); \ + G(1, 0) = S(1, 0) >> BAYER_SHIFT; \ + \ + B(1, 1) = \ + B(0, 0) = \ + B(0, 1) = \ + B(1, 0) = S(0, 0) >> BAYER_SHIFT; +#define BAYER_TO_RGB24_INTERPOLATE \ + R(0, 0) = (T(-1, -1) + T(-1, 1) + T(1, -1) + T(1, 1)) >> (2 + BAYER_SHIFT); \ + G(0, 0) = (T(-1, 0) + T( 0, -1) + T(0, 1) + T(1, 0)) >> (2 + BAYER_SHIFT); \ + B(0, 0) = S(0, 0) >> BAYER_SHIFT; \ + \ + R(0, 1) = (T(-1, 1) + T(1, 1)) >> (1 + BAYER_SHIFT); \ + G(0, 1) = S(0, 1) >> BAYER_SHIFT; \ + B(0, 1) = (T(0, 0) + T(0, 2)) >> (1 + BAYER_SHIFT); \ + \ + R(1, 0) = (T(1, -1) + T(1, 1)) >> (1 + BAYER_SHIFT); \ + G(1, 0) = S(1, 0) >> BAYER_SHIFT; \ + B(1, 0) = (T(0, 0) + T(2, 0)) >> (1 + BAYER_SHIFT); \ + \ + R(1, 1) = S(1, 1) >> BAYER_SHIFT; \ + G(1, 1) = (T(0, 1) + T(1, 0) + T(1, 2) + T(2, 1)) >> (2 + BAYER_SHIFT); \ + B(1, 1) = (T(0, 0) + T(0, 2) + T(2, 0) + T(2, 2)) >> (2 + BAYER_SHIFT); +#else +#define BAYER_TO_RGB24_COPY \ + R(0, 0) = \ + R(0, 1) = \ + R(1, 1) = \ + R(1, 0) = S(1, 0) >> BAYER_SHIFT; \ + \ + G(0, 0) = S(0, 0) >> BAYER_SHIFT; \ + G(1, 1) = S(1, 1) >> BAYER_SHIFT; \ + G(0, 1) = \ + G(1, 0) = (T(0, 0) + T(1, 1)) >> (1 + BAYER_SHIFT); \ + \ + B(1, 1) = \ + B(0, 0) = \ + B(0, 1) = \ + B(1, 0) = S(0, 1) >> BAYER_SHIFT; +#define BAYER_TO_RGB24_INTERPOLATE \ + R(0, 0) = (T(-1, 0) + T(1, 0)) >> (1 + BAYER_SHIFT); \ + G(0, 0) = S(0, 0) >> BAYER_SHIFT; \ + B(0, 0) = (T(0, -1) + T(0, 1)) >> (1 + BAYER_SHIFT); \ + \ + R(0, 1) = (T(-1, 0) + T(-1, 2) + T(1, 0) + T(1, 2)) >> (2 + BAYER_SHIFT); \ + G(0, 1) = (T(-1, 1) + T(0, 0) + T(0, 2) + T(1, 1)) >> (2 + BAYER_SHIFT); \ + B(0, 1) = S(0, 1) >> BAYER_SHIFT; \ + \ + R(1, 0) = S(1, 0) >> BAYER_SHIFT; \ + G(1, 0) = (T(0, 0) + T(1, -1) + T(1, 1) + T(2, 0)) >> (2 + BAYER_SHIFT); \ + B(1, 0) = (T(0, -1) + T(0, 1) + T(2, -1) + T(2, 1)) >> (2 + BAYER_SHIFT); \ + \ + R(1, 1) = (T(1, 0) + T(1, 2)) >> (1 + BAYER_SHIFT); \ + G(1, 1) = S(1, 1) >> BAYER_SHIFT; \ + B(1, 1) = (T(0, 1) + T(2, 1)) >> (1 + BAYER_SHIFT); +#endif + +/** + * invoke ff_rgb24toyv12 for 2x2 pixels + */ +#define rgb24toyv12_2x2(src, dstY, dstU, dstV, luma_stride, src_stride, rgb2yuv) \ + ff_rgb24toyv12(src, dstY, dstV, dstU, 2, 2, luma_stride, 0, src_stride, rgb2yuv) + +static void BAYER_RENAME(rgb24_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width) +{ + int i; + for (i = 0 ; i < width; i+= 2) { + BAYER_TO_RGB24_COPY + src += 2 * BAYER_SIZEOF; + dst += 6; + } +} + +static void BAYER_RENAME(rgb24_interpolate)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width) +{ + int i; + + BAYER_TO_RGB24_COPY + src += 2 * BAYER_SIZEOF; + dst += 6; + + for (i = 2 ; i < width - 2; i+= 2) { + BAYER_TO_RGB24_INTERPOLATE + src += 2 * BAYER_SIZEOF; + dst += 6; + } + + if (width > 2) { + BAYER_TO_RGB24_COPY + } +} + +static void BAYER_RENAME(yv12_copy)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv) +{ + uint8_t dst[12]; + const int dst_stride = 6; + int i; + for (i = 0 ; i < width; i+= 2) { + BAYER_TO_RGB24_COPY + rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv); + src += 2 * BAYER_SIZEOF; + dstY += 2; + dstU++; + dstV++; + } +} + +static void BAYER_RENAME(yv12_interpolate)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv) +{ + uint8_t dst[12]; + const int dst_stride = 6; + int i; + + BAYER_TO_RGB24_COPY + rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv); + src += 2 * BAYER_SIZEOF; + dstY += 2; + dstU++; + dstV++; + + for (i = 2 ; i < width - 2; i+= 2) { + BAYER_TO_RGB24_INTERPOLATE + rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv); + src += 2 * BAYER_SIZEOF; + dstY += 2; + dstU++; + dstV++; + } + + if (width > 2) { + BAYER_TO_RGB24_COPY + rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv); + } +} + +#undef S +#undef T +#undef R +#undef G +#undef B +#undef BAYER_TO_RGB24_COPY +#undef BAYER_TO_RGB24_INTERPOLATE + +#undef BAYER_RENAME + +#undef BAYER_R +#undef BAYER_G +#undef BAYER_B +#undef BAYER_READ +#undef BAYER_SIZEOF +#undef BAYER_SHIFT + +#if defined(BAYER_BGGR) +#undef BAYER_BGGR +#endif +#if defined(BAYER_RGGB) +#undef BAYER_RGGB +#endif +#if defined(BAYER_GBRG) +#undef BAYER_GBRG +#endif +#if defined(BAYER_GRBG) +#undef BAYER_GRBG +#endif +#if defined(BAYER_8) +#undef BAYER_8 +#endif +#if defined(BAYER_16LE) +#undef BAYER_16LE +#endif +#if defined(BAYER_16BE) +#undef BAYER_16BE +#endif diff --git a/libswscale/colorspace-test.c b/libswscale/colorspace-test.c index fbf595d..42a915b 100644 --- a/libswscale/colorspace-test.c +++ b/libswscale/colorspace-test.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -33,7 +33,7 @@ #define FUNC(s, d, n) { s, d, #n, n } -int main(void) +int main(int argc, char **argv) { int i, funcNum; uint8_t *srcBuffer = av_malloc(SIZE); @@ -54,6 +54,7 @@ int main(void) const char *name; void (*func)(const uint8_t *src, uint8_t *dst, int src_size); } func_info[] = { + FUNC(2, 2, rgb12to15), FUNC(2, 2, rgb15to16), FUNC(2, 3, rgb15to24), FUNC(2, 4, rgb15to32), @@ -66,6 +67,7 @@ int main(void) FUNC(4, 2, rgb32to16), FUNC(4, 3, rgb32to24), FUNC(2, 2, rgb16to15), + FUNC(2, 2, rgb12tobgr12), FUNC(2, 2, rgb15tobgr15), FUNC(2, 2, rgb15tobgr16), FUNC(2, 3, rgb15tobgr24), @@ -82,6 +84,12 @@ int main(void) FUNC(4, 2, rgb32tobgr16), FUNC(4, 3, rgb32tobgr24), FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */ + FUNC(6, 6, rgb48tobgr48_nobswap), + FUNC(6, 6, rgb48tobgr48_bswap), + FUNC(8, 6, rgb64to48_nobswap), + FUNC(8, 6, rgb64to48_bswap), + FUNC(8, 6, rgb64tobgr48_nobswap), + FUNC(8, 6, rgb64tobgr48_bswap), FUNC(0, 0, NULL) }; int width; diff --git a/libswscale/hscale_fast_bilinear.c b/libswscale/hscale_fast_bilinear.c new file mode 100644 index 0000000..82d6177 --- /dev/null +++ b/libswscale/hscale_fast_bilinear.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "swscale_internal.h" + +void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth, + const uint8_t *src, int srcW, int xInc) +{ + int i; + unsigned int xpos = 0; + for (i = 0; i < dstWidth; i++) { + register unsigned int xx = xpos >> 16; + register unsigned int xalpha = (xpos & 0xFFFF) >> 9; + dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha; + xpos += xInc; + } + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) + dst[i] = src[srcW-1]*128; +} + +void ff_hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2, + int dstWidth, const uint8_t *src1, + const uint8_t *src2, int srcW, int xInc) +{ + int i; + unsigned int xpos = 0; + for (i = 0; i < dstWidth; i++) { + register unsigned int xx = xpos >> 16; + register unsigned int xalpha = (xpos & 0xFFFF) >> 9; + dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha); + dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha); + xpos += xInc; + } + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { + dst1[i] = src1[srcW-1]*128; + dst2[i] = src2[srcW-1]*128; + } +} diff --git a/libswscale/input.c b/libswscale/input.c index f583b3f..6716f0d 100644 --- a/libswscale/input.c +++ b/libswscale/input.c @@ -1,24 +1,23 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <assert.h> #include <math.h> #include <stdint.h> #include <stdio.h> @@ -30,38 +29,117 @@ #include "libavutil/intreadwrite.h" #include "libavutil/mathematics.h" #include "libavutil/pixdesc.h" +#include "libavutil/avassert.h" #include "config.h" #include "rgb2rgb.h" #include "swscale.h" #include "swscale_internal.h" -#define RGB2YUV_SHIFT 15 -#define BY ((int)(0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define BV (-(int)(0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define BU ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define GY ((int)(0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define GV (-(int)(0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define GU (-(int)(0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define RY ((int)(0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define RV ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define RU (-(int)(0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) - #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) -#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE) ? b_r : r_b) -#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE) ? r_b : b_r) +#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b) +#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r) + +static av_always_inline void +rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width, + enum AVPixelFormat origin, int32_t *rgb2yuv) +{ + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; + int i; + for (i = 0; i < width; i++) { + unsigned int r_b = input_pixel(&src[i*4+0]); + unsigned int g = input_pixel(&src[i*4+1]); + unsigned int b_r = input_pixel(&src[i*4+2]); + + dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; + } +} + +static av_always_inline void +rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV, + const uint16_t *src1, const uint16_t *src2, + int width, enum AVPixelFormat origin, int32_t *rgb2yuv) +{ + int i; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + av_assert1(src1==src2); + for (i = 0; i < width; i++) { + int r_b = input_pixel(&src1[i*4+0]); + int g = input_pixel(&src1[i*4+1]); + int b_r = input_pixel(&src1[i*4+2]); + + dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; + dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; + } +} + +static av_always_inline void +rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV, + const uint16_t *src1, const uint16_t *src2, + int width, enum AVPixelFormat origin, int32_t *rgb2yuv) +{ + int i; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + av_assert1(src1==src2); + for (i = 0; i < width; i++) { + int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1; + int g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1; + int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1; + + dstU[i]= (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; + dstV[i]= (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; + } +} + +#define rgb64funcs(pattern, BE_LE, origin) \ +static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\ + int width, uint32_t *rgb2yuv) \ +{ \ + const uint16_t *src = (const uint16_t *) _src; \ + uint16_t *dst = (uint16_t *) _dst; \ + rgb64ToY_c_template(dst, src, width, origin, rgb2yuv); \ +} \ + \ +static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \ + const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \ + int width, uint32_t *rgb2yuv) \ +{ \ + const uint16_t *src1 = (const uint16_t *) _src1, \ + *src2 = (const uint16_t *) _src2; \ + uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \ + rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ +} \ + \ +static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \ + const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \ + int width, uint32_t *rgb2yuv) \ +{ \ + const uint16_t *src1 = (const uint16_t *) _src1, \ + *src2 = (const uint16_t *) _src2; \ + uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \ + rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ +} + +rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE) +rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE) +rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE) +rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE) static av_always_inline void rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width, - enum AVPixelFormat origin) + enum AVPixelFormat origin, + int32_t *rgb2yuv) { + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; int i; for (i = 0; i < width; i++) { unsigned int r_b = input_pixel(&src[i * 3 + 0]); unsigned int g = input_pixel(&src[i * 3 + 1]); unsigned int b_r = input_pixel(&src[i * 3 + 2]); - dst[i] = (RY * r + GY * g + BY * b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; } } @@ -70,17 +148,20 @@ static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU, const uint16_t *src1, const uint16_t *src2, int width, - enum AVPixelFormat origin) + enum AVPixelFormat origin, + int32_t *rgb2yuv) { int i; - assert(src1 == src2); + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + av_assert1(src1 == src2); for (i = 0; i < width; i++) { int r_b = input_pixel(&src1[i * 3 + 0]); int g = input_pixel(&src1[i * 3 + 1]); int b_r = input_pixel(&src1[i * 3 + 2]); - dstU[i] = (RU * r + GU * g + BU * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; - dstV[i] = (RV * r + GV * g + BV * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; } } @@ -89,10 +170,13 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU, const uint16_t *src1, const uint16_t *src2, int width, - enum AVPixelFormat origin) + enum AVPixelFormat origin, + int32_t *rgb2yuv) { int i; - assert(src1 == src2); + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + av_assert1(src1 == src2); for (i = 0; i < width; i++) { int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1; @@ -101,8 +185,8 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU, int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1; - dstU[i] = (RU * r + GU * g + BU * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; - dstV[i] = (RV * r + GV * g + BV * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; } } @@ -113,40 +197,43 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU, #define rgb48funcs(pattern, BE_LE, origin) \ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, \ const uint8_t *_src, \ + const uint8_t *unused0, const uint8_t *unused1,\ int width, \ - uint32_t *unused) \ + uint32_t *rgb2yuv) \ { \ const uint16_t *src = (const uint16_t *)_src; \ uint16_t *dst = (uint16_t *)_dst; \ - rgb48ToY_c_template(dst, src, width, origin); \ + rgb48ToY_c_template(dst, src, width, origin, rgb2yuv); \ } \ \ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \ uint8_t *_dstV, \ + const uint8_t *unused0, \ const uint8_t *_src1, \ const uint8_t *_src2, \ int width, \ - uint32_t *unused) \ + uint32_t *rgb2yuv) \ { \ const uint16_t *src1 = (const uint16_t *)_src1, \ *src2 = (const uint16_t *)_src2; \ uint16_t *dstU = (uint16_t *)_dstU, \ *dstV = (uint16_t *)_dstV; \ - rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \ + rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ } \ \ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \ uint8_t *_dstV, \ + const uint8_t *unused0, \ const uint8_t *_src1, \ const uint8_t *_src2, \ int width, \ - uint32_t *unused) \ + uint32_t *rgb2yuv) \ { \ const uint16_t *src1 = (const uint16_t *)_src1, \ *src2 = (const uint16_t *)_src2; \ uint16_t *dstU = (uint16_t *)_dstU, \ *dstV = (uint16_t *)_dstV; \ - rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \ + rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ } rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE) @@ -162,7 +249,7 @@ rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE) : (isBE(origin) ? AV_RB16(&src[(i) * 2]) \ : AV_RL16(&src[(i) * 2]))) -static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst, +static av_always_inline void rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src, int width, enum AVPixelFormat origin, @@ -170,10 +257,11 @@ static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst, int shb, int shp, int maskr, int maskg, int maskb, int rsh, - int gsh, int bsh, int S) + int gsh, int bsh, int S, + int32_t *rgb2yuv) { - const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh; - const unsigned rnd = 33u << (S - 1); + const int ry = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh, by = rgb2yuv[BY_IDX]<<bsh; + const unsigned rnd = (32<<((S)-1)) + (1<<(S-7)); int i; for (i = 0; i < width; i++) { @@ -182,12 +270,12 @@ static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst, int g = (px & maskg) >> shg; int r = (px & maskr) >> shr; - dst[i] = (ry * r + gy * g + by * b + rnd) >> S; + dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6); } } -static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU, - uint8_t *dstV, +static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU, + int16_t *dstV, const uint8_t *src, int width, enum AVPixelFormat origin, @@ -195,11 +283,12 @@ static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU, int shb, int shp, int maskr, int maskg, int maskb, int rsh, - int gsh, int bsh, int S) + int gsh, int bsh, int S, + int32_t *rgb2yuv) { - const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh, - rv = RV << rsh, gv = GV << gsh, bv = BV << bsh; - const unsigned rnd = 257u << (S - 1); + const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, bu = rgb2yuv[BU_IDX] << bsh, + rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh; + const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7)); int i; for (i = 0; i < width; i++) { @@ -208,13 +297,13 @@ static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU, int g = (px & maskg) >> shg; int r = (px & maskr) >> shr; - dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S; - dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S; + dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6); + dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6); } } -static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU, - uint8_t *dstV, +static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU, + int16_t *dstV, const uint8_t *src, int width, enum AVPixelFormat origin, @@ -222,20 +311,21 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU, int shb, int shp, int maskr, int maskg, int maskb, int rsh, - int gsh, int bsh, int S) + int gsh, int bsh, int S, + int32_t *rgb2yuv) { - const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh, - rv = RV << rsh, gv = GV << gsh, bv = BV << bsh, + const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, bu = rgb2yuv[BU_IDX] << bsh, + rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh, maskgx = ~(maskr | maskb); - const unsigned rnd = 257u << S; + const unsigned rnd = (256U<<(S)) + (1<<(S-6)); int i; maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1; for (i = 0; i < width; i++) { - int px0 = input_pixel(2 * i + 0) >> shp; - int px1 = input_pixel(2 * i + 1) >> shp; + unsigned px0 = input_pixel(2 * i + 0) >> shp; + unsigned px1 = input_pixel(2 * i + 1) >> shp; int b, r, g = (px0 & maskgx) + (px1 & maskgx); int rb = px0 + px1 - g; @@ -249,8 +339,8 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU, } r = (rb & maskr) >> shr; - dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1); - dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1); + dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1); + dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1); } } @@ -258,31 +348,31 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU, #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \ maskg, maskb, rsh, gsh, bsh, S) \ -static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \ - int width, uint32_t *unused) \ +static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \ + int width, uint32_t *tab) \ { \ - rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \ - maskr, maskg, maskb, rsh, gsh, bsh, S); \ + rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp, \ + maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \ } \ \ static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \ - const uint8_t *src, const uint8_t *dummy, \ - int width, uint32_t *unused) \ + const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \ + int width, uint32_t *tab) \ { \ - rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, \ + rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \ shr, shg, shb, shp, \ - maskr, maskg, maskb, rsh, gsh, bsh, S); \ + maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\ } \ \ static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \ - const uint8_t *src, \ + const uint8_t *unused0, const uint8_t *src, \ const uint8_t *dummy, \ - int width, uint32_t *unused) \ + int width, uint32_t *tab) \ { \ - rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, \ + rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \ shr, shg, shb, shp, \ maskr, maskg, maskb, \ - rsh, gsh, bsh, S); \ + rsh, gsh, bsh, S, tab); \ } rgb16_32_wrapper(AV_PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8) @@ -302,71 +392,127 @@ rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0, 0, 0, 0xF800, 0x07E0, rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT + 7) rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT + 4) -static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, - uint32_t *unused) +static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, + const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc, + int width, uint32_t *rgb2yuv) { + uint16_t *dstU = (uint16_t *)_dstU; + uint16_t *dstV = (uint16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + int i; - for (i = 0; i < width; i++) - dst[i] = src[4 * i]; + for (i = 0; i < width; i++) { + unsigned int g = gsrc[2*i] + gsrc[2*i+1]; + unsigned int b = bsrc[2*i] + bsrc[2*i+1]; + unsigned int r = rsrc[2*i] + rsrc[2*i+1]; + + dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1); + dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1); + } } -static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, - uint32_t *unused) +static void rgba64ToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, + const uint8_t *unused2, int width, uint32_t *unused) { + int16_t *dst = (int16_t *)_dst; + const uint16_t *src = (const uint16_t *)_src; int i; for (i = 0; i < width; i++) dst[i] = src[4 * i + 3]; } -static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal) +static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) +{ + int16_t *dst = (int16_t *)_dst; + int i; + for (i=0; i<width; i++) { + dst[i]= src[4*i]<<6; + } +} + +static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) +{ + int16_t *dst = (int16_t *)_dst; + int i; + for (i=0; i<width; i++) { + dst[i]= src[4*i+3]<<6; + } +} + +static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal) { + int16_t *dst = (int16_t *)_dst; + int i; + for (i=0; i<width; i++) { + int d= src[i]; + + dst[i]= (pal[d] >> 24)<<6; + } +} + +static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal) +{ + int16_t *dst = (int16_t *)_dst; int i; for (i = 0; i < width; i++) { int d = src[i]; - dst[i] = pal[d] & 0xFF; + dst[i] = (pal[d] & 0xFF)<<6; } } -static void palToUV_c(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, +static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV, + const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *pal) { + uint16_t *dstU = (uint16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; int i; - assert(src1 == src2); + av_assert1(src1 == src2); for (i = 0; i < width; i++) { int p = pal[src1[i]]; - dstU[i] = p >> 8; - dstV[i] = p >> 16; + dstU[i] = (uint8_t)(p>> 8)<<6; + dstV[i] = (uint8_t)(p>>16)<<6; } } -static void monowhite2Y_c(uint8_t *dst, const uint8_t *src, - int width, uint32_t *unused) +static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { + int16_t *dst = (int16_t *)_dst; int i, j; width = (width + 7) >> 3; for (i = 0; i < width; i++) { int d = ~src[i]; for (j = 0; j < 8; j++) - dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255; + dst[8*i+j]= ((d>>(7-j))&1) * 16383; + } + if(width&7){ + int d= ~src[i]; + for (j = 0; j < (width&7); j++) + dst[8*i+j]= ((d>>(7-j))&1) * 16383; } } -static void monoblack2Y_c(uint8_t *dst, const uint8_t *src, - int width, uint32_t *unused) +static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { + int16_t *dst = (int16_t *)_dst; int i, j; width = (width + 7) >> 3; for (i = 0; i < width; i++) { int d = src[i]; for (j = 0; j < 8; j++) - dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255; + dst[8*i+j]= ((d>>(7-j))&1) * 16383; + } + if(width&7){ + int d = src[i]; + for (j = 0; j < (width&7); j++) + dst[8*i+j] = ((d>>(7-j))&1) * 16383; } } -static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width, +static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { int i; @@ -374,7 +520,7 @@ static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width, dst[i] = src[2 * i]; } -static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, +static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { int i; @@ -382,10 +528,10 @@ static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, dstU[i] = src1[4 * i + 1]; dstV[i] = src1[4 * i + 3]; } - assert(src1 == src2); + av_assert1(src1 == src2); } -static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, +static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { int i; @@ -393,10 +539,10 @@ static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, dstV[i] = src1[4 * i + 1]; dstU[i] = src1[4 * i + 3]; } - assert(src1 == src2); + av_assert1(src1 == src2); } -static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, +static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { int i; @@ -406,7 +552,7 @@ static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, dst[i] = av_bswap16(src[i]); } -static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1, +static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, int width, uint32_t *unused) { int i; @@ -419,7 +565,7 @@ static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1, } } -static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, int width, +static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { int i; @@ -427,7 +573,7 @@ static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, int width, AV_WN16(dst + i * 2, AV_RL16(src + i * 4)); } -static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, int width, +static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { int i; @@ -435,7 +581,7 @@ static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, int width, AV_WN16(dst + i * 2, AV_RL16(src + i * 4 + 2)); } -static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, int width, +static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { int i; @@ -443,7 +589,7 @@ static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, int width, AV_WN16(dst + i * 2, AV_RB16(src + i * 4)); } -static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, int width, +static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { int i; @@ -453,7 +599,7 @@ static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, int width, /* This is almost identical to the previous, end exists only because * yuy2ToY/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */ -static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width, +static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { int i; @@ -461,7 +607,7 @@ static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width, dst[i] = src[2 * i + 1]; } -static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, +static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { int i; @@ -469,7 +615,7 @@ static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, dstU[i] = src1[4 * i + 0]; dstV[i] = src1[4 * i + 2]; } - assert(src1 == src2); + av_assert1(src1 == src2); } static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2, @@ -483,14 +629,14 @@ static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2, } static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, + const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { nvXXtoUV_c(dstU, dstV, src1, width); } static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, + const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { nvXXtoUV_c(dstV, dstU, src1, width); @@ -498,218 +644,213 @@ static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV, #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) -static void bgr24ToY_c(uint8_t *dst, const uint8_t *src, - int width, uint32_t *unused) +static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, + int width, uint32_t *rgb2yuv) { + int16_t *dst = (int16_t *)_dst; + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; int i; for (i = 0; i < width; i++) { int b = src[i * 3 + 0]; int g = src[i * 3 + 1]; int r = src[i * 3 + 2]; - dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); + dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6)); } } -static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, - const uint8_t *src2, int width, uint32_t *unused) +static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) { + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; int i; for (i = 0; i < width; i++) { int b = src1[3 * i + 0]; int g = src1[3 * i + 1]; int r = src1[3 * i + 2]; - dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; - dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); + dstV[i] = (rv*r + gv*g + bv*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); } - assert(src1 == src2); + av_assert1(src1 == src2); } -static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, - const uint8_t *src2, int width, uint32_t *unused) +static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) { + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; int i; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; for (i = 0; i < width; i++) { int b = src1[6 * i + 0] + src1[6 * i + 3]; int g = src1[6 * i + 1] + src1[6 * i + 4]; int r = src1[6 * i + 2] + src1[6 * i + 5]; - dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1); - dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1); + dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); + dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); } - assert(src1 == src2); + av_assert1(src1 == src2); } -static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width, - uint32_t *unused) +static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, + uint32_t *rgb2yuv) { + int16_t *dst = (int16_t *)_dst; + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; int i; for (i = 0; i < width; i++) { int r = src[i * 3 + 0]; int g = src[i * 3 + 1]; int b = src[i * 3 + 2]; - dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); + dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6)); } } -static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, - const uint8_t *src2, int width, uint32_t *unused) +static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) { + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; int i; - assert(src1 == src2); + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + av_assert1(src1 == src2); for (i = 0; i < width; i++) { int r = src1[3 * i + 0]; int g = src1[3 * i + 1]; int b = src1[3 * i + 2]; - dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; - dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); + dstV[i] = (rv*r + gv*g + bv*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); } } -static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, - const uint8_t *src2, int width, uint32_t *unused) +static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) { + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; int i; - assert(src1 == src2); + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + av_assert1(src1 == src2); for (i = 0; i < width; i++) { int r = src1[6 * i + 0] + src1[6 * i + 3]; int g = src1[6 * i + 1] + src1[6 * i + 4]; int b = src1[6 * i + 2] + src1[6 * i + 5]; - dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1); - dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1); + dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); + dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); } } -static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width) +static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv) { + uint16_t *dst = (uint16_t *)_dst; + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; int i; for (i = 0; i < width; i++) { int g = src[0][i]; int b = src[1][i]; int r = src[2][i]; - dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); + dst[i] = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6); } } -static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width) +static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused) { + uint16_t *dst = (uint16_t *)_dst; + int i; + for (i = 0; i < width; i++) + dst[i] = src[3][i] << 6; +} + +static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv) +{ + uint16_t *dstU = (uint16_t *)_dstU; + uint16_t *dstV = (uint16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; int i; for (i = 0; i < width; i++) { int g = src[0][i]; int b = src[1][i]; int r = src[2][i]; - dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; - dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT; + dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6); + dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6); } } #define rdpx(src) \ is_be ? AV_RB16(src) : AV_RL16(src) static av_always_inline void planar_rgb16_to_y(uint8_t *_dst, const uint8_t *_src[4], - int width, int bpc, int is_be) + int width, int bpc, int is_be, int32_t *rgb2yuv) { int i; const uint16_t **src = (const uint16_t **)_src; uint16_t *dst = (uint16_t *)_dst; + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; + int shift = bpc < 16 ? bpc : 14; for (i = 0; i < width; i++) { int g = rdpx(src[0] + i); int b = rdpx(src[1] + i); int r = rdpx(src[2] + i); - dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT + bpc - 9))) >> RGB2YUV_SHIFT); + dst[i] = ((ry*r + gy*g + by*b + (33 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + shift - 14)); } } -static void planar_rgb9le_to_y(uint8_t *dst, const uint8_t *src[4], int w) -{ - planar_rgb16_to_y(dst, src, w, 9, 0); -} - -static void planar_rgb9be_to_y(uint8_t *dst, const uint8_t *src[4], int w) -{ - planar_rgb16_to_y(dst, src, w, 9, 1); -} - -static void planar_rgb10le_to_y(uint8_t *dst, const uint8_t *src[4], int w) -{ - planar_rgb16_to_y(dst, src, w, 10, 0); -} - -static void planar_rgb10be_to_y(uint8_t *dst, const uint8_t *src[4], int w) -{ - planar_rgb16_to_y(dst, src, w, 10, 1); -} - -static void planar_rgb16le_to_y(uint8_t *dst, const uint8_t *src[4], int w) -{ - planar_rgb16_to_y(dst, src, w, 16, 0); -} - -static void planar_rgb16be_to_y(uint8_t *dst, const uint8_t *src[4], int w) -{ - planar_rgb16_to_y(dst, src, w, 16, 1); -} - static av_always_inline void planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width, - int bpc, int is_be) + int bpc, int is_be, int32_t *rgb2yuv) { int i; const uint16_t **src = (const uint16_t **)_src; uint16_t *dstU = (uint16_t *)_dstU; uint16_t *dstV = (uint16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + int shift = bpc < 16 ? bpc : 14; for (i = 0; i < width; i++) { int g = rdpx(src[0] + i); int b = rdpx(src[1] + i); int r = rdpx(src[2] + i); - dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> RGB2YUV_SHIFT; - dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> RGB2YUV_SHIFT; + dstU[i] = (ru*r + gu*g + bu*b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + shift - 14); + dstV[i] = (rv*r + gv*g + bv*b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + shift - 14); } } #undef rdpx -static void planar_rgb9le_to_uv(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src[4], int w) -{ - planar_rgb16_to_uv(dstU, dstV, src, w, 9, 0); -} - -static void planar_rgb9be_to_uv(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src[4], int w) -{ - planar_rgb16_to_uv(dstU, dstV, src, w, 9, 1); -} - -static void planar_rgb10le_to_uv(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src[4], int w) -{ - planar_rgb16_to_uv(dstU, dstV, src, w, 10, 0); -} - -static void planar_rgb10be_to_uv(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src[4], int w) -{ - planar_rgb16_to_uv(dstU, dstV, src, w, 10, 1); -} - -static void planar_rgb16le_to_uv(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src[4], int w) -{ - planar_rgb16_to_uv(dstU, dstV, src, w, 16, 0); -} - -static void planar_rgb16be_to_uv(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src[4], int w) -{ - planar_rgb16_to_uv(dstU, dstV, src, w, 16, 1); -} +#define rgb9plus_planar_funcs_endian(nbits, endian_name, endian) \ +static void planar_rgb##nbits##endian_name##_to_y(uint8_t *dst, const uint8_t *src[4], \ + int w, int32_t *rgb2yuv) \ +{ \ + planar_rgb16_to_y(dst, src, w, nbits, endian, rgb2yuv); \ +} \ +static void planar_rgb##nbits##endian_name##_to_uv(uint8_t *dstU, uint8_t *dstV, \ + const uint8_t *src[4], int w, int32_t *rgb2yuv) \ +{ \ + planar_rgb16_to_uv(dstU, dstV, src, w, nbits, endian, rgb2yuv); \ +} \ + +#define rgb9plus_planar_funcs(nbits) \ + rgb9plus_planar_funcs_endian(nbits, le, 0) \ + rgb9plus_planar_funcs_endian(nbits, be, 1) + +rgb9plus_planar_funcs(9) +rgb9plus_planar_funcs(10) +rgb9plus_planar_funcs(12) +rgb9plus_planar_funcs(14) +rgb9plus_planar_funcs(16) av_cold void ff_sws_init_input_funcs(SwsContext *c) { @@ -745,6 +886,13 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_GBRP10LE: c->readChrPlanar = planar_rgb10le_to_uv; break; + case AV_PIX_FMT_GBRP12LE: + c->readChrPlanar = planar_rgb12le_to_uv; + break; + case AV_PIX_FMT_GBRP14LE: + c->readChrPlanar = planar_rgb14le_to_uv; + break; + case AV_PIX_FMT_GBRAP16LE: case AV_PIX_FMT_GBRP16LE: c->readChrPlanar = planar_rgb16le_to_uv; break; @@ -754,9 +902,17 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_GBRP10BE: c->readChrPlanar = planar_rgb10be_to_uv; break; + case AV_PIX_FMT_GBRP12BE: + c->readChrPlanar = planar_rgb12be_to_uv; + break; + case AV_PIX_FMT_GBRP14BE: + c->readChrPlanar = planar_rgb14be_to_uv; + break; + case AV_PIX_FMT_GBRAP16BE: case AV_PIX_FMT_GBRP16BE: c->readChrPlanar = planar_rgb16be_to_uv; break; + case AV_PIX_FMT_GBRAP: case AV_PIX_FMT_GBRP: c->readChrPlanar = planar_rgb_to_uv; break; @@ -767,14 +923,21 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_YUV422P10LE: case AV_PIX_FMT_YUV444P10LE: case AV_PIX_FMT_YUV420P10LE: + case AV_PIX_FMT_YUV422P12LE: + case AV_PIX_FMT_YUV444P12LE: + case AV_PIX_FMT_YUV420P12LE: + case AV_PIX_FMT_YUV422P14LE: + case AV_PIX_FMT_YUV444P14LE: + case AV_PIX_FMT_YUV420P14LE: case AV_PIX_FMT_YUV420P16LE: case AV_PIX_FMT_YUV422P16LE: case AV_PIX_FMT_YUV444P16LE: + case AV_PIX_FMT_YUVA444P9LE: case AV_PIX_FMT_YUVA422P9LE: case AV_PIX_FMT_YUVA420P9LE: - case AV_PIX_FMT_YUVA422P10LE: case AV_PIX_FMT_YUVA444P10LE: + case AV_PIX_FMT_YUVA422P10LE: case AV_PIX_FMT_YUVA420P10LE: case AV_PIX_FMT_YUVA420P16LE: case AV_PIX_FMT_YUVA422P16LE: @@ -788,14 +951,21 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_YUV444P10BE: case AV_PIX_FMT_YUV422P10BE: case AV_PIX_FMT_YUV420P10BE: + case AV_PIX_FMT_YUV444P12BE: + case AV_PIX_FMT_YUV422P12BE: + case AV_PIX_FMT_YUV420P12BE: + case AV_PIX_FMT_YUV444P14BE: + case AV_PIX_FMT_YUV422P14BE: + case AV_PIX_FMT_YUV420P14BE: case AV_PIX_FMT_YUV420P16BE: case AV_PIX_FMT_YUV422P16BE: case AV_PIX_FMT_YUV444P16BE: + case AV_PIX_FMT_YUVA444P9BE: case AV_PIX_FMT_YUVA422P9BE: case AV_PIX_FMT_YUVA420P9BE: - case AV_PIX_FMT_YUVA422P10BE: case AV_PIX_FMT_YUVA444P10BE: + case AV_PIX_FMT_YUVA422P10BE: case AV_PIX_FMT_YUVA420P10BE: case AV_PIX_FMT_YUVA420P16BE: case AV_PIX_FMT_YUVA422P16BE: @@ -806,6 +976,18 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) } if (c->chrSrcHSubSample) { switch (srcFormat) { + case AV_PIX_FMT_RGBA64BE: + c->chrToYV12 = rgb64BEToUV_half_c; + break; + case AV_PIX_FMT_RGBA64LE: + c->chrToYV12 = rgb64LEToUV_half_c; + break; + case AV_PIX_FMT_BGRA64BE: + c->chrToYV12 = bgr64BEToUV_half_c; + break; + case AV_PIX_FMT_BGRA64LE: + c->chrToYV12 = bgr64LEToUV_half_c; + break; case AV_PIX_FMT_RGB48BE: c->chrToYV12 = rgb48BEToUV_half_c; break; @@ -839,6 +1021,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break; + case AV_PIX_FMT_GBRAP: + case AV_PIX_FMT_GBRP: + c->chrToYV12 = gbr24pToUV_half_c; + break; case AV_PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_half_c; break; @@ -875,6 +1061,18 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) } } else { switch (srcFormat) { + case AV_PIX_FMT_RGBA64BE: + c->chrToYV12 = rgb64BEToUV_c; + break; + case AV_PIX_FMT_RGBA64LE: + c->chrToYV12 = rgb64LEToUV_c; + break; + case AV_PIX_FMT_BGRA64BE: + c->chrToYV12 = bgr64BEToUV_c; + break; + case AV_PIX_FMT_BGRA64LE: + c->chrToYV12 = bgr64LEToUV_c; + break; case AV_PIX_FMT_RGB48BE: c->chrToYV12 = rgb48BEToUV_c; break; @@ -953,6 +1151,13 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_GBRP10LE: c->readLumPlanar = planar_rgb10le_to_y; break; + case AV_PIX_FMT_GBRP12LE: + c->readLumPlanar = planar_rgb12le_to_y; + break; + case AV_PIX_FMT_GBRP14LE: + c->readLumPlanar = planar_rgb14le_to_y; + break; + case AV_PIX_FMT_GBRAP16LE: case AV_PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break; @@ -962,9 +1167,18 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_GBRP10BE: c->readLumPlanar = planar_rgb10be_to_y; break; + case AV_PIX_FMT_GBRP12BE: + c->readLumPlanar = planar_rgb12be_to_y; + break; + case AV_PIX_FMT_GBRP14BE: + c->readLumPlanar = planar_rgb14be_to_y; + break; + case AV_PIX_FMT_GBRAP16BE: case AV_PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break; + case AV_PIX_FMT_GBRAP: + c->readAlpPlanar = planar_rgb_to_a; case AV_PIX_FMT_GBRP: c->readLumPlanar = planar_rgb_to_y; break; @@ -975,9 +1189,16 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_YUV444P10LE: case AV_PIX_FMT_YUV422P10LE: case AV_PIX_FMT_YUV420P10LE: + case AV_PIX_FMT_YUV444P12LE: + case AV_PIX_FMT_YUV422P12LE: + case AV_PIX_FMT_YUV420P12LE: + case AV_PIX_FMT_YUV444P14LE: + case AV_PIX_FMT_YUV422P14LE: + case AV_PIX_FMT_YUV420P14LE: case AV_PIX_FMT_YUV420P16LE: case AV_PIX_FMT_YUV422P16LE: case AV_PIX_FMT_YUV444P16LE: + case AV_PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break; @@ -1000,9 +1221,16 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_YUV444P10BE: case AV_PIX_FMT_YUV422P10BE: case AV_PIX_FMT_YUV420P10BE: + case AV_PIX_FMT_YUV444P12BE: + case AV_PIX_FMT_YUV422P12BE: + case AV_PIX_FMT_YUV420P12BE: + case AV_PIX_FMT_YUV444P14BE: + case AV_PIX_FMT_YUV422P14BE: + case AV_PIX_FMT_YUV420P14BE: case AV_PIX_FMT_YUV420P16BE: case AV_PIX_FMT_YUV422P16BE: case AV_PIX_FMT_YUV444P16BE: + case AV_PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break; @@ -1114,9 +1342,28 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break; + case AV_PIX_FMT_RGBA64BE: + c->lumToYV12 = rgb64BEToY_c; + break; + case AV_PIX_FMT_RGBA64LE: + c->lumToYV12 = rgb64LEToY_c; + break; + case AV_PIX_FMT_BGRA64BE: + c->lumToYV12 = bgr64BEToY_c; + break; + case AV_PIX_FMT_BGRA64LE: + c->lumToYV12 = bgr64LEToY_c; } if (c->alpPixBuf) { + if (is16BPS(srcFormat) || isNBPS(srcFormat)) { + if (HAVE_BIGENDIAN == !isBE(srcFormat)) + c->alpToYV12 = bswap16Y_c; + } switch (srcFormat) { + case AV_PIX_FMT_BGRA64LE: + case AV_PIX_FMT_BGRA64BE: + case AV_PIX_FMT_RGBA64LE: + case AV_PIX_FMT_RGBA64BE: c->alpToYV12 = rgba64ToA_c; break; case AV_PIX_FMT_BGRA: case AV_PIX_FMT_RGBA: c->alpToYV12 = rgbaToA_c; @@ -1128,6 +1375,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_YA8: c->alpToYV12 = uyvyToY_c; break; + case AV_PIX_FMT_PAL8 : + c->alpToYV12 = palToA_c; + break; } } } diff --git a/libswscale/options.c b/libswscale/options.c index e7765d6..5433d55 100644 --- a/libswscale/options.c +++ b/libswscale/options.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -34,7 +34,7 @@ static const char *sws_context_to_name(void *ptr) #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM static const AVOption swscale_options[] = { - { "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = DEFAULT }, 0, UINT_MAX, VE, "sws_flags" }, + { "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = SWS_BICUBIC }, 0, UINT_MAX, VE, "sws_flags" }, { "fast_bilinear", "fast bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FAST_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" }, { "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" }, { "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBIC }, INT_MIN, INT_MAX, VE, "sws_flags" }, @@ -51,6 +51,7 @@ static const AVOption swscale_options[] = { { "full_chroma_int", "full chroma interpolation", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" }, { "full_chroma_inp", "full chroma input", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" }, { "bitexact", "", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" }, + { "error_diffusion", "error diffusion dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ERROR_DIFFUSION}, INT_MIN, INT_MAX, VE, "sws_flags" }, { "srcw", "source width", OFFSET(srcW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE }, { "srch", "source height", OFFSET(srcH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE }, @@ -63,10 +64,28 @@ static const AVOption swscale_options[] = { { "param0", "scaler param 0", OFFSET(param[0]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE }, { "param1", "scaler param 1", OFFSET(param[1]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE }, + { "src_v_chr_pos", "source vertical chroma position in luma grid/256" , OFFSET(src_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE }, + { "src_h_chr_pos", "source horizontal chroma position in luma grid/256", OFFSET(src_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE }, + { "dst_v_chr_pos", "destination vertical chroma position in luma grid/256" , OFFSET(dst_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE }, + { "dst_h_chr_pos", "destination horizontal chroma position in luma grid/256", OFFSET(dst_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE }, + + { "sws_dither", "set dithering algorithm", OFFSET(dither), AV_OPT_TYPE_INT, { .i64 = SWS_DITHER_AUTO }, 0, NB_SWS_DITHER, VE, "sws_dither" }, + { "auto", "leave choice to sws", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_AUTO }, INT_MIN, INT_MAX, VE, "sws_dither" }, + { "bayer", "bayer dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_BAYER }, INT_MIN, INT_MAX, VE, "sws_dither" }, + { "ed", "error diffusion", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_ED }, INT_MIN, INT_MAX, VE, "sws_dither" }, + { "a_dither", "arithmetic addition dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_A_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" }, + { "x_dither", "arithmetic xor dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_X_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" }, + { NULL } }; -const AVClass sws_context_class = { "SWScaler", sws_context_to_name, swscale_options }; +const AVClass sws_context_class = { + .class_name = "SWScaler", + .item_name = sws_context_to_name, + .option = swscale_options, + .category = AV_CLASS_CATEGORY_SWSCALER, + .version = LIBAVUTIL_VERSION_INT, +}; const AVClass *sws_get_class(void) { diff --git a/libswscale/output.c b/libswscale/output.c index 125d998..eee6b48 100644 --- a/libswscale/output.c +++ b/libswscale/output.c @@ -1,24 +1,23 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <assert.h> #include <math.h> #include <stdint.h> #include <stdio.h> @@ -26,6 +25,7 @@ #include "libavutil/attributes.h" #include "libavutil/avutil.h" +#include "libavutil/avassert.h" #include "libavutil/bswap.h" #include "libavutil/cpu.h" #include "libavutil/intreadwrite.h" @@ -36,24 +36,27 @@ #include "swscale.h" #include "swscale_internal.h" -DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={ +DECLARE_ALIGNED(8, const uint8_t, ff_dither_2x2_4)[][8] = { { 1, 3, 1, 3, 1, 3, 1, 3, }, { 2, 0, 2, 0, 2, 0, 2, 0, }, +{ 1, 3, 1, 3, 1, 3, 1, 3, }, }; -DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={ +DECLARE_ALIGNED(8, const uint8_t, ff_dither_2x2_8)[][8] = { { 6, 2, 6, 2, 6, 2, 6, 2, }, { 0, 4, 0, 4, 0, 4, 0, 4, }, +{ 6, 2, 6, 2, 6, 2, 6, 2, }, }; -DECLARE_ALIGNED(8, const uint8_t, ff_dither_4x4_16)[4][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_4x4_16)[][8] = { { 8, 4, 11, 7, 8, 4, 11, 7, }, { 2, 14, 1, 13, 2, 14, 1, 13, }, { 10, 6, 9, 5, 10, 6, 9, 5, }, { 0, 12, 3, 15, 0, 12, 3, 15, }, +{ 8, 4, 11, 7, 8, 4, 11, 7, }, }; -DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_32)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_32)[][8] = { { 17, 9, 23, 15, 16, 8, 22, 14, }, { 5, 29, 3, 27, 4, 28, 2, 26, }, { 21, 13, 19, 11, 20, 12, 18, 10, }, @@ -62,9 +65,10 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_32)[8][8] = { { 4, 28, 2, 26, 5, 29, 3, 27, }, { 20, 12, 18, 10, 21, 13, 19, 11, }, { 1, 25, 7, 31, 0, 24, 6, 30, }, +{ 17, 9, 23, 15, 16, 8, 22, 14, }, }; -DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_73)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_73)[][8] = { { 0, 55, 14, 68, 3, 58, 17, 72, }, { 37, 18, 50, 32, 40, 22, 54, 35, }, { 9, 64, 5, 59, 13, 67, 8, 63, }, @@ -73,10 +77,11 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_73)[8][8] = { { 39, 21, 52, 34, 38, 19, 51, 33, }, { 11, 66, 7, 62, 10, 65, 6, 60, }, { 48, 30, 43, 25, 47, 29, 42, 24, }, +{ 0, 55, 14, 68, 3, 58, 17, 72, }, }; #if 1 -DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = { {117, 62, 158, 103, 113, 58, 155, 100, }, { 34, 199, 21, 186, 31, 196, 17, 182, }, {144, 89, 131, 76, 141, 86, 127, 72, }, @@ -85,10 +90,11 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { { 28, 193, 14, 179, 38, 203, 24, 189, }, {138, 83, 124, 69, 148, 93, 134, 79, }, { 7, 172, 48, 213, 3, 168, 45, 210, }, +{117, 62, 158, 103, 113, 58, 155, 100, }, }; #elif 1 // tries to correct a gamma of 1.5 -DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = { { 0, 143, 18, 200, 2, 156, 25, 215, }, { 78, 28, 125, 64, 89, 36, 138, 74, }, { 10, 180, 3, 161, 16, 195, 8, 175, }, @@ -97,10 +103,11 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { { 85, 33, 134, 71, 81, 30, 130, 67, }, { 14, 190, 6, 171, 12, 185, 5, 166, }, {117, 57, 101, 44, 113, 54, 97, 41, }, +{ 0, 143, 18, 200, 2, 156, 25, 215, }, }; #elif 1 // tries to correct a gamma of 2.0 -DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = { { 0, 124, 8, 193, 0, 140, 12, 213, }, { 55, 14, 104, 42, 66, 19, 119, 52, }, { 3, 168, 1, 145, 6, 187, 3, 162, }, @@ -109,10 +116,11 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { { 62, 17, 114, 48, 58, 16, 109, 45, }, { 5, 181, 2, 157, 4, 175, 1, 151, }, { 95, 36, 78, 26, 90, 34, 74, 24, }, +{ 0, 124, 8, 193, 0, 140, 12, 213, }, }; #else // tries to correct a gamma of 2.5 -DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = { { 0, 107, 3, 187, 0, 125, 6, 212, }, { 39, 7, 86, 28, 49, 11, 102, 36, }, { 1, 158, 0, 131, 3, 180, 1, 151, }, @@ -121,6 +129,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[8][8] = { { 45, 9, 96, 33, 42, 8, 91, 30, }, { 2, 172, 1, 144, 2, 165, 0, 137, }, { 77, 23, 60, 15, 72, 21, 56, 14, }, +{ 0, 107, 3, 187, 0, 125, 6, 212, }, }; #endif @@ -136,7 +145,8 @@ yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW, int big_endian, int output_bits) { int i; - int shift = 19 - output_bits; + int shift = 3; + av_assert0(output_bits == 16); for (i = 0; i < dstW; i++) { int val = src[i] + (1 << (shift - 1)); @@ -150,10 +160,11 @@ yuv2planeX_16_c_template(const int16_t *filter, int filterSize, int big_endian, int output_bits) { int i; - int shift = 15 + 16 - output_bits; + int shift = 15; + av_assert0(output_bits == 16); for (i = 0; i < dstW; i++) { - int val = 1 << (30-output_bits); + int val = 1 << (shift - 1); int j; /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline @@ -163,7 +174,7 @@ yuv2planeX_16_c_template(const int16_t *filter, int filterSize, * reasonable filterSize), and re-add that at the end. */ val -= 0x40000000; for (j = 0; j < filterSize; j++) - val += src[j][i] * filter[j]; + val += src[j][i] * (unsigned)filter[j]; output_pixel(&dest[i], val, 0x8000, int); } @@ -200,7 +211,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize, int shift = 11 + 16 - output_bits; for (i = 0; i < dstW; i++) { - int val = 1 << (26-output_bits); + int val = 1 << (shift - 1); int j; for (j = 0; j < filterSize; j++) @@ -232,6 +243,10 @@ yuv2NBPS( 9, BE, 1, 10, int16_t) yuv2NBPS( 9, LE, 0, 10, int16_t) yuv2NBPS(10, BE, 1, 10, int16_t) yuv2NBPS(10, LE, 0, 10, int16_t) +yuv2NBPS(12, BE, 1, 10, int16_t) +yuv2NBPS(12, LE, 0, 10, int16_t) +yuv2NBPS(14, BE, 1, 10, int16_t) +yuv2NBPS(14, LE, 0, 10, int16_t) yuv2NBPS(16, BE, 1, 16, int32_t) yuv2NBPS(16, LE, 0, 16, int32_t) @@ -317,6 +332,7 @@ yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter, const uint8_t * const d128 = ff_dither_8x8_220[y&7]; int i; unsigned acc = 0; + int err = 0; for (i = 0; i < dstW; i += 2) { int j; @@ -333,12 +349,25 @@ yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter, Y1 = av_clip_uint8(Y1); Y2 = av_clip_uint8(Y2); } - accumulate_bit(acc, Y1 + d128[(i + 0) & 7]); - accumulate_bit(acc, Y2 + d128[(i + 1) & 7]); + if (c->dither == SWS_DITHER_ED) { + Y1 += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4; + c->dither_error[0][i] = err; + acc = 2*acc + (Y1 >= 128); + Y1 -= 220*(acc&1); + + err = Y2 + ((7*Y1 + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4); + c->dither_error[0][i+1] = Y1; + acc = 2*acc + (err >= 128); + err -= 220*(acc&1); + } else { + accumulate_bit(acc, Y1 + d128[(i + 0) & 7]); + accumulate_bit(acc, Y2 + d128[(i + 1) & 7]); + } if ((i & 7) == 6) { output_pixel(*dest++, acc); } } + c->dither_error[0][i] = err; if (i & 6) { output_pixel(*dest, acc); @@ -357,6 +386,29 @@ yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2], int yalpha1 = 4096 - yalpha; int i; + if (c->dither == SWS_DITHER_ED) { + int err = 0; + int acc = 0; + for (i = 0; i < dstW; i +=2) { + int Y; + + Y = (buf0[i + 0] * yalpha1 + buf1[i + 0] * yalpha) >> 19; + Y += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4; + c->dither_error[0][i] = err; + acc = 2*acc + (Y >= 128); + Y -= 220*(acc&1); + + err = (buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19; + err += (7*Y + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4; + c->dither_error[0][i+1] = Y; + acc = 2*acc + (err >= 128); + err -= 220*(acc&1); + + if ((i & 7) == 6) + output_pixel(*dest++, acc); + } + c->dither_error[0][i] = err; + } else { for (i = 0; i < dstW; i += 8) { int Y, acc = 0; @@ -379,6 +431,7 @@ yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2], output_pixel(*dest++, acc); } + } } static av_always_inline void @@ -390,20 +443,43 @@ yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0, const uint8_t * const d128 = ff_dither_8x8_220[y & 7]; int i; + if (c->dither == SWS_DITHER_ED) { + int err = 0; + int acc = 0; + for (i = 0; i < dstW; i +=2) { + int Y; + + Y = ((buf0[i + 0] + 64) >> 7); + Y += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4; + c->dither_error[0][i] = err; + acc = 2*acc + (Y >= 128); + Y -= 220*(acc&1); + + err = ((buf0[i + 1] + 64) >> 7); + err += (7*Y + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4; + c->dither_error[0][i+1] = Y; + acc = 2*acc + (err >= 128); + err -= 220*(acc&1); + + if ((i & 7) == 6) + output_pixel(*dest++, acc); + } + c->dither_error[0][i] = err; + } else { for (i = 0; i < dstW; i += 8) { int acc = 0; - - accumulate_bit(acc, (buf0[i + 0] >> 7) + d128[0]); - accumulate_bit(acc, (buf0[i + 1] >> 7) + d128[1]); - accumulate_bit(acc, (buf0[i + 2] >> 7) + d128[2]); - accumulate_bit(acc, (buf0[i + 3] >> 7) + d128[3]); - accumulate_bit(acc, (buf0[i + 4] >> 7) + d128[4]); - accumulate_bit(acc, (buf0[i + 5] >> 7) + d128[5]); - accumulate_bit(acc, (buf0[i + 6] >> 7) + d128[6]); - accumulate_bit(acc, (buf0[i + 7] >> 7) + d128[7]); + accumulate_bit(acc, ((buf0[i + 0] + 64) >> 7) + d128[0]); + accumulate_bit(acc, ((buf0[i + 1] + 64) >> 7) + d128[1]); + accumulate_bit(acc, ((buf0[i + 2] + 64) >> 7) + d128[2]); + accumulate_bit(acc, ((buf0[i + 3] + 64) >> 7) + d128[3]); + accumulate_bit(acc, ((buf0[i + 4] + 64) >> 7) + d128[4]); + accumulate_bit(acc, ((buf0[i + 5] + 64) >> 7) + d128[5]); + accumulate_bit(acc, ((buf0[i + 6] + 64) >> 7) + d128[6]); + accumulate_bit(acc, ((buf0[i + 7] + 64) >> 7) + d128[7]); output_pixel(*dest++, acc); } + } } #undef output_pixel @@ -521,10 +597,12 @@ yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2], int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19; int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19; - Y1 = av_clip_uint8(Y1); - Y2 = av_clip_uint8(Y2); - U = av_clip_uint8(U); - V = av_clip_uint8(V); + if ((Y1 | Y2 | U | V) & 0x100) { + Y1 = av_clip_uint8(Y1); + Y2 = av_clip_uint8(Y2); + U = av_clip_uint8(U); + V = av_clip_uint8(V); + } output_pixels(i * 4, Y1, U, Y2, V); } @@ -541,10 +619,17 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0, if (uvalpha < 2048) { for (i = 0; i < ((dstW + 1) >> 1); i++) { - int Y1 = buf0[i * 2] >> 7; - int Y2 = buf0[i * 2 + 1] >> 7; - int U = ubuf0[i] >> 7; - int V = vbuf0[i] >> 7; + int Y1 = (buf0[i * 2 ]+64) >> 7; + int Y2 = (buf0[i * 2 + 1]+64) >> 7; + int U = (ubuf0[i] +64) >> 7; + int V = (vbuf0[i] +64) >> 7; + + if ((Y1 | Y2 | U | V) & 0x100) { + Y1 = av_clip_uint8(Y1); + Y2 = av_clip_uint8(Y2); + U = av_clip_uint8(U); + V = av_clip_uint8(V); + } Y1 = av_clip_uint8(Y1); Y2 = av_clip_uint8(Y2); @@ -556,10 +641,17 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0, } else { const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; for (i = 0; i < ((dstW + 1) >> 1); i++) { - int Y1 = buf0[i * 2] >> 7; - int Y2 = buf0[i * 2 + 1] >> 7; - int U = (ubuf0[i] + ubuf1[i]) >> 8; - int V = (vbuf0[i] + vbuf1[i]) >> 8; + int Y1 = (buf0[i * 2 ] + 64) >> 7; + int Y2 = (buf0[i * 2 + 1] + 64) >> 7; + int U = (ubuf0[i] + ubuf1[i]+128) >> 8; + int V = (vbuf0[i] + vbuf1[i]+128) >> 8; + + if ((Y1 | Y2 | U | V) & 0x100) { + Y1 = av_clip_uint8(Y1); + Y2 = av_clip_uint8(Y2); + U = av_clip_uint8(U); + V = av_clip_uint8(V); + } Y1 = av_clip_uint8(Y1); Y2 = av_clip_uint8(Y2); @@ -577,8 +669,8 @@ YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422) YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422) YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422) -#define R_B ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE) ? R : B) -#define B_R ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE) ? B : R) +#define R_B ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? R : B) +#define B_R ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? B : R) #define output_pixel(pos, val) \ if (isBE(target)) { \ AV_WB16(pos, val); \ @@ -587,12 +679,231 @@ YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422) } static av_always_inline void +yuv2rgba64_X_c_template(SwsContext *c, const int16_t *lumFilter, + const int32_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int32_t **chrUSrc, + const int32_t **chrVSrc, int chrFilterSize, + const int32_t **alpSrc, uint16_t *dest, int dstW, + int y, enum AVPixelFormat target, int hasAlpha) +{ + int i; + int A1 = 0xffff<<14, A2 = 0xffff<<14; + + for (i = 0; i < ((dstW + 1) >> 1); i++) { + int j; + int Y1 = -0x40000000; + int Y2 = -0x40000000; + int U = -128 << 23; // 19 + int V = -128 << 23; + int R, G, B; + + for (j = 0; j < lumFilterSize; j++) { + Y1 += lumSrc[j][i * 2] * (unsigned)lumFilter[j]; + Y2 += lumSrc[j][i * 2 + 1] * (unsigned)lumFilter[j]; + } + for (j = 0; j < chrFilterSize; j++) {; + U += chrUSrc[j][i] * (unsigned)chrFilter[j]; + V += chrVSrc[j][i] * (unsigned)chrFilter[j]; + } + + if (hasAlpha) { + A1 = -0x40000000; + A2 = -0x40000000; + for (j = 0; j < lumFilterSize; j++) { + A1 += alpSrc[j][i * 2] * (unsigned)lumFilter[j]; + A2 += alpSrc[j][i * 2 + 1] * (unsigned)lumFilter[j]; + } + A1 >>= 1; + A1 += 0x20002000; + A2 >>= 1; + A2 += 0x20002000; + } + + // 8bit: 12+15=27; 16-bit: 12+19=31 + Y1 >>= 14; // 10 + Y1 += 0x10000; + Y2 >>= 14; + Y2 += 0x10000; + U >>= 14; + V >>= 14; + + // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit + Y1 -= c->yuv2rgb_y_offset; + Y2 -= c->yuv2rgb_y_offset; + Y1 *= c->yuv2rgb_y_coeff; + Y2 *= c->yuv2rgb_y_coeff; + Y1 += 1 << 13; // 21 + Y2 += 1 << 13; + // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit + + R = V * c->yuv2rgb_v2r_coeff; + G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff; + B = U * c->yuv2rgb_u2b_coeff; + + // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit + output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14); + output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14); + output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14); + output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14); + output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14); + output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14); + output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14); + output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14); + dest += 8; + } +} + +static av_always_inline void +yuv2rgba64_2_c_template(SwsContext *c, const int32_t *buf[2], + const int32_t *ubuf[2], const int32_t *vbuf[2], + const int32_t *abuf[2], uint16_t *dest, int dstW, + int yalpha, int uvalpha, int y, + enum AVPixelFormat target, int hasAlpha) +{ + const int32_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1], + *abuf0 = hasAlpha ? abuf[0] : NULL, + *abuf1 = hasAlpha ? abuf[1] : NULL; + int yalpha1 = 4096 - yalpha; + int uvalpha1 = 4096 - uvalpha; + int i; + int A1 = 0xffff<<14, A2 = 0xffff<<14; + + for (i = 0; i < ((dstW + 1) >> 1); i++) { + int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 14; + int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 14; + int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha + (-128 << 23)) >> 14; + int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha + (-128 << 23)) >> 14; + int R, G, B; + + Y1 -= c->yuv2rgb_y_offset; + Y2 -= c->yuv2rgb_y_offset; + Y1 *= c->yuv2rgb_y_coeff; + Y2 *= c->yuv2rgb_y_coeff; + Y1 += 1 << 13; + Y2 += 1 << 13; + + R = V * c->yuv2rgb_v2r_coeff; + G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff; + B = U * c->yuv2rgb_u2b_coeff; + + if (hasAlpha) { + A1 = (abuf0[i * 2 ] * yalpha1 + abuf1[i * 2 ] * yalpha) >> 1; + A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 1; + + A1 += 1 << 13; + A2 += 1 << 13; + } + + output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14); + output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14); + output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14); + output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14); + output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14); + output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14); + output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14); + output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14); + dest += 8; + } +} + +static av_always_inline void +yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0, + const int32_t *ubuf[2], const int32_t *vbuf[2], + const int32_t *abuf0, uint16_t *dest, int dstW, + int uvalpha, int y, enum AVPixelFormat target, int hasAlpha) +{ + const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; + int i; + int A1 = 0xffff<<14, A2= 0xffff<<14; + + if (uvalpha < 2048) { + for (i = 0; i < ((dstW + 1) >> 1); i++) { + int Y1 = (buf0[i * 2] ) >> 2; + int Y2 = (buf0[i * 2 + 1]) >> 2; + int U = (ubuf0[i] + (-128 << 11)) >> 2; + int V = (vbuf0[i] + (-128 << 11)) >> 2; + int R, G, B; + + Y1 -= c->yuv2rgb_y_offset; + Y2 -= c->yuv2rgb_y_offset; + Y1 *= c->yuv2rgb_y_coeff; + Y2 *= c->yuv2rgb_y_coeff; + Y1 += 1 << 13; + Y2 += 1 << 13; + + if (hasAlpha) { + A1 = abuf0[i * 2 ] << 11; + A2 = abuf0[i * 2 + 1] << 11; + + A1 += 1 << 13; + A2 += 1 << 13; + } + + R = V * c->yuv2rgb_v2r_coeff; + G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff; + B = U * c->yuv2rgb_u2b_coeff; + + output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14); + output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14); + output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14); + output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14); + output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14); + output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14); + output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14); + output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14); + dest += 8; + } + } else { + const int32_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; + int A1 = 0xffff<<14, A2 = 0xffff<<14; + for (i = 0; i < ((dstW + 1) >> 1); i++) { + int Y1 = (buf0[i * 2] ) >> 2; + int Y2 = (buf0[i * 2 + 1]) >> 2; + int U = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3; + int V = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3; + int R, G, B; + + Y1 -= c->yuv2rgb_y_offset; + Y2 -= c->yuv2rgb_y_offset; + Y1 *= c->yuv2rgb_y_coeff; + Y2 *= c->yuv2rgb_y_coeff; + Y1 += 1 << 13; + Y2 += 1 << 13; + + if (hasAlpha) { + A1 = abuf0[i * 2 ] << 11; + A2 = abuf0[i * 2 + 1] << 11; + + A1 += 1 << 13; + A2 += 1 << 13; + } + + R = V * c->yuv2rgb_v2r_coeff; + G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff; + B = U * c->yuv2rgb_u2b_coeff; + + output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14); + output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14); + output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14); + output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14); + output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14); + output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14); + output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14); + output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14); + dest += 8; + } + } +} + +static av_always_inline void yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter, const int32_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int32_t **chrUSrc, const int32_t **chrVSrc, int chrFilterSize, const int32_t **alpSrc, uint16_t *dest, int dstW, - int y, enum AVPixelFormat target) + int y, enum AVPixelFormat target, int hasAlpha) { int i; @@ -605,12 +916,12 @@ yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter, int R, G, B; for (j = 0; j < lumFilterSize; j++) { - Y1 += lumSrc[j][i * 2] * lumFilter[j]; - Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j]; + Y1 += lumSrc[j][i * 2] * (unsigned)lumFilter[j]; + Y2 += lumSrc[j][i * 2 + 1] * (unsigned)lumFilter[j]; } - for (j = 0; j < chrFilterSize; j++) { - U += chrUSrc[j][i] * chrFilter[j]; - V += chrVSrc[j][i] * chrFilter[j]; + for (j = 0; j < chrFilterSize; j++) {; + U += chrUSrc[j][i] * (unsigned)chrFilter[j]; + V += chrVSrc[j][i] * (unsigned)chrFilter[j]; } // 8bit: 12+15=27; 16-bit: 12+19=31 @@ -650,7 +961,7 @@ yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2], const int32_t *ubuf[2], const int32_t *vbuf[2], const int32_t *abuf[2], uint16_t *dest, int dstW, int yalpha, int uvalpha, int y, - enum AVPixelFormat target) + enum AVPixelFormat target, int hasAlpha) { const int32_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], @@ -691,7 +1002,7 @@ static av_always_inline void yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0, const int32_t *ubuf[2], const int32_t *vbuf[2], const int32_t *abuf0, uint16_t *dest, int dstW, - int uvalpha, int y, enum AVPixelFormat target) + int uvalpha, int y, enum AVPixelFormat target, int hasAlpha) { const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; int i; @@ -758,7 +1069,7 @@ yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0, #undef r_b #undef b_r -#define YUV2PACKED16WRAPPER(name, base, ext, fmt) \ +#define YUV2PACKED16WRAPPER(name, base, ext, fmt, hasAlpha) \ static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \ const int16_t **_lumSrc, int lumFilterSize, \ const int16_t *chrFilter, const int16_t **_chrUSrc, \ @@ -773,7 +1084,7 @@ static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \ uint16_t *dest = (uint16_t *) _dest; \ name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \ - alpSrc, dest, dstW, y, fmt); \ + alpSrc, dest, dstW, y, fmt, hasAlpha); \ } \ \ static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \ @@ -787,7 +1098,7 @@ static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \ **abuf = (const int32_t **) _abuf; \ uint16_t *dest = (uint16_t *) _dest; \ name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \ - dest, dstW, yalpha, uvalpha, y, fmt); \ + dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \ } \ \ static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \ @@ -801,13 +1112,21 @@ static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \ *abuf0 = (const int32_t *) _abuf0; \ uint16_t *dest = (uint16_t *) _dest; \ name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \ - dstW, uvalpha, y, fmt); \ + dstW, uvalpha, y, fmt, hasAlpha); \ } -YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, AV_PIX_FMT_RGB48BE) -YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, AV_PIX_FMT_RGB48LE) -YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, AV_PIX_FMT_BGR48BE) -YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, AV_PIX_FMT_BGR48LE) +YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, AV_PIX_FMT_RGB48BE, 0) +YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, AV_PIX_FMT_RGB48LE, 0) +YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, AV_PIX_FMT_BGR48BE, 0) +YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, AV_PIX_FMT_BGR48LE, 0) +YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64BE, 1) +YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64LE, 1) +YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64BE, 0) +YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64LE, 0) +YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64BE, 1) +YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64LE, 1) +YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64BE, 0) +YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64LE, 0) /* * Write out 2 RGB pixels in the target pixel format. This function takes a @@ -818,7 +1137,7 @@ YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, AV_PIX_FMT_BGR48LE) * correct RGB values into the destination buffer. */ static av_always_inline void -yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2, +yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2, unsigned A1, unsigned A2, const void *_r, const void *_g, const void *_b, int y, enum AVPixelFormat target, int hasAlpha) @@ -839,9 +1158,15 @@ yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2, if (hasAlpha) { int sh = (target == AV_PIX_FMT_RGB32_1 || target == AV_PIX_FMT_BGR32_1) ? 0 : 24; + av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0); dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh); dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh); } else { +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1 + int sh = (target == AV_PIX_FMT_RGB32_1 || target == AV_PIX_FMT_BGR32_1) ? 0 : 24; + + av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF); +#endif dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1]; dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2]; } @@ -854,6 +1179,7 @@ yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2, #define r_b ((target == AV_PIX_FMT_RGB24) ? r : b) #define b_r ((target == AV_PIX_FMT_RGB24) ? b : r) + dest[i * 6 + 0] = r_b[Y1]; dest[i * 6 + 1] = g[Y1]; dest[i * 6 + 2] = b_r[Y1]; @@ -872,19 +1198,19 @@ yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2, int dr1, dg1, db1, dr2, dg2, db2; if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) { - dr1 = dither_2x2_8[ y & 1 ][0]; - dg1 = dither_2x2_4[ y & 1 ][0]; - db1 = dither_2x2_8[(y & 1) ^ 1][0]; - dr2 = dither_2x2_8[ y & 1 ][1]; - dg2 = dither_2x2_4[ y & 1 ][1]; - db2 = dither_2x2_8[(y & 1) ^ 1][1]; + dr1 = ff_dither_2x2_8[ y & 1 ][0]; + dg1 = ff_dither_2x2_4[ y & 1 ][0]; + db1 = ff_dither_2x2_8[(y & 1) ^ 1][0]; + dr2 = ff_dither_2x2_8[ y & 1 ][1]; + dg2 = ff_dither_2x2_4[ y & 1 ][1]; + db2 = ff_dither_2x2_8[(y & 1) ^ 1][1]; } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) { - dr1 = dither_2x2_8[ y & 1 ][0]; - dg1 = dither_2x2_8[ y & 1 ][1]; - db1 = dither_2x2_8[(y & 1) ^ 1][0]; - dr2 = dither_2x2_8[ y & 1 ][1]; - dg2 = dither_2x2_8[ y & 1 ][0]; - db2 = dither_2x2_8[(y & 1) ^ 1][1]; + dr1 = ff_dither_2x2_8[ y & 1 ][0]; + dg1 = ff_dither_2x2_8[ y & 1 ][1]; + db1 = ff_dither_2x2_8[(y & 1) ^ 1][0]; + dr2 = ff_dither_2x2_8[ y & 1 ][1]; + dg2 = ff_dither_2x2_8[ y & 1 ][0]; + db2 = ff_dither_2x2_8[(y & 1) ^ 1][1]; } else { dr1 = ff_dither_4x4_16[ y & 3 ][0]; dg1 = ff_dither_4x4_16[ y & 3 ][1]; @@ -959,12 +1285,6 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter, Y2 >>= 19; U >>= 19; V >>= 19; - if ((Y1 | Y2 | U | V) & 0x100) { - Y1 = av_clip_uint8(Y1); - Y2 = av_clip_uint8(Y2); - U = av_clip_uint8(U); - V = av_clip_uint8(V); - } if (hasAlpha) { A1 = 1 << 18; A2 = 1 << 18; @@ -980,10 +1300,9 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter, } } - /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/ - r = c->table_rV[V]; - g = (c->table_gU[U] + c->table_gV[V]); - b = c->table_bU[U]; + r = c->table_rV[V + YUVRGB_TABLE_HEADROOM]; + g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]); + b = c->table_bU[U + YUVRGB_TABLE_HEADROOM]; yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0, r, g, b, y, target, hasAlpha); @@ -1012,16 +1331,9 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2], int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19; int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19; int A1, A2; - const void *r, *g, *b; - - Y1 = av_clip_uint8(Y1); - Y2 = av_clip_uint8(Y2); - U = av_clip_uint8(U); - V = av_clip_uint8(V); - - r = c->table_rV[V]; - g = (c->table_gU[U] + c->table_gV[V]); - b = c->table_bU[U]; + const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM], + *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]), + *b = c->table_bU[U + YUVRGB_TABLE_HEADROOM]; if (hasAlpha) { A1 = (abuf0[i * 2 ] * yalpha1 + abuf1[i * 2 ] * yalpha) >> 19; @@ -1047,25 +1359,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0, if (uvalpha < 2048) { for (i = 0; i < ((dstW + 1) >> 1); i++) { - int Y1 = buf0[i * 2] >> 7; - int Y2 = buf0[i * 2 + 1] >> 7; - int U = ubuf0[i] >> 7; - int V = vbuf0[i] >> 7; + int Y1 = (buf0[i * 2 ] + 64) >> 7; + int Y2 = (buf0[i * 2 + 1] + 64) >> 7; + int U = (ubuf0[i] + 64) >> 7; + int V = (vbuf0[i] + 64) >> 7; int A1, A2; - const void *r, *g, *b; - - Y1 = av_clip_uint8(Y1); - Y2 = av_clip_uint8(Y2); - U = av_clip_uint8(U); - V = av_clip_uint8(V); - - r = c->table_rV[V]; - g = (c->table_gU[U] + c->table_gV[V]); - b = c->table_bU[U]; + const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM], + *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]), + *b = c->table_bU[U + YUVRGB_TABLE_HEADROOM]; if (hasAlpha) { - A1 = abuf0[i * 2 ] >> 7; - A2 = abuf0[i * 2 + 1] >> 7; + A1 = abuf0[i * 2 ] * 255 + 16384 >> 15; + A2 = abuf0[i * 2 + 1] * 255 + 16384 >> 15; A1 = av_clip_uint8(A1); A2 = av_clip_uint8(A2); } @@ -1076,25 +1381,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0, } else { const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; for (i = 0; i < ((dstW + 1) >> 1); i++) { - int Y1 = buf0[i * 2] >> 7; - int Y2 = buf0[i * 2 + 1] >> 7; - int U = (ubuf0[i] + ubuf1[i]) >> 8; - int V = (vbuf0[i] + vbuf1[i]) >> 8; + int Y1 = (buf0[i * 2 ] + 64) >> 7; + int Y2 = (buf0[i * 2 + 1] + 64) >> 7; + int U = (ubuf0[i] + ubuf1[i] + 128) >> 8; + int V = (vbuf0[i] + vbuf1[i] + 128) >> 8; int A1, A2; - const void *r, *g, *b; - - Y1 = av_clip_uint8(Y1); - Y2 = av_clip_uint8(Y2); - U = av_clip_uint8(U); - V = av_clip_uint8(V); - - r = c->table_rV[V]; - g = (c->table_gU[U] + c->table_gV[V]); - b = c->table_bU[U]; + const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM], + *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]), + *b = c->table_bU[U + YUVRGB_TABLE_HEADROOM]; if (hasAlpha) { - A1 = abuf0[i * 2 ] >> 7; - A2 = abuf0[i * 2 + 1] >> 7; + A1 = (abuf0[i * 2 ] + 64) >> 7; + A2 = (abuf0[i * 2 + 1] + 64) >> 7; A1 = av_clip_uint8(A1); A2 = av_clip_uint8(A2); } @@ -1117,7 +1415,8 @@ static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \ alpSrc, dest, dstW, y, fmt, hasAlpha); \ } -#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \ + +#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \ YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \ static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \ const int16_t *ubuf[2], const int16_t *vbuf[2], \ @@ -1126,8 +1425,10 @@ static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \ { \ name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \ dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \ -} \ - \ +} + +#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \ +YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \ static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \ const int16_t *ubuf[2], const int16_t *vbuf[2], \ const int16_t *abuf0, uint8_t *dest, int dstW, \ @@ -1157,6 +1458,145 @@ YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0) YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0) YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0) +static av_always_inline void yuv2rgb_write_full(SwsContext *c, + uint8_t *dest, int i, int Y, int A, int U, int V, + int y, enum AVPixelFormat target, int hasAlpha, int err[4]) +{ + int R, G, B; + int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8; + + Y -= c->yuv2rgb_y_offset; + Y *= c->yuv2rgb_y_coeff; + Y += 1 << 21; + R = Y + V*c->yuv2rgb_v2r_coeff; + G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff; + B = Y + U*c->yuv2rgb_u2b_coeff; + if ((R | G | B) & 0xC0000000) { + R = av_clip_uintp2(R, 30); + G = av_clip_uintp2(G, 30); + B = av_clip_uintp2(B, 30); + } + + switch(target) { + case AV_PIX_FMT_ARGB: + dest[0] = hasAlpha ? A : 255; + dest[1] = R >> 22; + dest[2] = G >> 22; + dest[3] = B >> 22; + break; + case AV_PIX_FMT_RGB24: + dest[0] = R >> 22; + dest[1] = G >> 22; + dest[2] = B >> 22; + break; + case AV_PIX_FMT_RGBA: + dest[0] = R >> 22; + dest[1] = G >> 22; + dest[2] = B >> 22; + dest[3] = hasAlpha ? A : 255; + break; + case AV_PIX_FMT_ABGR: + dest[0] = hasAlpha ? A : 255; + dest[1] = B >> 22; + dest[2] = G >> 22; + dest[3] = R >> 22; + break; + case AV_PIX_FMT_BGR24: + dest[0] = B >> 22; + dest[1] = G >> 22; + dest[2] = R >> 22; + break; + case AV_PIX_FMT_BGRA: + dest[0] = B >> 22; + dest[1] = G >> 22; + dest[2] = R >> 22; + dest[3] = hasAlpha ? A : 255; + break; + case AV_PIX_FMT_BGR4_BYTE: + case AV_PIX_FMT_RGB4_BYTE: + case AV_PIX_FMT_BGR8: + case AV_PIX_FMT_RGB8: + { + int r,g,b; + + switch (c->dither) { + default: + case SWS_DITHER_AUTO: + case SWS_DITHER_ED: + R >>= 22; + G >>= 22; + B >>= 22; + R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4; + G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4; + B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4; + c->dither_error[0][i] = err[0]; + c->dither_error[1][i] = err[1]; + c->dither_error[2][i] = err[2]; + r = R >> (isrgb8 ? 5 : 7); + g = G >> (isrgb8 ? 5 : 6); + b = B >> (isrgb8 ? 6 : 7); + r = av_clip(r, 0, isrgb8 ? 7 : 1); + g = av_clip(g, 0, isrgb8 ? 7 : 3); + b = av_clip(b, 0, isrgb8 ? 3 : 1); + err[0] = R - r*(isrgb8 ? 36 : 255); + err[1] = G - g*(isrgb8 ? 36 : 85); + err[2] = B - b*(isrgb8 ? 85 : 255); + break; + case SWS_DITHER_A_DITHER: + if (isrgb8) { + /* see http://pippin.gimp.org/a_dither/ for details/origin */ +#define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff)) + r = (((R >> 19) + A_DITHER(i,y) -96)>>8); + g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8); + b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8); + r = av_clip(r, 0, 7); + g = av_clip(g, 0, 7); + b = av_clip(b, 0, 3); + } else { + r = (((R >> 21) + A_DITHER(i,y)-256)>>8); + g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8); + b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8); + r = av_clip(r, 0, 1); + g = av_clip(g, 0, 3); + b = av_clip(b, 0, 1); + } + break; + case SWS_DITHER_X_DITHER: + if (isrgb8) { + /* see http://pippin.gimp.org/a_dither/ for details/origin */ +#define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2) + r = (((R >> 19) + X_DITHER(i,y) - 96)>>8); + g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8); + b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8); + r = av_clip(r, 0, 7); + g = av_clip(g, 0, 7); + b = av_clip(b, 0, 3); + } else { + r = (((R >> 21) + X_DITHER(i,y)-256)>>8); + g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8); + b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8); + r = av_clip(r, 0, 1); + g = av_clip(g, 0, 3); + b = av_clip(b, 0, 1); + } + + break; + } + + if(target == AV_PIX_FMT_BGR4_BYTE) { + dest[0] = r + 2*g + 8*b; + } else if(target == AV_PIX_FMT_RGB4_BYTE) { + dest[0] = b + 2*g + 8*r; + } else if(target == AV_PIX_FMT_BGR8) { + dest[0] = r + 8*g + 64*b; + } else if(target == AV_PIX_FMT_RGB8) { + dest[0] = b + 4*g + 32*r; + } else + av_assert2(0); + break;} + } +} + static av_always_inline void yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, @@ -1167,13 +1607,18 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter, { int i; int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4; + int err[4] = {0}; + int A = 0; //init to silence warning + + if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE + || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8) + step = 1; for (i = 0; i < dstW; i++) { int j; - int Y = 0; - int U = -128 << 19; - int V = -128 << 19; - int R, G, B, A; + int Y = 1<<9; + int U = (1<<9)-(128 << 19); + int V = (1<<9)-(128 << 19); for (j = 0; j < lumFilterSize; j++) { Y += lumSrc[j][i] * lumFilter[j]; @@ -1186,7 +1631,7 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter, U >>= 10; V >>= 10; if (hasAlpha) { - A = 1 << 21; + A = 1 << 18; for (j = 0; j < lumFilterSize; j++) { A += alpSrc[j][i] * lumFilter[j]; } @@ -1194,78 +1639,136 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter, if (A & 0x100) A = av_clip_uint8(A); } - Y -= c->yuv2rgb_y_offset; - Y *= c->yuv2rgb_y_coeff; - Y += 1 << 21; - R = Y + V*c->yuv2rgb_v2r_coeff; - G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff; - B = Y + U*c->yuv2rgb_u2b_coeff; - if ((R | G | B) & 0xC0000000) { - R = av_clip_uintp2(R, 30); - G = av_clip_uintp2(G, 30); - B = av_clip_uintp2(B, 30); - } + yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err); + dest += step; + } + c->dither_error[0][i] = err[0]; + c->dither_error[1][i] = err[1]; + c->dither_error[2][i] = err[2]; +} - switch(target) { - case AV_PIX_FMT_ARGB: - dest[0] = hasAlpha ? A : 255; - dest[1] = R >> 22; - dest[2] = G >> 22; - dest[3] = B >> 22; - break; - case AV_PIX_FMT_RGB24: - dest[0] = R >> 22; - dest[1] = G >> 22; - dest[2] = B >> 22; - break; - case AV_PIX_FMT_RGBA: - dest[0] = R >> 22; - dest[1] = G >> 22; - dest[2] = B >> 22; - dest[3] = hasAlpha ? A : 255; - break; - case AV_PIX_FMT_ABGR: - dest[0] = hasAlpha ? A : 255; - dest[1] = B >> 22; - dest[2] = G >> 22; - dest[3] = R >> 22; - dest += 4; - break; - case AV_PIX_FMT_BGR24: - dest[0] = B >> 22; - dest[1] = G >> 22; - dest[2] = R >> 22; - break; - case AV_PIX_FMT_BGRA: - dest[0] = B >> 22; - dest[1] = G >> 22; - dest[2] = R >> 22; - dest[3] = hasAlpha ? A : 255; - break; +static av_always_inline void +yuv2rgb_full_2_c_template(SwsContext *c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, int dstW, + int yalpha, int uvalpha, int y, + enum AVPixelFormat target, int hasAlpha) +{ + const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1], + *abuf0 = hasAlpha ? abuf[0] : NULL, + *abuf1 = hasAlpha ? abuf[1] : NULL; + int yalpha1 = 4096 - yalpha; + int uvalpha1 = 4096 - uvalpha; + int i; + int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4; + int err[4] = {0}; + int A = 0; // init to silcene warning + + if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE + || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8) + step = 1; + + for (i = 0; i < dstW; i++) { + int Y = ( buf0[i] * yalpha1 + buf1[i] * yalpha ) >> 10; //FIXME rounding + int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha-(128 << 19)) >> 10; + int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha-(128 << 19)) >> 10; + + if (hasAlpha) { + A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + (1<<18)) >> 19; + if (A & 0x100) + A = av_clip_uint8(A); } + + yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err); dest += step; } + c->dither_error[0][i] = err[0]; + c->dither_error[1][i] = err[1]; + c->dither_error[2][i] = err[2]; +} + +static av_always_inline void +yuv2rgb_full_1_c_template(SwsContext *c, const int16_t *buf0, + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf0, uint8_t *dest, int dstW, + int uvalpha, int y, enum AVPixelFormat target, + int hasAlpha) +{ + const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; + int i; + int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4; + int err[4] = {0}; + + if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE + || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8) + step = 1; + + if (uvalpha < 2048) { + int A = 0; //init to silence warning + for (i = 0; i < dstW; i++) { + int Y = buf0[i] << 2; + int U = (ubuf0[i] - (128<<7)) << 2; + int V = (vbuf0[i] - (128<<7)) << 2; + + if (hasAlpha) { + A = (abuf0[i] + 64) >> 7; + if (A & 0x100) + A = av_clip_uint8(A); + } + + yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err); + dest += step; + } + } else { + const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; + int A = 0; //init to silence warning + for (i = 0; i < dstW; i++) { + int Y = buf0[i] << 2; + int U = (ubuf0[i] + ubuf1[i] - (128<<8)) << 1; + int V = (vbuf0[i] + vbuf1[i] - (128<<8)) << 1; + + if (hasAlpha) { + A = (abuf0[i] + 64) >> 7; + if (A & 0x100) + A = av_clip_uint8(A); + } + + yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err); + dest += step; + } + } + + c->dither_error[0][i] = err[0]; + c->dither_error[1][i] = err[1]; + c->dither_error[2][i] = err[2]; } #if CONFIG_SMALL -YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) -YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) -YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) -YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) +YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) +YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) +YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) +YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, CONFIG_SWSCALE_ALPHA && c->alpPixBuf) #else #if CONFIG_SWSCALE_ALPHA -YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1) -YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1) -YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1) -YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1) +YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1) +YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1) +YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1) +YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1) #endif -YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0) -YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0) -YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0) -YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0) #endif -YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0) -YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0) + +YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0) +YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0) static void yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter, @@ -1277,16 +1780,17 @@ yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter, { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat); int i; - int hasAlpha = 0; + int hasAlpha = (desc->flags & AV_PIX_FMT_FLAG_ALPHA) && alpSrc; uint16_t **dest16 = (uint16_t**)dest; int SH = 22 + 7 - desc->comp[0].depth_minus1; + int A = 0; // init to silence warning for (i = 0; i < dstW; i++) { int j; int Y = 1 << 9; int U = (1 << 9) - (128 << 19); int V = (1 << 9) - (128 << 19); - int R, G, B, A; + int R, G, B; for (j = 0; j < lumFilterSize; j++) Y += lumSrc[j][i] * lumFilter[j]; @@ -1329,10 +1833,14 @@ yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter, dest16[0][i] = G >> SH; dest16[1][i] = B >> SH; dest16[2][i] = R >> SH; + if (hasAlpha) + dest16[3][i] = A; } else { dest[0][i] = G >> 22; dest[1][i] = B >> 22; dest[2][i] = R >> 22; + if (hasAlpha) + dest[3][i] = A; } } if (SH != 22 && (!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) { @@ -1340,6 +1848,8 @@ yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter, dest16[0][i] = av_bswap16(dest16[0][i]); dest16[1][i] = av_bswap16(dest16[1][i]); dest16[2][i] = av_bswap16(dest16[2][i]); + if (hasAlpha) + dest16[3][i] = av_bswap16(dest16[3][i]); } } } @@ -1363,10 +1873,17 @@ av_cold void ff_sws_init_output_funcs(SwsContext *c, if (desc->comp[0].depth_minus1 == 8) { *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c : yuv2planeX_9LE_c; *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c : yuv2plane1_9LE_c; - } else { + } else if (desc->comp[0].depth_minus1 == 9) { *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c : yuv2planeX_10LE_c; *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c : yuv2plane1_10LE_c; - } + } else if (desc->comp[0].depth_minus1 == 11) { + *yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_c : yuv2planeX_12LE_c; + *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_c : yuv2plane1_12LE_c; + } else if (desc->comp[0].depth_minus1 == 13) { + *yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_c : yuv2planeX_14LE_c; + *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_c : yuv2plane1_14LE_c; + } else + av_assert0(0); } else { *yuv2plane1 = yuv2plane1_8_c; *yuv2planeX = yuv2planeX_8_c; @@ -1379,77 +1896,189 @@ av_cold void ff_sws_init_output_funcs(SwsContext *c, case AV_PIX_FMT_RGBA: #if CONFIG_SMALL *yuv2packedX = yuv2rgba32_full_X_c; + *yuv2packed2 = yuv2rgba32_full_2_c; + *yuv2packed1 = yuv2rgba32_full_1_c; #else #if CONFIG_SWSCALE_ALPHA if (c->alpPixBuf) { *yuv2packedX = yuv2rgba32_full_X_c; + *yuv2packed2 = yuv2rgba32_full_2_c; + *yuv2packed1 = yuv2rgba32_full_1_c; } else #endif /* CONFIG_SWSCALE_ALPHA */ { *yuv2packedX = yuv2rgbx32_full_X_c; + *yuv2packed2 = yuv2rgbx32_full_2_c; + *yuv2packed1 = yuv2rgbx32_full_1_c; } #endif /* !CONFIG_SMALL */ break; case AV_PIX_FMT_ARGB: #if CONFIG_SMALL *yuv2packedX = yuv2argb32_full_X_c; + *yuv2packed2 = yuv2argb32_full_2_c; + *yuv2packed1 = yuv2argb32_full_1_c; #else #if CONFIG_SWSCALE_ALPHA if (c->alpPixBuf) { *yuv2packedX = yuv2argb32_full_X_c; + *yuv2packed2 = yuv2argb32_full_2_c; + *yuv2packed1 = yuv2argb32_full_1_c; } else #endif /* CONFIG_SWSCALE_ALPHA */ { *yuv2packedX = yuv2xrgb32_full_X_c; + *yuv2packed2 = yuv2xrgb32_full_2_c; + *yuv2packed1 = yuv2xrgb32_full_1_c; } #endif /* !CONFIG_SMALL */ break; case AV_PIX_FMT_BGRA: #if CONFIG_SMALL *yuv2packedX = yuv2bgra32_full_X_c; + *yuv2packed2 = yuv2bgra32_full_2_c; + *yuv2packed1 = yuv2bgra32_full_1_c; #else #if CONFIG_SWSCALE_ALPHA if (c->alpPixBuf) { *yuv2packedX = yuv2bgra32_full_X_c; + *yuv2packed2 = yuv2bgra32_full_2_c; + *yuv2packed1 = yuv2bgra32_full_1_c; } else #endif /* CONFIG_SWSCALE_ALPHA */ { *yuv2packedX = yuv2bgrx32_full_X_c; + *yuv2packed2 = yuv2bgrx32_full_2_c; + *yuv2packed1 = yuv2bgrx32_full_1_c; } #endif /* !CONFIG_SMALL */ break; case AV_PIX_FMT_ABGR: #if CONFIG_SMALL *yuv2packedX = yuv2abgr32_full_X_c; + *yuv2packed2 = yuv2abgr32_full_2_c; + *yuv2packed1 = yuv2abgr32_full_1_c; #else #if CONFIG_SWSCALE_ALPHA if (c->alpPixBuf) { *yuv2packedX = yuv2abgr32_full_X_c; + *yuv2packed2 = yuv2abgr32_full_2_c; + *yuv2packed1 = yuv2abgr32_full_1_c; } else #endif /* CONFIG_SWSCALE_ALPHA */ { *yuv2packedX = yuv2xbgr32_full_X_c; + *yuv2packed2 = yuv2xbgr32_full_2_c; + *yuv2packed1 = yuv2xbgr32_full_1_c; } #endif /* !CONFIG_SMALL */ break; case AV_PIX_FMT_RGB24: *yuv2packedX = yuv2rgb24_full_X_c; + *yuv2packed2 = yuv2rgb24_full_2_c; + *yuv2packed1 = yuv2rgb24_full_1_c; break; case AV_PIX_FMT_BGR24: *yuv2packedX = yuv2bgr24_full_X_c; + *yuv2packed2 = yuv2bgr24_full_2_c; + *yuv2packed1 = yuv2bgr24_full_1_c; + break; + case AV_PIX_FMT_BGR4_BYTE: + *yuv2packedX = yuv2bgr4_byte_full_X_c; + *yuv2packed2 = yuv2bgr4_byte_full_2_c; + *yuv2packed1 = yuv2bgr4_byte_full_1_c; + break; + case AV_PIX_FMT_RGB4_BYTE: + *yuv2packedX = yuv2rgb4_byte_full_X_c; + *yuv2packed2 = yuv2rgb4_byte_full_2_c; + *yuv2packed1 = yuv2rgb4_byte_full_1_c; + break; + case AV_PIX_FMT_BGR8: + *yuv2packedX = yuv2bgr8_full_X_c; + *yuv2packed2 = yuv2bgr8_full_2_c; + *yuv2packed1 = yuv2bgr8_full_1_c; + break; + case AV_PIX_FMT_RGB8: + *yuv2packedX = yuv2rgb8_full_X_c; + *yuv2packed2 = yuv2rgb8_full_2_c; + *yuv2packed1 = yuv2rgb8_full_1_c; break; case AV_PIX_FMT_GBRP: case AV_PIX_FMT_GBRP9BE: case AV_PIX_FMT_GBRP9LE: case AV_PIX_FMT_GBRP10BE: case AV_PIX_FMT_GBRP10LE: + case AV_PIX_FMT_GBRP12BE: + case AV_PIX_FMT_GBRP12LE: + case AV_PIX_FMT_GBRP14BE: + case AV_PIX_FMT_GBRP14LE: case AV_PIX_FMT_GBRP16BE: case AV_PIX_FMT_GBRP16LE: + case AV_PIX_FMT_GBRAP: *yuv2anyX = yuv2gbrp_full_X_c; break; } + if (!*yuv2packedX && !*yuv2anyX) + goto YUV_PACKED; } else { + YUV_PACKED: switch (dstFormat) { + case AV_PIX_FMT_RGBA64LE: +#if CONFIG_SWSCALE_ALPHA + if (c->alpPixBuf) { + *yuv2packed1 = yuv2rgba64le_1_c; + *yuv2packed2 = yuv2rgba64le_2_c; + *yuv2packedX = yuv2rgba64le_X_c; + } else +#endif /* CONFIG_SWSCALE_ALPHA */ + { + *yuv2packed1 = yuv2rgbx64le_1_c; + *yuv2packed2 = yuv2rgbx64le_2_c; + *yuv2packedX = yuv2rgbx64le_X_c; + } + break; + case AV_PIX_FMT_RGBA64BE: +#if CONFIG_SWSCALE_ALPHA + if (c->alpPixBuf) { + *yuv2packed1 = yuv2rgba64be_1_c; + *yuv2packed2 = yuv2rgba64be_2_c; + *yuv2packedX = yuv2rgba64be_X_c; + } else +#endif /* CONFIG_SWSCALE_ALPHA */ + { + *yuv2packed1 = yuv2rgbx64be_1_c; + *yuv2packed2 = yuv2rgbx64be_2_c; + *yuv2packedX = yuv2rgbx64be_X_c; + } + break; + case AV_PIX_FMT_BGRA64LE: +#if CONFIG_SWSCALE_ALPHA + if (c->alpPixBuf) { + *yuv2packed1 = yuv2bgra64le_1_c; + *yuv2packed2 = yuv2bgra64le_2_c; + *yuv2packedX = yuv2bgra64le_X_c; + } else +#endif /* CONFIG_SWSCALE_ALPHA */ + { + *yuv2packed1 = yuv2bgrx64le_1_c; + *yuv2packed2 = yuv2bgrx64le_2_c; + *yuv2packedX = yuv2bgrx64le_X_c; + } + break; + case AV_PIX_FMT_BGRA64BE: +#if CONFIG_SWSCALE_ALPHA + if (c->alpPixBuf) { + *yuv2packed1 = yuv2bgra64be_1_c; + *yuv2packed2 = yuv2bgra64be_2_c; + *yuv2packedX = yuv2bgra64be_X_c; + } else +#endif /* CONFIG_SWSCALE_ALPHA */ + { + *yuv2packed1 = yuv2bgrx64be_1_c; + *yuv2packed2 = yuv2bgrx64be_2_c; + *yuv2packedX = yuv2bgrx64be_X_c; + } + break; case AV_PIX_FMT_RGB48LE: *yuv2packed1 = yuv2rgb48le_1_c; *yuv2packed2 = yuv2rgb48le_2_c; diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index 7e00488..86f40ab 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -4,20 +4,20 @@ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> * based on the equivalent C code in swscale.c * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -295,7 +295,7 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c) if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) return; - if (c->srcBpc == 8 && c->dstBpc <= 10) { + if (c->srcBpc == 8 && c->dstBpc <= 14) { c->hyScale = c->hcScale = hScale_altivec_real; } if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c index 74b0f18..25282bf 100644 --- a/libswscale/ppc/yuv2rgb_altivec.c +++ b/libswscale/ppc/yuv2rgb_altivec.c @@ -3,20 +3,20 @@ * * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -97,6 +97,7 @@ #include "libswscale/swscale_internal.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/pixdesc.h" #include "yuv2rgb_altivec.h" #if HAVE_ALTIVEC @@ -317,12 +318,7 @@ static int altivec_ ## name(SwsContext *c, const unsigned char **in, \ const ubyte *ui = in[1]; \ const ubyte *vi = in[2]; \ \ - vector unsigned char *oute = \ - (vector unsigned char *) \ - (oplanes[0] + srcSliceY * outstrides[0]); \ - vector unsigned char *outo = \ - (vector unsigned char *) \ - (oplanes[0] + srcSliceY * outstrides[0] + outstrides[0]); \ + vector unsigned char *oute, *outo; \ \ /* loop moves y{1, 2}i by w */ \ instrides_scl[0] = instrides[0] * 2 - w; \ @@ -332,6 +328,9 @@ static int altivec_ ## name(SwsContext *c, const unsigned char **in, \ instrides_scl[2] = instrides[2] - w / 2; \ \ for (i = 0; i < h / 2; i++) { \ + oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \ + (srcSliceY + i * 2)); \ + outo = oute + (outstrides[0] >> 4); \ vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \ vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \ \ @@ -429,9 +428,6 @@ static int altivec_ ## name(SwsContext *c, const unsigned char **in, \ vi += 8; \ } \ \ - outo += (outstrides[0]) >> 4; \ - oute += (outstrides[0]) >> 4; \ - \ ui += instrides_scl[1]; \ vi += instrides_scl[2]; \ y1i += instrides_scl[0]; \ @@ -748,7 +744,7 @@ static av_always_inline void yuv2packedX_altivec(SwsContext *c, if (!printed_error_message) { av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", - sws_format_name(c->dstFormat)); + av_get_pix_fmt_name(c->dstFormat)); printed_error_message = 1; } return; @@ -836,7 +832,7 @@ static av_always_inline void yuv2packedX_altivec(SwsContext *c, /* Unreachable, I think. */ av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", - sws_format_name(c->dstFormat)); + av_get_pix_fmt_name(c->dstFormat)); return; } diff --git a/libswscale/ppc/yuv2rgb_altivec.h b/libswscale/ppc/yuv2rgb_altivec.h index 2c5e7ed..aa52a47 100644 --- a/libswscale/ppc/yuv2rgb_altivec.h +++ b/libswscale/ppc/yuv2rgb_altivec.h @@ -4,20 +4,20 @@ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> * based on the equivalent C code in swscale.c * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libswscale/ppc/yuv2yuv_altivec.c b/libswscale/ppc/yuv2yuv_altivec.c index 08545b3..2b1c5dd 100644 --- a/libswscale/ppc/yuv2yuv_altivec.c +++ b/libswscale/ppc/yuv2yuv_altivec.c @@ -4,20 +4,20 @@ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> * based on the equivalent C code in swscale.c * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c index 3fb3921..5b1fcf7 100644 --- a/libswscale/rgb2rgb.c +++ b/libswscale/rgb2rgb.c @@ -6,20 +6,20 @@ * Written by Nick Kurshev. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -73,10 +73,11 @@ void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride); -void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, - uint8_t *udst, uint8_t *vdst, - int width, int height, - int lumStride, int chromStride, int srcStride); +void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, + uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride); void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, @@ -108,7 +109,6 @@ void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); -#define RGB2YUV_SHIFT 8 #define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5)) #define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5)) #define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5)) @@ -184,13 +184,13 @@ void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size) register uint16_t bgr = *s++; #if HAVE_BIGENDIAN *d++ = 255; - *d++ = (bgr & 0x1F) << 3; - *d++ = (bgr & 0x7E0) >> 3; - *d++ = (bgr & 0xF800) >> 8; + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); + *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); #else - *d++ = (bgr & 0xF800) >> 8; - *d++ = (bgr & 0x7E0) >> 3; - *d++ = (bgr & 0x1F) << 3; + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); + *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9); + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); *d++ = 255; #endif } @@ -223,9 +223,9 @@ void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size) while (s < end) { register uint16_t bgr = *s++; - *d++ = (bgr & 0xF800) >> 8; - *d++ = (bgr & 0x7E0) >> 3; - *d++ = (bgr & 0x1F) << 3; + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); + *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9); + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); } } @@ -259,13 +259,13 @@ void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size) register uint16_t bgr = *s++; #if HAVE_BIGENDIAN *d++ = 255; - *d++ = (bgr & 0x1F) << 3; - *d++ = (bgr & 0x3E0) >> 2; - *d++ = (bgr & 0x7C00) >> 7; + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); + *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); #else - *d++ = (bgr & 0x7C00) >> 7; - *d++ = (bgr & 0x3E0) >> 2; - *d++ = (bgr & 0x1F) << 3; + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); + *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7); + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); *d++ = 255; #endif } @@ -279,9 +279,9 @@ void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size) while (s < end) { register uint16_t bgr = *s++; - *d++ = (bgr & 0x7C00) >> 7; - *d++ = (bgr & 0x3E0) >> 2; - *d++ = (bgr & 0x1F) << 3; + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); + *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7); + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); } } @@ -318,18 +318,6 @@ void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size) } } -void bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size) -{ - int i, num_pixels = src_size; - - for (i = 0; i < num_pixels; i++) { - register uint8_t rgb = src[i]; - unsigned r = (rgb & 0x07); - unsigned g = (rgb & 0x38) >> 3; - unsigned b = (rgb & 0xC0) >> 6; - dst[i] = ((b << 1) & 0x07) | ((g & 0x07) << 3) | ((r & 0x03) << 6); - } -} #define DEFINE_SHUFFLE_BYTES(a, b, c, d) \ void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src, \ @@ -349,3 +337,57 @@ DEFINE_SHUFFLE_BYTES(0, 3, 2, 1) DEFINE_SHUFFLE_BYTES(1, 2, 3, 0) DEFINE_SHUFFLE_BYTES(3, 0, 1, 2) DEFINE_SHUFFLE_BYTES(3, 2, 1, 0) + +#define DEFINE_RGB48TOBGR48(need_bswap, swap) \ +void rgb48tobgr48_ ## need_bswap(const uint8_t *src, \ + uint8_t *dst, int src_size) \ +{ \ + uint16_t *d = (uint16_t *)dst; \ + uint16_t *s = (uint16_t *)src; \ + int i, num_pixels = src_size >> 1; \ + \ + for (i = 0; i < num_pixels; i += 3) { \ + d[i ] = swap ? av_bswap16(s[i + 2]) : s[i + 2]; \ + d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1]; \ + d[i + 2] = swap ? av_bswap16(s[i ]) : s[i ]; \ + } \ +} + +DEFINE_RGB48TOBGR48(nobswap, 0) +DEFINE_RGB48TOBGR48(bswap, 1) + +#define DEFINE_RGB64TOBGR48(need_bswap, swap) \ +void rgb64tobgr48_ ## need_bswap(const uint8_t *src, \ + uint8_t *dst, int src_size) \ +{ \ + uint16_t *d = (uint16_t *)dst; \ + uint16_t *s = (uint16_t *)src; \ + int i, num_pixels = src_size >> 3; \ + \ + for (i = 0; i < num_pixels; i++) { \ + d[3 * i ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \ + d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \ + d[3 * i + 2] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \ + } \ +} + +DEFINE_RGB64TOBGR48(nobswap, 0) +DEFINE_RGB64TOBGR48(bswap, 1) + +#define DEFINE_RGB64TO48(need_bswap, swap) \ +void rgb64to48_ ## need_bswap(const uint8_t *src, \ + uint8_t *dst, int src_size) \ +{ \ + uint16_t *d = (uint16_t *)dst; \ + uint16_t *s = (uint16_t *)src; \ + int i, num_pixels = src_size >> 3; \ + \ + for (i = 0; i < num_pixels; i++) { \ + d[3 * i ] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \ + d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \ + d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \ + } \ +} + +DEFINE_RGB64TO48(nobswap, 0) +DEFINE_RGB64TO48(bswap, 1) diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h index f47b04e..5df5dea 100644 --- a/libswscale/rgb2rgb.h +++ b/libswscale/rgb2rgb.h @@ -6,20 +6,20 @@ * Written by Nick Kurshev. * YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -52,6 +52,12 @@ extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size); extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size); +void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size); +void rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size); +void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size); +void rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size); +void rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size); +void rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size); void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size); void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size); void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size); @@ -64,16 +70,15 @@ void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size); void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size); void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size); void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size); -void bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size); void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size); void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size); void shuffle_bytes_3012(const uint8_t *src, uint8_t *dst, int src_size); void shuffle_bytes_3210(const uint8_t *src, uint8_t *dst, int src_size); -void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride); +void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); /** * Height should be a multiple of 2 and width should be a multiple of 16. @@ -119,9 +124,10 @@ extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uin * Chrominance data is only taken from every second line, others are ignored. * FIXME: Write high quality version. */ -extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, - int lumStride, int chromStride, int srcStride); +extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride); diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c index 65ea5dd..121f4ef 100644 --- a/libswscale/rgb2rgb_template.c +++ b/libswscale/rgb2rgb_template.c @@ -7,20 +7,20 @@ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * lot of big-endian byte order fixes by Alex Beregszaszi * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -240,27 +240,6 @@ static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size) } } -/* - * I use less accurate approximation here by simply left-shifting the input - * value and filling the low order bits with zeroes. This method improves PNG - * compression but this scheme cannot reproduce white exactly, since it does - * not generate an all-ones maximum value; the net effect is to darken the - * image slightly. - * - * The better method should be "left bit replication": - * - * 4 3 2 1 0 - * --------- - * 1 1 0 1 1 - * - * 7 6 5 4 3 2 1 0 - * ---------------- - * 1 1 0 1 1 1 1 0 - * |=======| |===| - * | leftmost bits repeated to fill open bits - * | - * original bits - */ static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size) { @@ -270,9 +249,9 @@ static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst, while (s < end) { register uint16_t bgr = *s++; - *d++ = (bgr & 0x1F) << 3; - *d++ = (bgr & 0x3E0) >> 2; - *d++ = (bgr & 0x7C00) >> 7; + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); + *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); } } @@ -285,9 +264,9 @@ static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst, while (s < end) { register uint16_t bgr = *s++; - *d++ = (bgr & 0x1F) << 3; - *d++ = (bgr & 0x7E0) >> 3; - *d++ = (bgr & 0xF800) >> 8; + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); + *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); } } @@ -301,13 +280,13 @@ static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size) register uint16_t bgr = *s++; #if HAVE_BIGENDIAN *d++ = 255; - *d++ = (bgr & 0x7C00) >> 7; - *d++ = (bgr & 0x3E0) >> 2; - *d++ = (bgr & 0x1F) << 3; + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); + *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7); + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); #else - *d++ = (bgr & 0x1F) << 3; - *d++ = (bgr & 0x3E0) >> 2; - *d++ = (bgr & 0x7C00) >> 7; + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); + *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); *d++ = 255; #endif } @@ -323,13 +302,13 @@ static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size) register uint16_t bgr = *s++; #if HAVE_BIGENDIAN *d++ = 255; - *d++ = (bgr & 0xF800) >> 8; - *d++ = (bgr & 0x7E0) >> 3; - *d++ = (bgr & 0x1F) << 3; + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); + *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9); + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); #else - *d++ = (bgr & 0x1F) << 3; - *d++ = (bgr & 0x7E0) >> 3; - *d++ = (bgr & 0xF800) >> 8; + *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2); + *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); *d++ = 255; #endif } @@ -376,9 +355,9 @@ static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; for (i = 0; i < chromWidth; i += 2) { uint64_t k = yc[0] + (uc[0] << 8) + - (yc[1] << 16) + (vc[0] << 24); + (yc[1] << 16) + (unsigned)(vc[0] << 24); uint64_t l = yc[2] + (uc[1] << 8) + - (yc[3] << 16) + (vc[1] << 24); + (yc[3] << 16) + (unsigned)(vc[1] << 24); *ldst++ = k + (l << 32); yc += 4; uc += 2; @@ -440,9 +419,9 @@ static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; for (i = 0; i < chromWidth; i += 2) { uint64_t k = uc[0] + (yc[0] << 8) + - (vc[0] << 16) + (yc[1] << 24); + (vc[0] << 16) + (unsigned)(yc[1] << 24); uint64_t l = uc[1] + (yc[2] << 8) + - (vc[1] << 16) + (yc[3] << 24); + (vc[1] << 16) + (unsigned)(yc[3] << 24); *ldst++ = k + (l << 32); yc += 4; uc += 2; @@ -635,10 +614,13 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst, * others are ignored in the C version. * FIXME: Write HQ version. */ -void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, +void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride) + int chromStride, int srcStride, int32_t *rgb2yuv) { + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; int y; const int chromWidth = width >> 1; @@ -649,9 +631,9 @@ void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, unsigned int g = src[6 * i + 1]; unsigned int r = src[6 * i + 2]; - unsigned int Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16; - unsigned int V = ((RV * r + GV * g + BV * b) >> RGB2YUV_SHIFT) + 128; - unsigned int U = ((RU * r + GU * g + BU * b) >> RGB2YUV_SHIFT) + 128; + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; + unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; udst[i] = U; vdst[i] = V; @@ -661,18 +643,21 @@ void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, g = src[6 * i + 4]; r = src[6 * i + 5]; - Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16; + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ydst[2 * i + 1] = Y; } ydst += lumStride; src += srcStride; + if (y+1 == height) + break; + for (i = 0; i < chromWidth; i++) { unsigned int b = src[6 * i + 0]; unsigned int g = src[6 * i + 1]; unsigned int r = src[6 * i + 2]; - unsigned int Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16; + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ydst[2 * i] = Y; @@ -680,7 +665,7 @@ void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, g = src[6 * i + 4]; r = src[6 * i + 5]; - Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16; + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ydst[2 * i + 1] = Y; } udst += chromStride; @@ -856,7 +841,7 @@ static void yuyvtoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth = -((-width) >> 1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y = 0; y < height; y++) { extract_even_c(src, ydst, width); @@ -876,7 +861,7 @@ static void yuyvtoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth = -((-width) >> 1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y = 0; y < height; y++) { extract_even_c(src, ydst, width); @@ -894,7 +879,7 @@ static void uyvytoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth = -((-width) >> 1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y = 0; y < height; y++) { extract_even_c(src + 1, ydst, width); @@ -914,7 +899,7 @@ static void uyvytoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth = -((-width) >> 1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y = 0; y < height; y++) { extract_even_c(src + 1, ydst, width); @@ -953,7 +938,7 @@ static av_cold void rgb2rgb_init_c(void) yuv422ptouyvy = yuv422ptouyvy_c; yuy2toyv12 = yuy2toyv12_c; planar2x = planar2x_c; - rgb24toyv12 = rgb24toyv12_c; + ff_rgb24toyv12 = ff_rgb24toyv12_c; interleaveBytes = interleaveBytes_c; deinterleaveBytes = deinterleaveBytes_c; vu9_to_vu12 = vu9_to_vu12_c; diff --git a/libswscale/swscale-test.c b/libswscale/swscale-test.c index 8063519..661ff5b 100644 --- a/libswscale/swscale-test.c +++ b/libswscale/swscale-test.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -53,7 +53,7 @@ (x) == AV_PIX_FMT_RGB32_1 || \ (x) == AV_PIX_FMT_YUVA420P) -static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, +static uint64_t getSSD(const uint8_t *src1, const uint8_t *src2, int stride1, int stride2, int w, int h) { int x, y; @@ -92,7 +92,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, static int srcStride[4]; uint8_t *dst[4] = { 0 }; uint8_t *out[4] = { 0 }; - int dstStride[4]; + int dstStride[4] = {0}; int i; uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0; struct SwsContext *dstContext = NULL, *outContext = NULL; @@ -108,6 +108,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, av_image_fill_linesizes(srcStride, srcFormat, srcW); for (p = 0; p < 4; p++) { + srcStride[p] = FFALIGN(srcStride[p], 16); if (srcStride[p]) src[p] = av_mallocz(srcStride[p] * srcH + 16); if (srcStride[p] && !src[p]) { @@ -125,7 +126,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, res = -1; goto end; } - sws_scale(srcContext, ref, refStride, 0, h, src, srcStride); + sws_scale(srcContext, (const uint8_t * const*)ref, refStride, 0, h, src, srcStride); sws_freeContext(srcContext); cur_srcFormat = srcFormat; @@ -141,6 +142,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, * allocated with av_malloc). */ /* An extra 16 bytes is being allocated because some scalers may write * out of bounds. */ + dstStride[i] = FFALIGN(dstStride[i], 16); if (dstStride[i]) dst[i] = av_mallocz(dstStride[i] * dstH + 16); if (dstStride[i] && !dst[i]) { @@ -166,7 +168,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, flags); fflush(stdout); - sws_scale(dstContext, src, srcStride, 0, srcH, dst, dstStride); + sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); for (i = 0; i < 4 && dstStride[i]; i++) crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i], @@ -179,6 +181,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, ssdA = r->ssdA; } else { for (i = 0; i < 4; i++) { + refStride[i] = FFALIGN(refStride[i], 16); if (refStride[i]) out[i] = av_mallocz(refStride[i] * h); if (refStride[i] && !out[i]) { @@ -197,7 +200,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, res = -1; goto end; } - sws_scale(outContext, dst, dstStride, 0, dstH, out, refStride); + sws_scale(outContext, (const uint8_t * const*)dst, dstStride, 0, dstH, out, refStride); ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h); if (hasChroma(srcFormat) && hasChroma(dstFormat)) { @@ -304,7 +307,7 @@ static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp, ret = sscanf(buf, " %12s %dx%d -> %12s %dx%d flags=%d CRC=%x" - " SSD=%"PRId64 ", %"PRId64 ", %"PRId64 ", %"PRId64 "\n", + " SSD=%"SCNd64 ", %"SCNd64 ", %"SCNd64 ", %"SCNd64 "\n", srcStr, &srcW, &srcH, dstStr, &dstW, &dstH, &flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA); if (ret != 12) { @@ -315,7 +318,8 @@ static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp, srcFormat = av_get_pix_fmt(srcStr); dstFormat = av_get_pix_fmt(dstStr); - if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE) { + if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE || + srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) { fprintf(stderr, "malformed input file\n"); return -1; } @@ -344,7 +348,7 @@ int main(int argc, char **argv) enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE; enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE; uint8_t *rgb_data = av_malloc(W * H * 4); - uint8_t *rgb_src[4] = { rgb_data, NULL, NULL, NULL }; + const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL }; int rgb_stride[4] = { 4 * W, 0, 0, 0 }; uint8_t *data = av_malloc(4 * W * H); uint8_t *src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 }; @@ -354,34 +358,20 @@ int main(int argc, char **argv) AVLFG rand; int res = -1; int i; + FILE *fp = NULL; if (!rgb_data || !data) return -1; - sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H, - AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL); - - av_lfg_init(&rand, 1); - - for (y = 0; y < H; y++) - for (x = 0; x < W * 4; x++) - rgb_data[ x + y * 4 * W] = av_lfg_get(&rand); - sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride); - sws_freeContext(sws); - av_free(rgb_data); - for (i = 1; i < argc; i += 2) { if (argv[i][0] != '-' || i + 1 == argc) goto bad_option; if (!strcmp(argv[i], "-ref")) { - FILE *fp = fopen(argv[i + 1], "r"); + fp = fopen(argv[i + 1], "r"); if (!fp) { fprintf(stderr, "could not open '%s'\n", argv[i + 1]); goto error; } - res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat); - fclose(fp); - goto end; } else if (!strcmp(argv[i], "-src")) { srcFormat = av_get_pix_fmt(argv[i + 1]); if (srcFormat == AV_PIX_FMT_NONE) { @@ -401,9 +391,25 @@ bad_option: } } - selfTest(src, stride, W, H, srcFormat, dstFormat); -end: - res = 0; + sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H, + AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL); + + av_lfg_init(&rand, 1); + + for (y = 0; y < H; y++) + for (x = 0; x < W * 4; x++) + rgb_data[ x + y * 4 * W] = av_lfg_get(&rand); + sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride); + sws_freeContext(sws); + av_free(rgb_data); + + if(fp) { + res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat); + fclose(fp); + } else { + selfTest(src, stride, W, H, srcFormat, dstFormat); + res = 0; + } error: av_free(data); diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 7756e1b..59ead12 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -1,29 +1,29 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <assert.h> #include <inttypes.h> #include <math.h> #include <stdio.h> #include <string.h> +#include "libavutil/avassert.h" #include "libavutil/avutil.h" #include "libavutil/bswap.h" #include "libavutil/cpu.h" @@ -35,7 +35,7 @@ #include "swscale_internal.h" #include "swscale.h" -DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_128)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_128)[9][8] = { { 36, 68, 60, 92, 34, 66, 58, 90, }, { 100, 4, 124, 28, 98, 2, 122, 26, }, { 52, 84, 44, 76, 50, 82, 42, 74, }, @@ -44,6 +44,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_128)[8][8] = { { 96, 0, 120, 24, 102, 6, 126, 30, }, { 48, 80, 40, 72, 54, 86, 46, 78, }, { 112, 16, 104, 8, 118, 22, 110, 14, }, + { 36, 68, 60, 92, 34, 66, 58, 90, }, }; DECLARE_ALIGNED(8, static const uint8_t, sws_pb_64)[8] = { @@ -61,28 +62,6 @@ static av_always_inline void fillPlane(uint8_t *plane, int stride, int width, } } -static void fill_plane9or10(uint8_t *plane, int stride, int width, - int height, int y, uint8_t val, - const int dst_depth, const int big_endian) -{ - int i, j; - uint16_t *dst = (uint16_t *) (plane + stride * y); -#define FILL8TO9_OR_10(wfunc) \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < width; j++) { \ - wfunc(&dst[j], (val << (dst_depth - 8)) | \ - (val >> (16 - dst_depth))); \ - } \ - dst += stride / 2; \ - } - if (big_endian) { - FILL8TO9_OR_10(AV_WB16); - } else { - FILL8TO9_OR_10(AV_WL16); - } -} - - static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src, const int16_t *filter, const int32_t *filterPos, int filterSize) @@ -94,6 +73,9 @@ static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, int bits = desc->comp[0].depth_minus1; int sh = bits - 4; + if((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth_minus1<15) + sh= 9; + for (i = 0; i < dstW; i++) { int j; int srcPos = filterPos[i]; @@ -116,6 +98,9 @@ static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint16_t *src = (const uint16_t *) _src; int sh = desc->comp[0].depth_minus1; + if(sh<15) + sh= isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 : desc->comp[0].depth_minus1; + for (i = 0; i < dstW; i++) { int j; int srcPos = filterPos[i]; @@ -223,8 +208,9 @@ static void lumRangeToJpeg16_c(int16_t *_dst, int width) { int i; int32_t *dst = (int32_t *) _dst; - for (i = 0; i < width; i++) - dst[i] = (FFMIN(dst[i], 30189 << 4) * 4769 - (39057361 << 2)) >> 12; + for (i = 0; i < width; i++) { + dst[i] = ((int)(FFMIN(dst[i], 30189 << 4) * 4769U - (39057361 << 2))) >> 12; + } } static void lumRangeFromJpeg16_c(int16_t *_dst, int width) @@ -232,20 +218,7 @@ static void lumRangeFromJpeg16_c(int16_t *_dst, int width) int i; int32_t *dst = (int32_t *) _dst; for (i = 0; i < width; i++) - dst[i] = (dst[i] * 14071 + (33561947 << 4)) >> 14; -} - -static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth, - const uint8_t *src, int srcW, int xInc) -{ - int i; - unsigned int xpos = 0; - for (i = 0; i < dstWidth; i++) { - register unsigned int xx = xpos >> 16; - register unsigned int xalpha = (xpos & 0xFFFF) >> 9; - dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha; - xpos += xInc; - } + dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12; } // *** horizontal scale Y line to temp buffer @@ -258,16 +231,19 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth, uint8_t *formatConvBuffer, uint32_t *pal, int isAlpha) { - void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = + void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12; void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange; const uint8_t *src = src_in[isAlpha ? 3 : 0]; if (toYV12) { - toYV12(formatConvBuffer, src, srcW, pal); + toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal); src = formatConvBuffer; } else if (c->readLumPlanar && !isAlpha) { - c->readLumPlanar(formatConvBuffer, src_in, srcW); + c->readLumPlanar(formatConvBuffer, src_in, srcW, c->input_rgb2yuv_table); + src = formatConvBuffer; + } else if (c->readAlpPlanar && isAlpha) { + c->readAlpPlanar(formatConvBuffer, src_in, srcW, NULL); src = formatConvBuffer; } @@ -282,21 +258,6 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth, convertRange(dst, dstWidth); } -static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2, - int dstWidth, const uint8_t *src1, - const uint8_t *src2, int srcW, int xInc) -{ - int i; - unsigned int xpos = 0; - for (i = 0; i < dstWidth; i++) { - register unsigned int xx = xpos >> 16; - register unsigned int xalpha = (xpos & 0xFFFF) >> 9; - dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha); - dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha); - xpos += xInc; - } -} - static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src_in[4], @@ -309,14 +270,14 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, const uint8_t *src1 = src_in[1], *src2 = src_in[2]; if (c->chrToYV12) { uint8_t *buf2 = formatConvBuffer + - FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16); - c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal); - src1 = formatConvBuffer; - src2 = buf2; + FFALIGN(srcW*2+78, 16); + c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal); + src1= formatConvBuffer; + src2= buf2; } else if (c->readChrPlanar) { uint8_t *buf2 = formatConvBuffer + - FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16); - c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW); + FFALIGN(srcW*2+78, 16); + c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW, c->input_rgb2yuv_table); src1 = formatConvBuffer; src2 = buf2; } @@ -356,8 +317,6 @@ static int swscale(SwsContext *c, const uint8_t *src[], int32_t *vChrFilterPos = c->vChrFilterPos; int32_t *hLumFilterPos = c->hLumFilterPos; int32_t *hChrFilterPos = c->hChrFilterPos; - int16_t *vLumFilter = c->vLumFilter; - int16_t *vChrFilter = c->vChrFilter; int16_t *hLumFilter = c->hLumFilter; int16_t *hChrFilter = c->hChrFilter; int32_t *lumMmxFilter = c->lumMmxFilter; @@ -381,8 +340,8 @@ static int swscale(SwsContext *c, const uint8_t *src[], yuv2packed2_fn yuv2packed2 = c->yuv2packed2; yuv2packedX_fn yuv2packedX = c->yuv2packedX; yuv2anyX_fn yuv2anyX = c->yuv2anyX; - const int chrSrcSliceY = srcSliceY >> c->chrSrcVSubSample; - const int chrSrcSliceH = -((-srcSliceH) >> c->chrSrcVSubSample); + const int chrSrcSliceY = srcSliceY >> c->chrSrcVSubSample; + const int chrSrcSliceH = FF_CEIL_RSHIFT(srcSliceH, c->chrSrcVSubSample); int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat); int lastDstY; @@ -394,6 +353,10 @@ static int swscale(SwsContext *c, const uint8_t *src[], int lastInLumBuf = c->lastInLumBuf; int lastInChrBuf = c->lastInChrBuf; + if (!usePal(c->srcFormat)) { + pal = c->input_rgb2yuv_table; + } + if (isPacked(c->srcFormat)) { src[0] = src[1] = @@ -417,8 +380,8 @@ static int swscale(SwsContext *c, const uint8_t *src[], DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n", vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize); - if (dstStride[0] % 8 != 0 || dstStride[1] % 8 != 0 || - dstStride[2] % 8 != 0 || dstStride[3] % 8 != 0) { + if (dstStride[0]&15 || dstStride[1]&15 || + dstStride[2]&15 || dstStride[3]&15) { static int warnedAlready = 0; // FIXME maybe move this into the context if (flags & SWS_PRINT_INFO && !warnedAlready) { av_log(c, AV_LOG_WARNING, @@ -428,6 +391,19 @@ static int swscale(SwsContext *c, const uint8_t *src[], } } + if ( (uintptr_t)dst[0]&15 || (uintptr_t)dst[1]&15 || (uintptr_t)dst[2]&15 + || (uintptr_t)src[0]&15 || (uintptr_t)src[1]&15 || (uintptr_t)src[2]&15 + || dstStride[0]&15 || dstStride[1]&15 || dstStride[2]&15 || dstStride[3]&15 + || srcStride[0]&15 || srcStride[1]&15 || srcStride[2]&15 || srcStride[3]&15 + ) { + static int warnedAlready=0; + int cpu_flags = av_get_cpu_flags(); + if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){ + av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n"); + warnedAlready=1; + } + } + /* Note the user might start scaling the picture in the middle so this * will not get executed. This is not really intended but works * currently, so people might do it. */ @@ -452,6 +428,7 @@ static int swscale(SwsContext *c, const uint8_t *src[], dst[2] + dstStride[2] * chrDstY, (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL, }; + int use_mmx_vfilter= c->use_mmx_vfilter; // First line needed as input const int firstLumSrcY = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]); @@ -470,8 +447,8 @@ static int swscale(SwsContext *c, const uint8_t *src[], lastInLumBuf = firstLumSrcY - 1; if (firstChrSrcY > lastInChrBuf) lastInChrBuf = firstChrSrcY - 1; - assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); - assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); + av_assert0(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); + av_assert0(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); DEBUG_BUFFERS("dstY: %d\n", dstY); DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n", @@ -481,7 +458,7 @@ static int swscale(SwsContext *c, const uint8_t *src[], // Do we have enough lines in this slice to output the dstY line enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && - lastChrSrcY < -((-srcSliceY - srcSliceH) >> c->chrSrcVSubSample); + lastChrSrcY < FF_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample); if (!enough_lines) { lastLumSrcY = srcSliceY + srcSliceH - 1; @@ -499,9 +476,9 @@ static int swscale(SwsContext *c, const uint8_t *src[], src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3], }; lumBufIndex++; - assert(lumBufIndex < 2 * vLumBufSize); - assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); - assert(lastInLumBuf + 1 - srcSliceY >= 0); + av_assert0(lumBufIndex < 2 * vLumBufSize); + av_assert0(lastInLumBuf + 1 - srcSliceY < srcSliceH); + av_assert0(lastInLumBuf + 1 - srcSliceY >= 0); hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize, formatConvBuffer, pal, 0); @@ -521,9 +498,9 @@ static int swscale(SwsContext *c, const uint8_t *src[], src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3], }; chrBufIndex++; - assert(chrBufIndex < 2 * vChrBufSize); - assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); - assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); + av_assert0(chrBufIndex < 2 * vChrBufSize); + av_assert0(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); + av_assert0(lastInChrBuf + 1 - chrSrcSliceY >= 0); // FIXME replace parameters through context struct (some at least) if (c->needs_hcscale) @@ -556,103 +533,81 @@ static int swscale(SwsContext *c, const uint8_t *src[], * this array's tail */ ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX, &yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX); + use_mmx_vfilter= 0; } { - const int16_t **lumSrcPtr = (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; - const int16_t **chrUSrcPtr = (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; - const int16_t **chrVSrcPtr = (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; + const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; + const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; + const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? - (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; - - if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) { - const int16_t **tmpY = (const int16_t **)lumPixBuf + - 2 * vLumBufSize; - int neg = -firstLumSrcY, i; - int end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize); - for (i = 0; i < neg; i++) - tmpY[i] = lumSrcPtr[neg]; - for (; i < end; i++) - tmpY[i] = lumSrcPtr[i]; - for (; i < vLumFilterSize; i++) - tmpY[i] = tmpY[i - 1]; - lumSrcPtr = tmpY; - - if (alpSrcPtr) { - const int16_t **tmpA = (const int16_t **)alpPixBuf + - 2 * vLumBufSize; - for (i = 0; i < neg; i++) - tmpA[i] = alpSrcPtr[neg]; - for (; i < end; i++) - tmpA[i] = alpSrcPtr[i]; - for (; i < vLumFilterSize; i++) - tmpA[i] = tmpA[i - 1]; - alpSrcPtr = tmpA; - } - } - if (firstChrSrcY < 0 || - firstChrSrcY + vChrFilterSize > c->chrSrcH) { - const int16_t **tmpU = (const int16_t **)chrUPixBuf + 2 * vChrBufSize, - **tmpV = (const int16_t **)chrVPixBuf + 2 * vChrBufSize; - int neg = -firstChrSrcY, i; - int end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize); - for (i = 0; i < neg; i++) { - tmpU[i] = chrUSrcPtr[neg]; - tmpV[i] = chrVSrcPtr[neg]; - } - for (; i < end; i++) { - tmpU[i] = chrUSrcPtr[i]; - tmpV[i] = chrVSrcPtr[i]; - } - for (; i < vChrFilterSize; i++) { - tmpU[i] = tmpU[i - 1]; - tmpV[i] = tmpV[i - 1]; - } - chrUSrcPtr = tmpU; - chrVSrcPtr = tmpV; - } + (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; + int16_t *vLumFilter = c->vLumFilter; + int16_t *vChrFilter = c->vChrFilter; if (isPlanarYUV(dstFormat) || (isGray(dstFormat) && !isALPHA(dstFormat))) { // YV12 like const int chrSkipMask = (1 << c->chrDstVSubSample) - 1; + vLumFilter += dstY * vLumFilterSize; + vChrFilter += chrDstY * vChrFilterSize; + +// av_assert0(use_mmx_vfilter != ( +// yuv2planeX == yuv2planeX_10BE_c +// || yuv2planeX == yuv2planeX_10LE_c +// || yuv2planeX == yuv2planeX_9BE_c +// || yuv2planeX == yuv2planeX_9LE_c +// || yuv2planeX == yuv2planeX_16BE_c +// || yuv2planeX == yuv2planeX_16LE_c +// || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86); + + if(use_mmx_vfilter){ + vLumFilter= (int16_t *)c->lumMmxFilter; + vChrFilter= (int16_t *)c->chrMmxFilter; + } + if (vLumFilterSize == 1) { yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0); } else { - yuv2planeX(vLumFilter + dstY * vLumFilterSize, - vLumFilterSize, lumSrcPtr, dest[0], + yuv2planeX(vLumFilter, vLumFilterSize, + lumSrcPtr, dest[0], dstW, c->lumDither8, 0); } if (!((dstY & chrSkipMask) || isGray(dstFormat))) { if (yuv2nv12cX) { - yuv2nv12cX(c, vChrFilter + chrDstY * vChrFilterSize, + yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW); } else if (vChrFilterSize == 1) { yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0); yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3); } else { - yuv2planeX(vChrFilter + chrDstY * vChrFilterSize, + yuv2planeX(vChrFilter, vChrFilterSize, chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0); - yuv2planeX(vChrFilter + chrDstY * vChrFilterSize, + yuv2planeX(vChrFilter, vChrFilterSize, chrVSrcPtr, dest[2], - chrDstW, c->chrDither8, 3); + chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3); } } if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { + if(use_mmx_vfilter){ + vLumFilter= (int16_t *)c->alpMmxFilter; + } if (vLumFilterSize == 1) { yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0); } else { - yuv2planeX(vLumFilter + dstY * vLumFilterSize, + yuv2planeX(vLumFilter, vLumFilterSize, alpSrcPtr, dest[3], dstW, c->lumDither8, 0); } } } else if (yuv2packedX) { + av_assert1(lumSrcPtr + vLumFilterSize - 1 < (const int16_t **)lumPixBuf + vLumBufSize * 2); + av_assert1(chrUSrcPtr + vChrFilterSize - 1 < (const int16_t **)chrUPixBuf + vChrBufSize * 2); if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize <= 2) { // unscaled RGB int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1]; @@ -678,6 +633,7 @@ static int swscale(SwsContext *c, const uint8_t *src[], alpSrcPtr, dest[0], dstW, dstY); } } else { + av_assert1(!yuv2packed1 && !yuv2packed2); yuv2anyX(c, vLumFilter + dstY * vLumFilterSize, lumSrcPtr, vLumFilterSize, vChrFilter + dstY * vChrFilterSize, @@ -686,18 +642,15 @@ static int swscale(SwsContext *c, const uint8_t *src[], } } } - if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) { int length = dstW; int height = dstY - lastDstY; - if (is16BPS(c->dstFormat)) - length *= 2; - if (is9_OR_10BPS(dstFormat)) { + if (is16BPS(dstFormat) || isNBPS(dstFormat)) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat); - fill_plane9or10(dst[3], dstStride[3], length, height, lastDstY, - 255, desc->comp[3].depth_minus1 + 1, - isBE(dstFormat)); + fillPlane16(dst[3], dstStride[3], length, height, lastDstY, + 1, desc->comp[3].depth_minus1, + isBE(dstFormat)); } else fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255); } @@ -718,6 +671,31 @@ static int swscale(SwsContext *c, const uint8_t *src[], return dstY - lastDstY; } +av_cold void ff_sws_init_range_convert(SwsContext *c) +{ + c->lumConvertRange = NULL; + c->chrConvertRange = NULL; + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { + if (c->dstBpc <= 14) { + if (c->srcRange) { + c->lumConvertRange = lumRangeFromJpeg_c; + c->chrConvertRange = chrRangeFromJpeg_c; + } else { + c->lumConvertRange = lumRangeToJpeg_c; + c->chrConvertRange = chrRangeToJpeg_c; + } + } else { + if (c->srcRange) { + c->lumConvertRange = lumRangeFromJpeg16_c; + c->chrConvertRange = chrRangeFromJpeg16_c; + } else { + c->lumConvertRange = lumRangeToJpeg16_c; + c->chrConvertRange = chrRangeToJpeg16_c; + } + } + } +} + static av_cold void sws_init_swscale(SwsContext *c) { enum AVPixelFormat srcFormat = c->srcFormat; @@ -728,40 +706,23 @@ static av_cold void sws_init_swscale(SwsContext *c) ff_sws_init_input_funcs(c); + if (c->srcBpc == 8) { - if (c->dstBpc <= 10) { + if (c->dstBpc <= 14) { c->hyScale = c->hcScale = hScale8To15_c; if (c->flags & SWS_FAST_BILINEAR) { - c->hyscale_fast = hyscale_fast_c; - c->hcscale_fast = hcscale_fast_c; + c->hyscale_fast = ff_hyscale_fast_c; + c->hcscale_fast = ff_hcscale_fast_c; } } else { c->hyScale = c->hcScale = hScale8To19_c; } } else { - c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c + c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_c : hScale16To15_c; } - if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { - if (c->dstBpc <= 10) { - if (c->srcRange) { - c->lumConvertRange = lumRangeFromJpeg_c; - c->chrConvertRange = chrRangeFromJpeg_c; - } else { - c->lumConvertRange = lumRangeToJpeg_c; - c->chrConvertRange = chrRangeToJpeg_c; - } - } else { - if (c->srcRange) { - c->lumConvertRange = lumRangeFromJpeg16_c; - c->chrConvertRange = chrRangeFromJpeg16_c; - } else { - c->lumConvertRange = lumRangeToJpeg16_c; - c->chrConvertRange = chrRangeToJpeg16_c; - } - } - } + ff_sws_init_range_convert(c); if (!(isGray(srcFormat) || isGray(c->dstFormat) || srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE)) @@ -779,3 +740,349 @@ SwsFunc ff_getSwsFunc(SwsContext *c) return swscale; } + +static void reset_ptr(const uint8_t *src[], int format) +{ + if (!isALPHA(format)) + src[3] = NULL; + if (!isPlanar(format)) { + src[3] = src[2] = NULL; + + if (!usePal(format)) + src[1] = NULL; + } +} + +static int check_image_pointers(const uint8_t * const data[4], enum AVPixelFormat pix_fmt, + const int linesizes[4]) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); + int i; + + for (i = 0; i < 4; i++) { + int plane = desc->comp[i].plane; + if (!data[plane] || !linesizes[plane]) + return 0; + } + + return 1; +} + +static void xyz12Torgb48(struct SwsContext *c, uint16_t *dst, + const uint16_t *src, int stride, int h) +{ + int xp,yp; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat); + + for (yp=0; yp<h; yp++) { + for (xp=0; xp+2<stride; xp+=3) { + int x, y, z, r, g, b; + + if (desc->flags & AV_PIX_FMT_FLAG_BE) { + x = AV_RB16(src + xp + 0); + y = AV_RB16(src + xp + 1); + z = AV_RB16(src + xp + 2); + } else { + x = AV_RL16(src + xp + 0); + y = AV_RL16(src + xp + 1); + z = AV_RL16(src + xp + 2); + } + + x = c->xyzgamma[x>>4]; + y = c->xyzgamma[y>>4]; + z = c->xyzgamma[z>>4]; + + // convert from XYZlinear to sRGBlinear + r = c->xyz2rgb_matrix[0][0] * x + + c->xyz2rgb_matrix[0][1] * y + + c->xyz2rgb_matrix[0][2] * z >> 12; + g = c->xyz2rgb_matrix[1][0] * x + + c->xyz2rgb_matrix[1][1] * y + + c->xyz2rgb_matrix[1][2] * z >> 12; + b = c->xyz2rgb_matrix[2][0] * x + + c->xyz2rgb_matrix[2][1] * y + + c->xyz2rgb_matrix[2][2] * z >> 12; + + // limit values to 12-bit depth + r = av_clip_c(r,0,4095); + g = av_clip_c(g,0,4095); + b = av_clip_c(b,0,4095); + + // convert from sRGBlinear to RGB and scale from 12bit to 16bit + if (desc->flags & AV_PIX_FMT_FLAG_BE) { + AV_WB16(dst + xp + 0, c->rgbgamma[r] << 4); + AV_WB16(dst + xp + 1, c->rgbgamma[g] << 4); + AV_WB16(dst + xp + 2, c->rgbgamma[b] << 4); + } else { + AV_WL16(dst + xp + 0, c->rgbgamma[r] << 4); + AV_WL16(dst + xp + 1, c->rgbgamma[g] << 4); + AV_WL16(dst + xp + 2, c->rgbgamma[b] << 4); + } + } + src += stride; + dst += stride; + } +} + +static void rgb48Toxyz12(struct SwsContext *c, uint16_t *dst, + const uint16_t *src, int stride, int h) +{ + int xp,yp; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat); + + for (yp=0; yp<h; yp++) { + for (xp=0; xp+2<stride; xp+=3) { + int x, y, z, r, g, b; + + if (desc->flags & AV_PIX_FMT_FLAG_BE) { + r = AV_RB16(src + xp + 0); + g = AV_RB16(src + xp + 1); + b = AV_RB16(src + xp + 2); + } else { + r = AV_RL16(src + xp + 0); + g = AV_RL16(src + xp + 1); + b = AV_RL16(src + xp + 2); + } + + r = c->rgbgammainv[r>>4]; + g = c->rgbgammainv[g>>4]; + b = c->rgbgammainv[b>>4]; + + // convert from sRGBlinear to XYZlinear + x = c->rgb2xyz_matrix[0][0] * r + + c->rgb2xyz_matrix[0][1] * g + + c->rgb2xyz_matrix[0][2] * b >> 12; + y = c->rgb2xyz_matrix[1][0] * r + + c->rgb2xyz_matrix[1][1] * g + + c->rgb2xyz_matrix[1][2] * b >> 12; + z = c->rgb2xyz_matrix[2][0] * r + + c->rgb2xyz_matrix[2][1] * g + + c->rgb2xyz_matrix[2][2] * b >> 12; + + // limit values to 12-bit depth + x = av_clip_c(x,0,4095); + y = av_clip_c(y,0,4095); + z = av_clip_c(z,0,4095); + + // convert from XYZlinear to X'Y'Z' and scale from 12bit to 16bit + if (desc->flags & AV_PIX_FMT_FLAG_BE) { + AV_WB16(dst + xp + 0, c->xyzgammainv[x] << 4); + AV_WB16(dst + xp + 1, c->xyzgammainv[y] << 4); + AV_WB16(dst + xp + 2, c->xyzgammainv[z] << 4); + } else { + AV_WL16(dst + xp + 0, c->xyzgammainv[x] << 4); + AV_WL16(dst + xp + 1, c->xyzgammainv[y] << 4); + AV_WL16(dst + xp + 2, c->xyzgammainv[z] << 4); + } + } + src += stride; + dst += stride; + } +} + +/** + * swscale wrapper, so we don't need to export the SwsContext. + * Assumes planar YUV to be in YUV order instead of YVU. + */ +int attribute_align_arg sws_scale(struct SwsContext *c, + const uint8_t * const srcSlice[], + const int srcStride[], int srcSliceY, + int srcSliceH, uint8_t *const dst[], + const int dstStride[]) +{ + int i, ret; + const uint8_t *src2[4]; + uint8_t *dst2[4]; + uint8_t *rgb0_tmp = NULL; + + if (!srcStride || !dstStride || !dst || !srcSlice) { + av_log(c, AV_LOG_ERROR, "One of the input parameters to sws_scale() is NULL, please check the calling code\n"); + return 0; + } + memcpy(src2, srcSlice, sizeof(src2)); + memcpy(dst2, dst, sizeof(dst2)); + + // do not mess up sliceDir if we have a "trailing" 0-size slice + if (srcSliceH == 0) + return 0; + + if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) { + av_log(c, AV_LOG_ERROR, "bad src image pointers\n"); + return 0; + } + if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) { + av_log(c, AV_LOG_ERROR, "bad dst image pointers\n"); + return 0; + } + + if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) { + av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n"); + return 0; + } + if (c->sliceDir == 0) { + if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1; + } + + if (usePal(c->srcFormat)) { + for (i = 0; i < 256; i++) { + int r, g, b, y, u, v, a = 0xff; + if (c->srcFormat == AV_PIX_FMT_PAL8) { + uint32_t p = ((const uint32_t *)(srcSlice[1]))[i]; + a = (p >> 24) & 0xFF; + r = (p >> 16) & 0xFF; + g = (p >> 8) & 0xFF; + b = p & 0xFF; + } else if (c->srcFormat == AV_PIX_FMT_RGB8) { + r = ( i >> 5 ) * 36; + g = ((i >> 2) & 7) * 36; + b = ( i & 3) * 85; + } else if (c->srcFormat == AV_PIX_FMT_BGR8) { + b = ( i >> 6 ) * 85; + g = ((i >> 3) & 7) * 36; + r = ( i & 7) * 36; + } else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) { + r = ( i >> 3 ) * 255; + g = ((i >> 1) & 3) * 85; + b = ( i & 1) * 255; + } else if (c->srcFormat == AV_PIX_FMT_GRAY8 || c->srcFormat == AV_PIX_FMT_GRAY8A) { + r = g = b = i; + } else { + av_assert1(c->srcFormat == AV_PIX_FMT_BGR4_BYTE); + b = ( i >> 3 ) * 255; + g = ((i >> 1) & 3) * 85; + r = ( i & 1) * 255; + } +#define RGB2YUV_SHIFT 15 +#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) +#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) + + y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); + u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); + v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); + c->pal_yuv[i]= y + (u<<8) + (v<<16) + ((unsigned)a<<24); + + switch (c->dstFormat) { + case AV_PIX_FMT_BGR32: +#if !HAVE_BIGENDIAN + case AV_PIX_FMT_RGB24: +#endif + c->pal_rgb[i]= r + (g<<8) + (b<<16) + ((unsigned)a<<24); + break; + case AV_PIX_FMT_BGR32_1: +#if HAVE_BIGENDIAN + case AV_PIX_FMT_BGR24: +#endif + c->pal_rgb[i]= a + (r<<8) + (g<<16) + ((unsigned)b<<24); + break; + case AV_PIX_FMT_RGB32_1: +#if HAVE_BIGENDIAN + case AV_PIX_FMT_RGB24: +#endif + c->pal_rgb[i]= a + (b<<8) + (g<<16) + ((unsigned)r<<24); + break; + case AV_PIX_FMT_RGB32: +#if !HAVE_BIGENDIAN + case AV_PIX_FMT_BGR24: +#endif + default: + c->pal_rgb[i]= b + (g<<8) + (r<<16) + ((unsigned)a<<24); + } + } + } + + if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) { + uint8_t *base; + int x,y; + rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32); + if (!rgb0_tmp) + return AVERROR(ENOMEM); + + base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp; + for (y=0; y<srcSliceH; y++){ + memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW); + for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) { + base[ srcStride[0]*y + x] = 0xFF; + } + } + src2[0] = base; + } + + if (c->srcXYZ && !(c->dstXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) { + uint8_t *base; + rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32); + if (!rgb0_tmp) + return AVERROR(ENOMEM); + + base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp; + + xyz12Torgb48(c, (uint16_t*)base, (const uint16_t*)src2[0], srcStride[0]/2, srcSliceH); + src2[0] = base; + } + + if (!srcSliceY && (c->flags & SWS_BITEXACT) && c->dither == SWS_DITHER_ED && c->dither_error[0]) + for (i = 0; i < 4; i++) + memset(c->dither_error[i], 0, sizeof(c->dither_error[0][0]) * (c->dstW+2)); + + + // copy strides, so they can safely be modified + if (c->sliceDir == 1) { + // slices go from top to bottom + int srcStride2[4] = { srcStride[0], srcStride[1], srcStride[2], + srcStride[3] }; + int dstStride2[4] = { dstStride[0], dstStride[1], dstStride[2], + dstStride[3] }; + + reset_ptr(src2, c->srcFormat); + reset_ptr((void*)dst2, c->dstFormat); + + /* reset slice direction at end of frame */ + if (srcSliceY + srcSliceH == c->srcH) + c->sliceDir = 0; + + ret = c->swscale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2, + dstStride2); + } else { + // slices go from bottom to top => we flip the image internally + int srcStride2[4] = { -srcStride[0], -srcStride[1], -srcStride[2], + -srcStride[3] }; + int dstStride2[4] = { -dstStride[0], -dstStride[1], -dstStride[2], + -dstStride[3] }; + + src2[0] += (srcSliceH - 1) * srcStride[0]; + if (!usePal(c->srcFormat)) + src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1]; + src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2]; + src2[3] += (srcSliceH - 1) * srcStride[3]; + dst2[0] += ( c->dstH - 1) * dstStride[0]; + dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1]; + dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2]; + dst2[3] += ( c->dstH - 1) * dstStride[3]; + + reset_ptr(src2, c->srcFormat); + reset_ptr((void*)dst2, c->dstFormat); + + /* reset slice direction at end of frame */ + if (!srcSliceY) + c->sliceDir = 0; + + ret = c->swscale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH, + srcSliceH, dst2, dstStride2); + } + + + if (c->dstXYZ && !(c->srcXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) { + /* replace on the same data */ + rgb48Toxyz12(c, (uint16_t*)dst2[0], (const uint16_t*)dst2[0], dstStride[0]/2, ret); + } + + av_free(rgb0_tmp); + return ret; +} + diff --git a/libswscale/swscale.h b/libswscale/swscale.h index 8abbac4..903e120 100644 --- a/libswscale/swscale.h +++ b/libswscale/swscale.h @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -24,8 +24,7 @@ /** * @file * @ingroup libsws - * @brief - * external api for the swscale stuff + * external API header */ #include <stdint.h> @@ -81,6 +80,7 @@ const char *swscale_license(void); #define SWS_DIRECT_BGR 0x8000 #define SWS_ACCURATE_RND 0x40000 #define SWS_BITEXACT 0x80000 +#define SWS_ERROR_DIFFUSION 0x800000 #if FF_API_SWS_CPU_CAPS /** @@ -225,7 +225,13 @@ int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[], uint8_t *const dst[], const int dstStride[]); /** - * @param inv_table the yuv2rgb coefficients, normally ff_yuv2rgb_coeffs[x] + * @param dstRange flag indicating the while-black range of the output (1=jpeg / 0=mpeg) + * @param srcRange flag indicating the while-black range of the input (1=jpeg / 0=mpeg) + * @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x] + * @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x] + * @param brightness 16.16 fixed point brightness correction + * @param contrast 16.16 fixed point contrast correction + * @param saturation 16.16 fixed point saturation correction * @return -1 if not supported */ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index a0daa07..335e1f8 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,18 +27,23 @@ #include <altivec.h> #endif +#include "version.h" + #include "libavutil/avassert.h" #include "libavutil/avutil.h" #include "libavutil/common.h" +#include "libavutil/intreadwrite.h" #include "libavutil/log.h" #include "libavutil/pixfmt.h" #include "libavutil/pixdesc.h" #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long -#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients +#define YUVRGB_TABLE_HEADROOM 128 + +#define MAX_FILTER_SIZE SWS_MAX_FILTER_SIZE -#define MAX_FILTER_SIZE 256 +#define DITHER1XBPP #if HAVE_BIGENDIAN #define ALT32_CORR (-1) @@ -58,6 +63,16 @@ struct SwsContext; +typedef enum SwsDither { + SWS_DITHER_NONE = 0, + SWS_DITHER_AUTO, + SWS_DITHER_BAYER, + SWS_DITHER_ED, + SWS_DITHER_A_DITHER, + SWS_DITHER_X_DITHER, + NB_SWS_DITHER, +} SwsDither; + typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]); @@ -351,10 +366,25 @@ typedef struct SwsContext { int dstY; ///< Last destination vertical line output from last slice. int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc... void *yuvTable; // pointer to the yuv->rgb table start so it can be freed() - uint8_t *table_rV[256]; - uint8_t *table_gU[256]; - int table_gV[256]; - uint8_t *table_bU[256]; + // alignment ensures the offset can be added in a single + // instruction on e.g. ARM + DECLARE_ALIGNED(16, int, table_gV)[256 + 2*YUVRGB_TABLE_HEADROOM]; + uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM]; + uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM]; + uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM]; + DECLARE_ALIGNED(16, int32_t, input_rgb2yuv_table)[16+40*4]; // This table can contain both C and SIMD formatted values, the C vales are always at the XY_IDX points +#define RY_IDX 0 +#define GY_IDX 1 +#define BY_IDX 2 +#define RU_IDX 3 +#define GU_IDX 4 +#define BU_IDX 5 +#define RV_IDX 6 +#define GV_IDX 7 +#define BV_IDX 8 +#define RGB2YUV_SHIFT 15 + + int *dither_error[4]; //Colorspace stuff int contrast, brightness, saturation; // for sws_getColorspaceDetails @@ -362,6 +392,14 @@ typedef struct SwsContext { int dstColorspaceTable[4]; int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image). int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image). + int src0Alpha; + int dst0Alpha; + int srcXYZ; + int dstXYZ; + int src_h_chr_pos; + int dst_h_chr_pos; + int src_v_chr_pos; + int dst_v_chr_pos; int yuv2rgb_y_offset; int yuv2rgb_y_coeff; int yuv2rgb_v2r_coeff; @@ -381,18 +419,19 @@ typedef struct SwsContext { #define U_OFFSET "9*8" #define V_OFFSET "10*8" #define LUM_MMX_FILTER_OFFSET "11*8" -#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256" -#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM -#define ESP_OFFSET "11*8+4*4*256*2+8" -#define VROUNDER_OFFSET "11*8+4*4*256*2+16" -#define U_TEMP "11*8+4*4*256*2+24" -#define V_TEMP "11*8+4*4*256*2+32" -#define Y_TEMP "11*8+4*4*256*2+40" -#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48" -#define UV_OFF_PX "11*8+4*4*256*3+48" -#define UV_OFF_BYTE "11*8+4*4*256*3+56" -#define DITHER16 "11*8+4*4*256*3+64" -#define DITHER32 "11*8+4*4*256*3+80" +#define CHR_MMX_FILTER_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE) +#define DSTW_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2" +#define ESP_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+8" +#define VROUNDER_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+16" +#define U_TEMP "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+24" +#define V_TEMP "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+32" +#define Y_TEMP "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+40" +#define ALP_MMX_FILTER_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+48" +#define UV_OFF_PX "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+48" +#define UV_OFF_BYTE "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+56" +#define DITHER16 "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+64" +#define DITHER32 "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+80" +#define DITHER32_INT (11*8+4*4*MAX_FILTER_SIZE*3+80) // value equal to above, used for checking that the struct hasn't been changed by mistake DECLARE_ALIGNED(8, uint64_t, redDither); DECLARE_ALIGNED(8, uint64_t, greenDither); @@ -418,8 +457,8 @@ typedef struct SwsContext { // alignment of these values is not necessary, but merely here // to maintain the same offset across x8632 and x86-64. Once we // use proper offset macros in the asm, they can be removed. - DECLARE_ALIGNED(8, ptrdiff_t, uv_off_px); ///< offset (in pixels) between u and v planes - DECLARE_ALIGNED(8, ptrdiff_t, uv_off_byte); ///< offset (in bytes) between u and v planes + DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes + DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes DECLARE_ALIGNED(8, uint16_t, dither16)[8]; DECLARE_ALIGNED(8, uint32_t, dither32)[8]; @@ -436,6 +475,18 @@ typedef struct SwsContext { vector signed short *vYCoeffsBank, *vCCoeffsBank; #endif + int use_mmx_vfilter; + +/* pre defined color-spaces gamma */ +#define XYZ_GAMMA (2.6f) +#define RGB_GAMMA (2.2f) + int16_t *xyzgamma; + int16_t *rgbgamma; + int16_t *xyzgammainv; + int16_t *rgbgammainv; + int16_t xyz2rgb_matrix[3][4]; + int16_t rgb2xyz_matrix[3][4]; + /* function pointers for swscale() */ yuv2planar1_fn yuv2plane1; yuv2planarX_fn yuv2planeX; @@ -446,24 +497,25 @@ typedef struct SwsContext { yuv2anyX_fn yuv2anyX; /// Unscaled conversion of luma plane to YV12 for horizontal scaler. - void (*lumToYV12)(uint8_t *dst, const uint8_t *src, + void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3, int width, uint32_t *pal); /// Unscaled conversion of alpha plane to YV12 for horizontal scaler. - void (*alpToYV12)(uint8_t *dst, const uint8_t *src, + void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3, int width, uint32_t *pal); /// Unscaled conversion of chroma planes to YV12 for horizontal scaler. void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, + const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, int width, uint32_t *pal); /** * Functions to read planar input, such as planar RGB, and convert - * internally to Y/UV. + * internally to Y/UV/A. */ /** @{ */ - void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width); + void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv); void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], - int width); + int width, int32_t *rgb2yuv); + void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv); /** @} */ /** @@ -539,6 +591,8 @@ typedef struct SwsContext { void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width); int needs_hcscale; ///< Set if there are chroma planes to be converted. + + SwsDither dither; } SwsContext; //FIXME check init (where 0) @@ -552,10 +606,18 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4], void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex, int lastInLumBuf, int lastInChrBuf); +av_cold void ff_sws_init_range_convert(SwsContext *c); + SwsFunc ff_yuv2rgb_init_x86(SwsContext *c); SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c); +#if FF_API_SWS_FORMAT_NAME +/** + * @deprecated Use av_get_pix_fmt_name() instead. + */ +attribute_deprecated const char *sws_format_name(enum AVPixelFormat format); +#endif static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt) { @@ -568,9 +630,11 @@ static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); av_assert0(desc); - return desc->comp[0].depth_minus1 == 8 || desc->comp[0].depth_minus1 == 9; + return desc->comp[0].depth_minus1 >= 8 && desc->comp[0].depth_minus1 <= 13; } +#define isNBPS(x) is9_OR_10BPS(x) + static av_always_inline int isBE(enum AVPixelFormat pix_fmt) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); @@ -601,8 +665,8 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt) #if 0 // FIXME #define isGray(x) \ - (!(av_pix_fmt_descriptors[x].flags & AV_PIX_FMT_FLAG_PAL) && \ - av_pix_fmt_descriptors[x].nb_components <= 2) + (!(av_pix_fmt_desc_get(x)->flags & AV_PIX_FMT_FLAG_PAL) && \ + av_pix_fmt_desc_get(x)->nb_components <= 2) #else #define isGray(x) \ ((x) == AV_PIX_FMT_GRAY8 || \ @@ -613,8 +677,9 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt) (x) == AV_PIX_FMT_YA16LE) #endif -#define isRGBinInt(x) \ - ((x) == AV_PIX_FMT_RGB48BE || \ +#define isRGBinInt(x) \ + ( \ + (x) == AV_PIX_FMT_RGB48BE || \ (x) == AV_PIX_FMT_RGB48LE || \ (x) == AV_PIX_FMT_RGB32 || \ (x) == AV_PIX_FMT_RGB32_1 || \ @@ -631,10 +696,11 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt) (x) == AV_PIX_FMT_RGBA64BE || \ (x) == AV_PIX_FMT_RGBA64LE || \ (x) == AV_PIX_FMT_MONOBLACK || \ - (x) == AV_PIX_FMT_MONOWHITE) - -#define isBGRinInt(x) \ - ((x) == AV_PIX_FMT_BGR48BE || \ + (x) == AV_PIX_FMT_MONOWHITE \ + ) +#define isBGRinInt(x) \ + ( \ + (x) == AV_PIX_FMT_BGR48BE || \ (x) == AV_PIX_FMT_BGR48LE || \ (x) == AV_PIX_FMT_BGR32 || \ (x) == AV_PIX_FMT_BGR32_1 || \ @@ -651,19 +717,73 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt) (x) == AV_PIX_FMT_BGRA64BE || \ (x) == AV_PIX_FMT_BGRA64LE || \ (x) == AV_PIX_FMT_MONOBLACK || \ - (x) == AV_PIX_FMT_MONOWHITE) - -#define isAnyRGB(x) \ - (isRGBinInt(x) || \ - isBGRinInt(x)) + (x) == AV_PIX_FMT_MONOWHITE \ + ) + +#define isRGBinBytes(x) ( \ + (x) == AV_PIX_FMT_RGB48BE \ + || (x) == AV_PIX_FMT_RGB48LE \ + || (x) == AV_PIX_FMT_RGBA64BE \ + || (x) == AV_PIX_FMT_RGBA64LE \ + || (x) == AV_PIX_FMT_RGBA \ + || (x) == AV_PIX_FMT_ARGB \ + || (x) == AV_PIX_FMT_RGB24 \ + ) +#define isBGRinBytes(x) ( \ + (x) == AV_PIX_FMT_BGR48BE \ + || (x) == AV_PIX_FMT_BGR48LE \ + || (x) == AV_PIX_FMT_BGRA64BE \ + || (x) == AV_PIX_FMT_BGRA64LE \ + || (x) == AV_PIX_FMT_BGRA \ + || (x) == AV_PIX_FMT_ABGR \ + || (x) == AV_PIX_FMT_BGR24 \ + ) + +#define isBayer(x) ( \ + (x)==AV_PIX_FMT_BAYER_BGGR8 \ + || (x)==AV_PIX_FMT_BAYER_BGGR16LE \ + || (x)==AV_PIX_FMT_BAYER_BGGR16BE \ + || (x)==AV_PIX_FMT_BAYER_RGGB8 \ + || (x)==AV_PIX_FMT_BAYER_RGGB16LE \ + || (x)==AV_PIX_FMT_BAYER_RGGB16BE \ + || (x)==AV_PIX_FMT_BAYER_GBRG8 \ + || (x)==AV_PIX_FMT_BAYER_GBRG16LE \ + || (x)==AV_PIX_FMT_BAYER_GBRG16BE \ + || (x)==AV_PIX_FMT_BAYER_GRBG8 \ + || (x)==AV_PIX_FMT_BAYER_GRBG16LE \ + || (x)==AV_PIX_FMT_BAYER_GRBG16BE \ + ) + +#define isAnyRGB(x) \ + ( \ + isBayer(x) || \ + isRGBinInt(x) || \ + isBGRinInt(x) || \ + isRGB(x) \ + ) static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); av_assert0(desc); - return desc->nb_components == 2 || desc->nb_components == 4; + if (pix_fmt == AV_PIX_FMT_PAL8) + return 1; + return desc->flags & AV_PIX_FMT_FLAG_ALPHA; } +#if 1 +#define isPacked(x) ( \ + (x)==AV_PIX_FMT_PAL8 \ + || (x)==AV_PIX_FMT_YUYV422 \ + || (x)==AV_PIX_FMT_YVYU422 \ + || (x)==AV_PIX_FMT_UYVY422 \ + || (x)==AV_PIX_FMT_YA8 \ + || (x)==AV_PIX_FMT_YA16LE \ + || (x)==AV_PIX_FMT_YA16BE \ + || isRGBinInt(x) \ + || isBGRinInt(x) \ + ) +#else static av_always_inline int isPacked(enum AVPixelFormat pix_fmt) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); @@ -672,6 +792,7 @@ static av_always_inline int isPacked(enum AVPixelFormat pix_fmt) pix_fmt == AV_PIX_FMT_PAL8); } +#endif static av_always_inline int isPlanar(enum AVPixelFormat pix_fmt) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); @@ -698,18 +819,19 @@ static av_always_inline int usePal(enum AVPixelFormat pix_fmt) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); av_assert0(desc); - return ((desc->flags & AV_PIX_FMT_FLAG_PAL) || (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL) || - pix_fmt == AV_PIX_FMT_YA8); + return (desc->flags & AV_PIX_FMT_FLAG_PAL) || (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL); } extern const uint64_t ff_dither4[2]; extern const uint64_t ff_dither8[2]; -extern const uint8_t ff_dither_4x4_16[4][8]; -extern const uint8_t ff_dither_8x8_32[8][8]; -extern const uint8_t ff_dither_8x8_73[8][8]; -extern const uint8_t ff_dither_8x8_128[8][8]; -extern const uint8_t ff_dither_8x8_220[8][8]; +extern const uint8_t ff_dither_2x2_4[3][8]; +extern const uint8_t ff_dither_2x2_8[3][8]; +extern const uint8_t ff_dither_4x4_16[5][8]; +extern const uint8_t ff_dither_8x8_32[9][8]; +extern const uint8_t ff_dither_8x8_73[9][8]; +extern const uint8_t ff_dither_8x8_128[9][8]; +extern const uint8_t ff_dither_8x8_220[9][8]; extern const int32_t ff_yuv2rgb_coeffs[8][4]; @@ -721,6 +843,7 @@ extern const AVClass sws_context_class; */ void ff_get_unscaled_swscale(SwsContext *c); void ff_get_unscaled_swscale_ppc(SwsContext *c); +void ff_get_unscaled_swscale_arm(SwsContext *c); /** * Return function pointer to fastest main scaler path function depending @@ -740,4 +863,39 @@ void ff_sws_init_output_funcs(SwsContext *c, void ff_sws_init_swscale_ppc(SwsContext *c); void ff_sws_init_swscale_x86(SwsContext *c); +void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth, + const uint8_t *src, int srcW, int xInc); +void ff_hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2, + int dstWidth, const uint8_t *src1, + const uint8_t *src2, int srcW, int xInc); +int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, + int16_t *filter, int32_t *filterPos, + int numSplits); +void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, + int dstWidth, const uint8_t *src, + int srcW, int xInc); +void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, + int dstWidth, const uint8_t *src1, + const uint8_t *src2, int srcW, int xInc); + +static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y, + int alpha, int bits, const int big_endian) +{ + int i, j; + uint8_t *ptr = plane + stride * y; + int v = alpha ? 0xFFFF>>(15-bits) : (1<<bits); + for (i = 0; i < height; i++) { +#define FILL(wfunc) \ + for (j = 0; j < width; j++) {\ + wfunc(ptr+2*j, v);\ + } + if (big_endian) { + FILL(AV_WB16); + } else { + FILL(AV_WL16); + } + ptr += stride; + } +} + #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index ffc813e..da457df 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,7 +23,6 @@ #include <math.h> #include <stdio.h> #include "config.h" -#include <assert.h> #include "swscale.h" #include "swscale_internal.h" #include "rgb2rgb.h" @@ -33,58 +32,101 @@ #include "libavutil/mathematics.h" #include "libavutil/bswap.h" #include "libavutil/pixdesc.h" +#include "libavutil/avassert.h" -DECLARE_ALIGNED(8, static const uint8_t, dither_8x8_1)[8][8] = { - { 0, 1, 0, 1, 0, 1, 0, 1,}, - { 1, 0, 1, 0, 1, 0, 1, 0,}, - { 0, 1, 0, 1, 0, 1, 0, 1,}, - { 1, 0, 1, 0, 1, 0, 1, 0,}, - { 0, 1, 0, 1, 0, 1, 0, 1,}, - { 1, 0, 1, 0, 1, 0, 1, 0,}, - { 0, 1, 0, 1, 0, 1, 0, 1,}, - { 1, 0, 1, 0, 1, 0, 1, 0,}, -}; -DECLARE_ALIGNED(8, static const uint8_t, dither_8x8_3)[8][8] = { - { 1, 2, 1, 2, 1, 2, 1, 2,}, - { 3, 0, 3, 0, 3, 0, 3, 0,}, - { 1, 2, 1, 2, 1, 2, 1, 2,}, - { 3, 0, 3, 0, 3, 0, 3, 0,}, - { 1, 2, 1, 2, 1, 2, 1, 2,}, - { 3, 0, 3, 0, 3, 0, 3, 0,}, - { 1, 2, 1, 2, 1, 2, 1, 2,}, - { 3, 0, 3, 0, 3, 0, 3, 0,}, -}; -DECLARE_ALIGNED(8, static const uint8_t, dither_8x8_64)[8][8] = { - { 18, 34, 30, 46, 17, 33, 29, 45,}, - { 50, 2, 62, 14, 49, 1, 61, 13,}, - { 26, 42, 22, 38, 25, 41, 21, 37,}, - { 58, 10, 54, 6, 57, 9, 53, 5,}, - { 16, 32, 28, 44, 19, 35, 31, 47,}, - { 48, 0, 60, 12, 51, 3, 63, 15,}, - { 24, 40, 20, 36, 27, 43, 23, 39,}, - { 56, 8, 52, 4, 59, 11, 55, 7,}, -}; -DECLARE_ALIGNED(8, static const uint8_t, dither_8x8_256)[8][8] = { - { 72, 136, 120, 184, 68, 132, 116, 180,}, - { 200, 8, 248, 56, 196, 4, 244, 52,}, - { 104, 168, 88, 152, 100, 164, 84, 148,}, - { 232, 40, 216, 24, 228, 36, 212, 20,}, - { 64, 128, 102, 176, 76, 140, 124, 188,}, - { 192, 0, 240, 48, 204, 12, 252, 60,}, - { 96, 160, 80, 144, 108, 172, 92, 156,}, - { 224, 32, 208, 16, 236, 44, 220, 28,}, +DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={ +{ + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, + { 0, 1, 0, 1, 0, 1, 0, 1,}, + { 1, 0, 1, 0, 1, 0, 1, 0,}, +},{ + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, + { 1, 2, 1, 2, 1, 2, 1, 2,}, + { 3, 0, 3, 0, 3, 0, 3, 0,}, +},{ + { 2, 4, 3, 5, 2, 4, 3, 5,}, + { 6, 0, 7, 1, 6, 0, 7, 1,}, + { 3, 5, 2, 4, 3, 5, 2, 4,}, + { 7, 1, 6, 0, 7, 1, 6, 0,}, + { 2, 4, 3, 5, 2, 4, 3, 5,}, + { 6, 0, 7, 1, 6, 0, 7, 1,}, + { 3, 5, 2, 4, 3, 5, 2, 4,}, + { 7, 1, 6, 0, 7, 1, 6, 0,}, +},{ + { 4, 8, 7, 11, 4, 8, 7, 11,}, + { 12, 0, 15, 3, 12, 0, 15, 3,}, + { 6, 10, 5, 9, 6, 10, 5, 9,}, + { 14, 2, 13, 1, 14, 2, 13, 1,}, + { 4, 8, 7, 11, 4, 8, 7, 11,}, + { 12, 0, 15, 3, 12, 0, 15, 3,}, + { 6, 10, 5, 9, 6, 10, 5, 9,}, + { 14, 2, 13, 1, 14, 2, 13, 1,}, +},{ + { 9, 17, 15, 23, 8, 16, 14, 22,}, + { 25, 1, 31, 7, 24, 0, 30, 6,}, + { 13, 21, 11, 19, 12, 20, 10, 18,}, + { 29, 5, 27, 3, 28, 4, 26, 2,}, + { 8, 16, 14, 22, 9, 17, 15, 23,}, + { 24, 0, 30, 6, 25, 1, 31, 7,}, + { 12, 20, 10, 18, 13, 21, 11, 19,}, + { 28, 4, 26, 2, 29, 5, 27, 3,}, +},{ + { 18, 34, 30, 46, 17, 33, 29, 45,}, + { 50, 2, 62, 14, 49, 1, 61, 13,}, + { 26, 42, 22, 38, 25, 41, 21, 37,}, + { 58, 10, 54, 6, 57, 9, 53, 5,}, + { 16, 32, 28, 44, 19, 35, 31, 47,}, + { 48, 0, 60, 12, 51, 3, 63, 15,}, + { 24, 40, 20, 36, 27, 43, 23, 39,}, + { 56, 8, 52, 4, 59, 11, 55, 7,}, +},{ + { 18, 34, 30, 46, 17, 33, 29, 45,}, + { 50, 2, 62, 14, 49, 1, 61, 13,}, + { 26, 42, 22, 38, 25, 41, 21, 37,}, + { 58, 10, 54, 6, 57, 9, 53, 5,}, + { 16, 32, 28, 44, 19, 35, 31, 47,}, + { 48, 0, 60, 12, 51, 3, 63, 15,}, + { 24, 40, 20, 36, 27, 43, 23, 39,}, + { 56, 8, 52, 4, 59, 11, 55, 7,}, +},{ + { 36, 68, 60, 92, 34, 66, 58, 90,}, + { 100, 4,124, 28, 98, 2,122, 26,}, + { 52, 84, 44, 76, 50, 82, 42, 74,}, + { 116, 20,108, 12,114, 18,106, 10,}, + { 32, 64, 56, 88, 38, 70, 62, 94,}, + { 96, 0,120, 24,102, 6,126, 30,}, + { 48, 80, 40, 72, 54, 86, 46, 78,}, + { 112, 16,104, 8,118, 22,110, 14,}, +}}; + +static const uint16_t dither_scale[15][16]={ +{ 2, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,}, +{ 2, 3, 7, 7, 13, 13, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,}, +{ 3, 3, 4, 15, 15, 29, 57, 57, 57, 113, 113, 113, 113, 113, 113, 113,}, +{ 3, 4, 4, 5, 31, 31, 61, 121, 241, 241, 241, 241, 481, 481, 481, 481,}, +{ 3, 4, 5, 5, 6, 63, 63, 125, 249, 497, 993, 993, 993, 993, 993, 1985,}, +{ 3, 5, 6, 6, 6, 7, 127, 127, 253, 505, 1009, 2017, 4033, 4033, 4033, 4033,}, +{ 3, 5, 6, 7, 7, 7, 8, 255, 255, 509, 1017, 2033, 4065, 8129,16257,16257,}, +{ 3, 5, 6, 8, 8, 8, 8, 9, 511, 511, 1021, 2041, 4081, 8161,16321,32641,}, +{ 3, 5, 7, 8, 9, 9, 9, 9, 10, 1023, 1023, 2045, 4089, 8177,16353,32705,}, +{ 3, 5, 7, 8, 10, 10, 10, 10, 10, 11, 2047, 2047, 4093, 8185,16369,32737,}, +{ 3, 5, 7, 8, 10, 11, 11, 11, 11, 11, 12, 4095, 4095, 8189,16377,32753,}, +{ 3, 5, 7, 9, 10, 12, 12, 12, 12, 12, 12, 13, 8191, 8191,16381,32761,}, +{ 3, 5, 7, 9, 10, 12, 13, 13, 13, 13, 13, 13, 14,16383,16383,32765,}, +{ 3, 5, 7, 9, 10, 12, 14, 14, 14, 14, 14, 14, 14, 15,32767,32767,}, +{ 3, 5, 7, 9, 11, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16,65535,}, }; -#define RGB2YUV_SHIFT 15 -#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) -#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)) static void fillPlane(uint8_t *plane, int stride, int width, int height, int y, uint8_t val) @@ -97,27 +139,6 @@ static void fillPlane(uint8_t *plane, int stride, int width, int height, int y, } } -static void fill_plane9or10(uint8_t *plane, int stride, int width, - int height, int y, uint8_t val, - const int dst_depth, const int big_endian) -{ - int i, j; - uint16_t *dst = (uint16_t *) (plane + stride * y); -#define FILL8TO9_OR_10(wfunc) \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < width; j++) { \ - wfunc(&dst[j], (val << (dst_depth - 8)) | \ - (val >> (16 - dst_depth))); \ - } \ - dst += stride / 2; \ - } - if (big_endian) { - FILL8TO9_OR_10(AV_WB16); - } else { - FILL8TO9_OR_10(AV_WL16); - } -} - static void copyPlane(const uint8_t *src, int srcStride, int srcSliceY, int srcSliceH, int width, uint8_t *dst, int dstStride) @@ -321,19 +342,23 @@ static int packed_16bpc_bswap(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int i, j; - int srcstr = srcStride[0] >> 1; - int dststr = dstStride[0] >> 1; - uint16_t *dstPtr = (uint16_t *) dst[0]; - const uint16_t *srcPtr = (const uint16_t *) src[0]; - int min_stride = FFMIN(srcstr, dststr); - - for (i = 0; i < srcSliceH; i++) { - for (j = 0; j < min_stride; j++) { - dstPtr[j] = av_bswap16(srcPtr[j]); + int i, j, p; + + for (p = 0; p < 4; p++) { + int srcstr = srcStride[p] / 2; + int dststr = dstStride[p] / 2; + uint16_t *dstPtr = (uint16_t *) dst[p]; + const uint16_t *srcPtr = (const uint16_t *) src[p]; + int min_stride = FFMIN(FFABS(srcstr), FFABS(dststr)); + if(!dstPtr || !srcPtr) + continue; + for (i = 0; i < (srcSliceH >> c->chrDstVSubSample); i++) { + for (j = 0; j < min_stride; j++) { + dstPtr[j] = av_bswap16(srcPtr[j]); + } + srcPtr += srcstr; + dstPtr += dststr; } - srcPtr += srcstr; - dstPtr += dststr; } return srcSliceH; @@ -373,7 +398,7 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[], if (!conv) av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n", - sws_format_name(srcFormat), sws_format_name(dstFormat)); + av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat)); else { for (i = 0; i < srcSliceH; i++) { conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb); @@ -385,6 +410,371 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[], return srcSliceH; } +static void packed16togbra16(const uint8_t *src, int srcStride, + uint16_t *dst[], int dstStride[], int srcSliceH, + int src_alpha, int swap, int shift, int width) +{ + int x, h, i; + int dst_alpha = dst[3] != NULL; + for (h = 0; h < srcSliceH; h++) { + uint16_t *src_line = (uint16_t *)(src + srcStride * h); + switch (swap) { + case 3: + if (src_alpha && dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[3][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + } + } else if (dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[3][x] = 0xFFFF; + } + } else if (src_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + src_line++; + } + } else { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift); + } + } + break; + case 2: + if (src_alpha && dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++ >> shift); + dst[1][x] = av_bswap16(*src_line++ >> shift); + dst[2][x] = av_bswap16(*src_line++ >> shift); + dst[3][x] = av_bswap16(*src_line++ >> shift); + } + } else if (dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++ >> shift); + dst[1][x] = av_bswap16(*src_line++ >> shift); + dst[2][x] = av_bswap16(*src_line++ >> shift); + dst[3][x] = 0xFFFF; + } + } else if (src_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++ >> shift); + dst[1][x] = av_bswap16(*src_line++ >> shift); + dst[2][x] = av_bswap16(*src_line++ >> shift); + src_line++; + } + } else { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++ >> shift); + dst[1][x] = av_bswap16(*src_line++ >> shift); + dst[2][x] = av_bswap16(*src_line++ >> shift); + } + } + break; + case 1: + if (src_alpha && dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++) >> shift; + dst[1][x] = av_bswap16(*src_line++) >> shift; + dst[2][x] = av_bswap16(*src_line++) >> shift; + dst[3][x] = av_bswap16(*src_line++) >> shift; + } + } else if (dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++) >> shift; + dst[1][x] = av_bswap16(*src_line++) >> shift; + dst[2][x] = av_bswap16(*src_line++) >> shift; + dst[3][x] = 0xFFFF; + } + } else if (src_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++) >> shift; + dst[1][x] = av_bswap16(*src_line++) >> shift; + dst[2][x] = av_bswap16(*src_line++) >> shift; + src_line++; + } + } else { + for (x = 0; x < width; x++) { + dst[0][x] = av_bswap16(*src_line++) >> shift; + dst[1][x] = av_bswap16(*src_line++) >> shift; + dst[2][x] = av_bswap16(*src_line++) >> shift; + } + } + break; + default: + if (src_alpha && dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = *src_line++ >> shift; + dst[1][x] = *src_line++ >> shift; + dst[2][x] = *src_line++ >> shift; + dst[3][x] = *src_line++ >> shift; + } + } else if (dst_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = *src_line++ >> shift; + dst[1][x] = *src_line++ >> shift; + dst[2][x] = *src_line++ >> shift; + dst[3][x] = 0xFFFF; + } + } else if (src_alpha) { + for (x = 0; x < width; x++) { + dst[0][x] = *src_line++ >> shift; + dst[1][x] = *src_line++ >> shift; + dst[2][x] = *src_line++ >> shift; + src_line++; + } + } else { + for (x = 0; x < width; x++) { + dst[0][x] = *src_line++ >> shift; + dst[1][x] = *src_line++ >> shift; + dst[2][x] = *src_line++ >> shift; + } + } + } + for (i = 0; i < 4; i++) + dst[i] += dstStride[i] >> 1; + } +} + +static int Rgb16ToPlanarRgb16Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + uint16_t *dst2013[] = { (uint16_t *)dst[2], (uint16_t *)dst[0], (uint16_t *)dst[1], (uint16_t *)dst[3] }; + uint16_t *dst1023[] = { (uint16_t *)dst[1], (uint16_t *)dst[0], (uint16_t *)dst[2], (uint16_t *)dst[3] }; + int stride2013[] = { dstStride[2], dstStride[0], dstStride[1], dstStride[3] }; + int stride1023[] = { dstStride[1], dstStride[0], dstStride[2], dstStride[3] }; + const AVPixFmtDescriptor *src_format = av_pix_fmt_desc_get(c->srcFormat); + const AVPixFmtDescriptor *dst_format = av_pix_fmt_desc_get(c->dstFormat); + int bpc = dst_format->comp[0].depth_minus1 + 1; + int alpha = src_format->flags & AV_PIX_FMT_FLAG_ALPHA; + int swap = 0; + if ( HAVE_BIGENDIAN && !(src_format->flags & AV_PIX_FMT_FLAG_BE) || + !HAVE_BIGENDIAN && src_format->flags & AV_PIX_FMT_FLAG_BE) + swap++; + if ( HAVE_BIGENDIAN && !(dst_format->flags & AV_PIX_FMT_FLAG_BE) || + !HAVE_BIGENDIAN && dst_format->flags & AV_PIX_FMT_FLAG_BE) + swap += 2; + + if ((dst_format->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) != + (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB) || bpc < 9) { + av_log(c, AV_LOG_ERROR, "unsupported conversion to planar RGB %s -> %s\n", + src_format->name, dst_format->name); + return srcSliceH; + } + switch (c->srcFormat) { + case AV_PIX_FMT_RGB48LE: + case AV_PIX_FMT_RGB48BE: + case AV_PIX_FMT_RGBA64LE: + case AV_PIX_FMT_RGBA64BE: + packed16togbra16(src[0] + srcSliceY * srcStride[0], srcStride[0], + dst2013, stride2013, srcSliceH, alpha, swap, + 16 - bpc, c->srcW); + break; + case AV_PIX_FMT_BGR48LE: + case AV_PIX_FMT_BGR48BE: + case AV_PIX_FMT_BGRA64LE: + case AV_PIX_FMT_BGRA64BE: + packed16togbra16(src[0] + srcSliceY * srcStride[0], srcStride[0], + dst1023, stride1023, srcSliceH, alpha, swap, + 16 - bpc, c->srcW); + break; + default: + av_log(c, AV_LOG_ERROR, + "unsupported conversion to planar RGB %s -> %s\n", + src_format->name, dst_format->name); + } + + return srcSliceH; +} + +static void gbr16ptopacked16(const uint16_t *src[], int srcStride[], + uint8_t *dst, int dstStride, int srcSliceH, + int alpha, int swap, int bpp, int width) +{ + int x, h, i; + int src_alpha = src[3] != NULL; + int scale_high = 16 - bpp, scale_low = (bpp - 8) * 2; + for (h = 0; h < srcSliceH; h++) { + uint16_t *dest = (uint16_t *)(dst + dstStride * h); + uint16_t component; + + switch(swap) { + case 3: + if (alpha && !src_alpha) { + for (x = 0; x < width; x++) { + component = av_bswap16(src[0][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + component = av_bswap16(src[1][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + component = av_bswap16(src[2][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + *dest++ = 0xffff; + } + } else if (alpha && src_alpha) { + for (x = 0; x < width; x++) { + component = av_bswap16(src[0][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + component = av_bswap16(src[1][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + component = av_bswap16(src[2][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + component = av_bswap16(src[3][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + } + } else { + for (x = 0; x < width; x++) { + component = av_bswap16(src[0][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + component = av_bswap16(src[1][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + component = av_bswap16(src[2][x]); + *dest++ = av_bswap16(component << scale_high | component >> scale_low); + } + } + break; + case 2: + if (alpha && !src_alpha) { + for (x = 0; x < width; x++) { + *dest++ = av_bswap16(src[0][x] << scale_high | src[0][x] >> scale_low); + *dest++ = av_bswap16(src[1][x] << scale_high | src[1][x] >> scale_low); + *dest++ = av_bswap16(src[2][x] << scale_high | src[2][x] >> scale_low); + *dest++ = 0xffff; + } + } else if (alpha && src_alpha) { + for (x = 0; x < width; x++) { + *dest++ = av_bswap16(src[0][x] << scale_high | src[0][x] >> scale_low); + *dest++ = av_bswap16(src[1][x] << scale_high | src[1][x] >> scale_low); + *dest++ = av_bswap16(src[2][x] << scale_high | src[2][x] >> scale_low); + *dest++ = av_bswap16(src[3][x] << scale_high | src[3][x] >> scale_low); + } + } else { + for (x = 0; x < width; x++) { + *dest++ = av_bswap16(src[0][x] << scale_high | src[0][x] >> scale_low); + *dest++ = av_bswap16(src[1][x] << scale_high | src[1][x] >> scale_low); + *dest++ = av_bswap16(src[2][x] << scale_high | src[2][x] >> scale_low); + } + } + break; + case 1: + if (alpha && !src_alpha) { + for (x = 0; x < width; x++) { + *dest++ = av_bswap16(src[0][x]) << scale_high | av_bswap16(src[0][x]) >> scale_low; + *dest++ = av_bswap16(src[1][x]) << scale_high | av_bswap16(src[1][x]) >> scale_low; + *dest++ = av_bswap16(src[2][x]) << scale_high | av_bswap16(src[2][x]) >> scale_low; + *dest++ = 0xffff; + } + } else if (alpha && src_alpha) { + for (x = 0; x < width; x++) { + *dest++ = av_bswap16(src[0][x]) << scale_high | av_bswap16(src[0][x]) >> scale_low; + *dest++ = av_bswap16(src[1][x]) << scale_high | av_bswap16(src[1][x]) >> scale_low; + *dest++ = av_bswap16(src[2][x]) << scale_high | av_bswap16(src[2][x]) >> scale_low; + *dest++ = av_bswap16(src[3][x]) << scale_high | av_bswap16(src[3][x]) >> scale_low; + } + } else { + for (x = 0; x < width; x++) { + *dest++ = av_bswap16(src[0][x]) << scale_high | av_bswap16(src[0][x]) >> scale_low; + *dest++ = av_bswap16(src[1][x]) << scale_high | av_bswap16(src[1][x]) >> scale_low; + *dest++ = av_bswap16(src[2][x]) << scale_high | av_bswap16(src[2][x]) >> scale_low; + } + } + break; + default: + if (alpha && !src_alpha) { + for (x = 0; x < width; x++) { + *dest++ = src[0][x] << scale_high | src[0][x] >> scale_low; + *dest++ = src[1][x] << scale_high | src[1][x] >> scale_low; + *dest++ = src[2][x] << scale_high | src[2][x] >> scale_low; + *dest++ = 0xffff; + } + } else if (alpha && src_alpha) { + for (x = 0; x < width; x++) { + *dest++ = src[0][x] << scale_high | src[0][x] >> scale_low; + *dest++ = src[1][x] << scale_high | src[1][x] >> scale_low; + *dest++ = src[2][x] << scale_high | src[2][x] >> scale_low; + *dest++ = src[3][x] << scale_high | src[3][x] >> scale_low; + } + } else { + for (x = 0; x < width; x++) { + *dest++ = src[0][x] << scale_high | src[0][x] >> scale_low; + *dest++ = src[1][x] << scale_high | src[1][x] >> scale_low; + *dest++ = src[2][x] << scale_high | src[2][x] >> scale_low; + } + } + } + for (i = 0; i < 3 + src_alpha; i++) + src[i] += srcStride[i] >> 1; + } +} + +static int planarRgb16ToRgb16Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + const uint16_t *src102[] = { (uint16_t *)src[1], (uint16_t *)src[0], (uint16_t *)src[2], (uint16_t *)src[3] }; + const uint16_t *src201[] = { (uint16_t *)src[2], (uint16_t *)src[0], (uint16_t *)src[1], (uint16_t *)src[3] }; + int stride102[] = { srcStride[1], srcStride[0], srcStride[2], srcStride[3] }; + int stride201[] = { srcStride[2], srcStride[0], srcStride[1], srcStride[3] }; + const AVPixFmtDescriptor *src_format = av_pix_fmt_desc_get(c->srcFormat); + const AVPixFmtDescriptor *dst_format = av_pix_fmt_desc_get(c->dstFormat); + int bits_per_sample = src_format->comp[0].depth_minus1 + 1; + int swap = 0; + if ( HAVE_BIGENDIAN && !(src_format->flags & AV_PIX_FMT_FLAG_BE) || + !HAVE_BIGENDIAN && src_format->flags & AV_PIX_FMT_FLAG_BE) + swap++; + if ( HAVE_BIGENDIAN && !(dst_format->flags & AV_PIX_FMT_FLAG_BE) || + !HAVE_BIGENDIAN && dst_format->flags & AV_PIX_FMT_FLAG_BE) + swap += 2; + + if ((src_format->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) != + (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB) || + bits_per_sample <= 8) { + av_log(c, AV_LOG_ERROR, "unsupported planar RGB conversion %s -> %s\n", + src_format->name, dst_format->name); + return srcSliceH; + } + switch (c->dstFormat) { + case AV_PIX_FMT_BGR48LE: + case AV_PIX_FMT_BGR48BE: + gbr16ptopacked16(src102, stride102, + dst[0] + srcSliceY * dstStride[0], dstStride[0], + srcSliceH, 0, swap, bits_per_sample, c->srcW); + break; + case AV_PIX_FMT_RGB48LE: + case AV_PIX_FMT_RGB48BE: + gbr16ptopacked16(src201, stride201, + dst[0] + srcSliceY * dstStride[0], dstStride[0], + srcSliceH, 0, swap, bits_per_sample, c->srcW); + break; + case AV_PIX_FMT_RGBA64LE: + case AV_PIX_FMT_RGBA64BE: + gbr16ptopacked16(src201, stride201, + dst[0] + srcSliceY * dstStride[0], dstStride[0], + srcSliceH, 1, swap, bits_per_sample, c->srcW); + break; + case AV_PIX_FMT_BGRA64LE: + case AV_PIX_FMT_BGRA64BE: + gbr16ptopacked16(src102, stride102, + dst[0] + srcSliceY * dstStride[0], dstStride[0], + srcSliceH, 1, swap, bits_per_sample, c->srcW); + break; + default: + av_log(c, AV_LOG_ERROR, + "unsupported planar RGB conversion %s -> %s\n", + src_format->name, dst_format->name); + } + + return srcSliceH; +} + static void gbr24ptopacked24(const uint8_t *src[], int srcStride[], uint8_t *dst, int dstStride, int srcSliceH, int width) @@ -488,6 +878,22 @@ static int planarRgbToRgbWrapper(SwsContext *c, const uint8_t *src[], return srcSliceH; } +static int planarRgbToplanarRgbWrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW, + dst[0], dstStride[0]); + copyPlane(src[1], srcStride[1], srcSliceY, srcSliceH, c->srcW, + dst[1], dstStride[1]); + copyPlane(src[2], srcStride[2], srcSliceY, srcSliceH, c->srcW, + dst[2], dstStride[2]); + if (dst[3]) + fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); + + return srcSliceH; +} + static void packedtogbr24p(const uint8_t *src, int srcStride, uint8_t *dst[], int dstStride[], int srcSliceH, int alpha_first, int inc_size, int width) @@ -562,6 +968,160 @@ static int rgbToPlanarRgbWrapper(SwsContext *c, const uint8_t *src[], return srcSliceH; } +#define BAYER_GBRG +#define BAYER_8 +#define BAYER_RENAME(x) bayer_gbrg8_to_##x +#include "bayer_template.c" + +#define BAYER_GBRG +#define BAYER_16LE +#define BAYER_RENAME(x) bayer_gbrg16le_to_##x +#include "bayer_template.c" + +#define BAYER_GBRG +#define BAYER_16BE +#define BAYER_RENAME(x) bayer_gbrg16be_to_##x +#include "bayer_template.c" + +#define BAYER_GRBG +#define BAYER_8 +#define BAYER_RENAME(x) bayer_grbg8_to_##x +#include "bayer_template.c" + +#define BAYER_GRBG +#define BAYER_16LE +#define BAYER_RENAME(x) bayer_grbg16le_to_##x +#include "bayer_template.c" + +#define BAYER_GRBG +#define BAYER_16BE +#define BAYER_RENAME(x) bayer_grbg16be_to_##x +#include "bayer_template.c" + +#define BAYER_BGGR +#define BAYER_8 +#define BAYER_RENAME(x) bayer_bggr8_to_##x +#include "bayer_template.c" + +#define BAYER_BGGR +#define BAYER_16LE +#define BAYER_RENAME(x) bayer_bggr16le_to_##x +#include "bayer_template.c" + +#define BAYER_BGGR +#define BAYER_16BE +#define BAYER_RENAME(x) bayer_bggr16be_to_##x +#include "bayer_template.c" + +#define BAYER_RGGB +#define BAYER_8 +#define BAYER_RENAME(x) bayer_rggb8_to_##x +#include "bayer_template.c" + +#define BAYER_RGGB +#define BAYER_16LE +#define BAYER_RENAME(x) bayer_rggb16le_to_##x +#include "bayer_template.c" + +#define BAYER_RGGB +#define BAYER_16BE +#define BAYER_RENAME(x) bayer_rggb16be_to_##x +#include "bayer_template.c" + +static int bayer_to_rgb24_wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY, + int srcSliceH, uint8_t* dst[], int dstStride[]) +{ + uint8_t *dstPtr= dst[0]; + const uint8_t *srcPtr= src[0]; + int i; + void (*copy) (const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width); + void (*interpolate)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width); + + switch(c->srcFormat) { +#define CASE(pixfmt, prefix) \ + case pixfmt: copy = bayer_##prefix##_to_rgb24_copy; \ + interpolate = bayer_##prefix##_to_rgb24_interpolate; \ + break; + CASE(AV_PIX_FMT_BAYER_BGGR8, bggr8) + CASE(AV_PIX_FMT_BAYER_BGGR16LE, bggr16le) + CASE(AV_PIX_FMT_BAYER_BGGR16BE, bggr16be) + CASE(AV_PIX_FMT_BAYER_RGGB8, rggb8) + CASE(AV_PIX_FMT_BAYER_RGGB16LE, rggb16le) + CASE(AV_PIX_FMT_BAYER_RGGB16BE, rggb16be) + CASE(AV_PIX_FMT_BAYER_GBRG8, gbrg8) + CASE(AV_PIX_FMT_BAYER_GBRG16LE, gbrg16le) + CASE(AV_PIX_FMT_BAYER_GBRG16BE, gbrg16be) + CASE(AV_PIX_FMT_BAYER_GRBG8, grbg8) + CASE(AV_PIX_FMT_BAYER_GRBG16LE, grbg16le) + CASE(AV_PIX_FMT_BAYER_GRBG16BE, grbg16be) +#undef CASE + default: return 0; + } + + copy(srcPtr, srcStride[0], dstPtr, dstStride[0], c->srcW); + srcPtr += 2 * srcStride[0]; + dstPtr += 2 * dstStride[0]; + + for (i = 2; i < srcSliceH - 2; i += 2) { + interpolate(srcPtr, srcStride[0], dstPtr, dstStride[0], c->srcW); + srcPtr += 2 * srcStride[0]; + dstPtr += 2 * dstStride[0]; + } + + copy(srcPtr, srcStride[0], dstPtr, dstStride[0], c->srcW); + return srcSliceH; +} + +static int bayer_to_yv12_wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY, + int srcSliceH, uint8_t* dst[], int dstStride[]) +{ + const uint8_t *srcPtr= src[0]; + uint8_t *dstY= dst[0]; + uint8_t *dstU= dst[1]; + uint8_t *dstV= dst[2]; + int i; + void (*copy) (const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv); + void (*interpolate)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv); + + switch(c->srcFormat) { +#define CASE(pixfmt, prefix) \ + case pixfmt: copy = bayer_##prefix##_to_yv12_copy; \ + interpolate = bayer_##prefix##_to_yv12_interpolate; \ + break; + CASE(AV_PIX_FMT_BAYER_BGGR8, bggr8) + CASE(AV_PIX_FMT_BAYER_BGGR16LE, bggr16le) + CASE(AV_PIX_FMT_BAYER_BGGR16BE, bggr16be) + CASE(AV_PIX_FMT_BAYER_RGGB8, rggb8) + CASE(AV_PIX_FMT_BAYER_RGGB16LE, rggb16le) + CASE(AV_PIX_FMT_BAYER_RGGB16BE, rggb16be) + CASE(AV_PIX_FMT_BAYER_GBRG8, gbrg8) + CASE(AV_PIX_FMT_BAYER_GBRG16LE, gbrg16le) + CASE(AV_PIX_FMT_BAYER_GBRG16BE, gbrg16be) + CASE(AV_PIX_FMT_BAYER_GRBG8, grbg8) + CASE(AV_PIX_FMT_BAYER_GRBG16LE, grbg16le) + CASE(AV_PIX_FMT_BAYER_GRBG16BE, grbg16be) +#undef CASE + default: return 0; + } + + copy(srcPtr, srcStride[0], dstY, dstU, dstV, dstStride[0], c->srcW, c->input_rgb2yuv_table); + srcPtr += 2 * srcStride[0]; + dstY += 2 * dstStride[0]; + dstU += dstStride[1]; + dstV += dstStride[1]; + + for (i = 2; i < srcSliceH - 2; i += 2) { + interpolate(srcPtr, srcStride[0], dstY, dstU, dstV, dstStride[0], c->srcW, c->input_rgb2yuv_table); + srcPtr += 2 * srcStride[0]; + dstY += 2 * dstStride[0]; + dstU += dstStride[1]; + dstV += dstStride[1]; + } + + copy(srcPtr, srcStride[0], dstY, dstU, dstV, dstStride[0], c->srcW, c->input_rgb2yuv_table); + return srcSliceH; +} + #define isRGBA32(x) ( \ (x) == AV_PIX_FMT_ARGB \ || (x) == AV_PIX_FMT_RGBA \ @@ -569,6 +1129,20 @@ static int rgbToPlanarRgbWrapper(SwsContext *c, const uint8_t *src[], || (x) == AV_PIX_FMT_ABGR \ ) +#define isRGBA64(x) ( \ + (x) == AV_PIX_FMT_RGBA64LE \ + || (x) == AV_PIX_FMT_RGBA64BE \ + || (x) == AV_PIX_FMT_BGRA64LE \ + || (x) == AV_PIX_FMT_BGRA64BE \ + ) + +#define isRGB48(x) ( \ + (x) == AV_PIX_FMT_RGB48LE \ + || (x) == AV_PIX_FMT_RGB48BE \ + || (x) == AV_PIX_FMT_BGR48LE \ + || (x) == AV_PIX_FMT_BGR48BE \ + ) + /* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */ typedef void (* rgbConvFn) (const uint8_t *, uint8_t *, int); static rgbConvFn findRgbConvFn(SwsContext *c) @@ -578,17 +1152,11 @@ static rgbConvFn findRgbConvFn(SwsContext *c) const int srcId = c->srcFormatBpp; const int dstId = c->dstFormatBpp; rgbConvFn conv = NULL; - const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat); - const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat); #define IS_NOT_NE(bpp, desc) \ (((bpp + 7) >> 3) == 2 && \ (!(desc->flags & AV_PIX_FMT_FLAG_BE) != !HAVE_BIGENDIAN)) - /* if this is non-native rgb444/555/565, don't handle it here. */ - if (IS_NOT_NE(srcId, desc_src) || IS_NOT_NE(dstId, desc_dst)) - return NULL; - #define CONV_IS(src, dst) (srcFormat == AV_PIX_FMT_##src && dstFormat == AV_PIX_FMT_##dst) if (isRGBA32(srcFormat) && isRGBA32(dstFormat)) { @@ -604,6 +1172,32 @@ static rgbConvFn findRgbConvFn(SwsContext *c) || CONV_IS(RGBA, BGRA)) conv = shuffle_bytes_2103; else if (CONV_IS(BGRA, ABGR) || CONV_IS(RGBA, ARGB)) conv = shuffle_bytes_3012; + } else if (isRGB48(srcFormat) && isRGB48(dstFormat)) { + if (CONV_IS(RGB48LE, BGR48LE) + || CONV_IS(BGR48LE, RGB48LE) + || CONV_IS(RGB48BE, BGR48BE) + || CONV_IS(BGR48BE, RGB48BE)) conv = rgb48tobgr48_nobswap; + else if (CONV_IS(RGB48LE, BGR48BE) + || CONV_IS(BGR48LE, RGB48BE) + || CONV_IS(RGB48BE, BGR48LE) + || CONV_IS(BGR48BE, RGB48LE)) conv = rgb48tobgr48_bswap; + } else if (isRGBA64(srcFormat) && isRGB48(dstFormat)) { + if (CONV_IS(RGBA64LE, BGR48LE) + || CONV_IS(BGRA64LE, RGB48LE) + || CONV_IS(RGBA64BE, BGR48BE) + || CONV_IS(BGRA64BE, RGB48BE)) conv = rgb64tobgr48_nobswap; + else if (CONV_IS(RGBA64LE, BGR48BE) + || CONV_IS(BGRA64LE, RGB48BE) + || CONV_IS(RGBA64BE, BGR48LE) + || CONV_IS(BGRA64BE, RGB48LE)) conv = rgb64tobgr48_bswap; + else if (CONV_IS(RGBA64LE, RGB48LE) + || CONV_IS(BGRA64LE, BGR48LE) + || CONV_IS(RGBA64BE, RGB48BE) + || CONV_IS(BGRA64BE, BGR48BE)) conv = rgb64to48_nobswap; + else if (CONV_IS(RGBA64LE, RGB48BE) + || CONV_IS(BGRA64LE, BGR48BE) + || CONV_IS(RGBA64BE, RGB48LE) + || CONV_IS(BGRA64BE, BGR48LE)) conv = rgb64to48_bswap; } else /* BGR -> BGR */ if ((isBGRinInt(srcFormat) && isBGRinInt(dstFormat)) || @@ -645,6 +1239,9 @@ static rgbConvFn findRgbConvFn(SwsContext *c) } } + if ((dstFormat == AV_PIX_FMT_RGB32_1 || dstFormat == AV_PIX_FMT_BGR32_1) && !isRGBA32(srcFormat) && ALT32_CORR<0) + return NULL; + return conv; } @@ -656,34 +1253,52 @@ static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[], { const enum AVPixelFormat srcFormat = c->srcFormat; const enum AVPixelFormat dstFormat = c->dstFormat; + const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat); + const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat); const int srcBpp = (c->srcFormatBpp + 7) >> 3; const int dstBpp = (c->dstFormatBpp + 7) >> 3; rgbConvFn conv = findRgbConvFn(c); if (!conv) { av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n", - sws_format_name(srcFormat), sws_format_name(dstFormat)); + av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat)); } else { const uint8_t *srcPtr = src[0]; uint8_t *dstPtr = dst[0]; + int src_bswap = IS_NOT_NE(c->srcFormatBpp, desc_src); + int dst_bswap = IS_NOT_NE(c->dstFormatBpp, desc_dst); + if ((srcFormat == AV_PIX_FMT_RGB32_1 || srcFormat == AV_PIX_FMT_BGR32_1) && !isRGBA32(dstFormat)) srcPtr += ALT32_CORR; if ((dstFormat == AV_PIX_FMT_RGB32_1 || dstFormat == AV_PIX_FMT_BGR32_1) && - !isRGBA32(srcFormat)) + !isRGBA32(srcFormat)) { + int i; + av_assert0(ALT32_CORR == 1); + for (i = 0; i < srcSliceH; i++) + dstPtr[dstStride[0] * (srcSliceY + i)] = 255; dstPtr += ALT32_CORR; + } if (dstStride[0] * srcBpp == srcStride[0] * dstBpp && srcStride[0] > 0 && - !(srcStride[0] % srcBpp)) + !(srcStride[0] % srcBpp) && !dst_bswap && !src_bswap) conv(srcPtr, dstPtr + dstStride[0] * srcSliceY, (srcSliceH - 1) * srcStride[0] + c->srcW * srcBpp); else { - int i; + int i, j; dstPtr += dstStride[0] * srcSliceY; for (i = 0; i < srcSliceH; i++) { - conv(srcPtr, dstPtr, c->srcW * srcBpp); + if(src_bswap) { + for(j=0; j<c->srcW; j++) + ((uint16_t*)c->formatConvBuffer)[j] = av_bswap16(((uint16_t*)srcPtr)[j]); + conv(c->formatConvBuffer, dstPtr, c->srcW * srcBpp); + }else + conv(srcPtr, dstPtr, c->srcW * srcBpp); + if(dst_bswap) + for(j=0; j<c->srcW; j++) + ((uint16_t*)dstPtr)[j] = av_bswap16(((uint16_t*)dstPtr)[j]); srcPtr += srcStride[0]; dstPtr += dstStride[0]; } @@ -696,13 +1311,14 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - rgb24toyv12( + ff_rgb24toyv12( src[0], dst[0] + srcSliceY * dstStride[0], dst[1] + (srcSliceY >> 1) * dstStride[1], dst[2] + (srcSliceY >> 1) * dstStride[2], c->srcW, srcSliceH, - dstStride[0], dstStride[1], srcStride[0]); + dstStride[0], dstStride[1], srcStride[0], + c->input_rgb2yuv_table); if (dst[3]) fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); return srcSliceH; @@ -741,7 +1357,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], while (length + c->srcW <= FFABS(dstStride[0]) && length + c->srcW <= FFABS(srcStride[0])) length += c->srcW; - assert(length != 0); + av_assert1(length != 0); for (i = 0; i < srcSliceH; i++) { memcpy(dstPtr, srcPtr, length); @@ -752,25 +1368,25 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], return srcSliceH; } -#define clip9(x) av_clip_uintp2(x, 9) -#define clip10(x) av_clip_uintp2(x, 10) -#define DITHER_COPY(dst, dstStride, wfunc, src, srcStride, rfunc, dithers, shift, clip) \ - for (i = 0; i < height; i++) { \ - const uint8_t *dither = dithers[i & 7]; \ - for (j = 0; j < length - 7; j += 8) { \ - wfunc(&dst[j + 0], clip((rfunc(&src[j + 0]) + dither[0]) >> shift)); \ - wfunc(&dst[j + 1], clip((rfunc(&src[j + 1]) + dither[1]) >> shift)); \ - wfunc(&dst[j + 2], clip((rfunc(&src[j + 2]) + dither[2]) >> shift)); \ - wfunc(&dst[j + 3], clip((rfunc(&src[j + 3]) + dither[3]) >> shift)); \ - wfunc(&dst[j + 4], clip((rfunc(&src[j + 4]) + dither[4]) >> shift)); \ - wfunc(&dst[j + 5], clip((rfunc(&src[j + 5]) + dither[5]) >> shift)); \ - wfunc(&dst[j + 6], clip((rfunc(&src[j + 6]) + dither[6]) >> shift)); \ - wfunc(&dst[j + 7], clip((rfunc(&src[j + 7]) + dither[7]) >> shift)); \ - } \ - for (; j < length; j++) \ - wfunc(&dst[j], (rfunc(&src[j]) + dither[j & 7]) >> shift); \ - dst += dstStride; \ - src += srcStride; \ +#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\ + uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\ + int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\ + for (i = 0; i < height; i++) {\ + const uint8_t *dither= dithers[src_depth-9][i&7];\ + for (j = 0; j < length-7; j+=8){\ + dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\ + dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\ + dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\ + dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\ + dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\ + dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\ + dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\ + dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\ + }\ + for (; j < length; j++)\ + dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\ + dst += dstStride;\ + src += srcStride;\ } static int planarCopyWrapper(SwsContext *c, const uint8_t *src[], @@ -781,174 +1397,129 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t *src[], const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat); int plane, i, j; for (plane = 0; plane < 4; plane++) { - int length = (plane == 0 || plane == 3) ? c->srcW : -((-c->srcW ) >> c->chrDstHSubSample); - int y = (plane == 0 || plane == 3) ? srcSliceY: -((-srcSliceY) >> c->chrDstVSubSample); - int height = (plane == 0 || plane == 3) ? srcSliceH: -((-srcSliceH) >> c->chrDstVSubSample); + int length = (plane == 0 || plane == 3) ? c->srcW : FF_CEIL_RSHIFT(c->srcW, c->chrDstHSubSample); + int y = (plane == 0 || plane == 3) ? srcSliceY: FF_CEIL_RSHIFT(srcSliceY, c->chrDstVSubSample); + int height = (plane == 0 || plane == 3) ? srcSliceH: FF_CEIL_RSHIFT(srcSliceH, c->chrDstVSubSample); const uint8_t *srcPtr = src[plane]; uint8_t *dstPtr = dst[plane] + dstStride[plane] * y; + int shiftonly= plane==1 || plane==2 || (!c->srcRange && plane==0); if (!dst[plane]) continue; // ignore palette for GRAY8 if (plane == 1 && !dst[2]) continue; if (!src[plane] || (plane == 1 && !src[2])) { - int val = (plane == 3) ? 255 : 128; - if (is16BPS(c->dstFormat)) - length *= 2; - if (is9_OR_10BPS(c->dstFormat)) { - fill_plane9or10(dst[plane], dstStride[plane], - length, height, y, val, - desc_dst->comp[plane].depth_minus1 + 1, - isBE(c->dstFormat)); - } else + if (is16BPS(c->dstFormat) || isNBPS(c->dstFormat)) { + fillPlane16(dst[plane], dstStride[plane], length, height, y, + plane == 3, desc_dst->comp[plane].depth_minus1, + isBE(c->dstFormat)); + } else { fillPlane(dst[plane], dstStride[plane], length, height, y, - val); + (plane == 3) ? 255 : 128); + } } else { - if (is9_OR_10BPS(c->srcFormat)) { + if(isNBPS(c->srcFormat) || isNBPS(c->dstFormat) + || (is16BPS(c->srcFormat) != is16BPS(c->dstFormat)) + ) { const int src_depth = desc_src->comp[plane].depth_minus1 + 1; const int dst_depth = desc_dst->comp[plane].depth_minus1 + 1; const uint16_t *srcPtr2 = (const uint16_t *) srcPtr; + uint16_t *dstPtr2 = (uint16_t*)dstPtr; - if (is16BPS(c->dstFormat)) { - uint16_t *dstPtr2 = (uint16_t *) dstPtr; -#define COPY9_OR_10TO16(rfunc, wfunc) \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < length; j++) { \ - int srcpx = rfunc(&srcPtr2[j]); \ - wfunc(&dstPtr2[j], (srcpx << (16 - src_depth)) | (srcpx >> (2 * src_depth - 16))); \ - } \ - dstPtr2 += dstStride[plane] / 2; \ - srcPtr2 += srcStride[plane] / 2; \ + if (dst_depth == 8) { + if(isBE(c->srcFormat) == HAVE_BIGENDIAN){ + DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, , ) + } else { + DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, av_bswap16, ) } - if (isBE(c->dstFormat)) { - if (isBE(c->srcFormat)) { - COPY9_OR_10TO16(AV_RB16, AV_WB16); - } else { - COPY9_OR_10TO16(AV_RL16, AV_WB16); + } else if (src_depth == 8) { + for (i = 0; i < height; i++) { + #define COPY816(w)\ + if(shiftonly){\ + for (j = 0; j < length; j++)\ + w(&dstPtr2[j], srcPtr[j]<<(dst_depth-8));\ + }else{\ + for (j = 0; j < length; j++)\ + w(&dstPtr2[j], (srcPtr[j]<<(dst_depth-8)) |\ + (srcPtr[j]>>(2*8-dst_depth)));\ } - } else { - if (isBE(c->srcFormat)) { - COPY9_OR_10TO16(AV_RB16, AV_WL16); + if(isBE(c->dstFormat)){ + COPY816(AV_WB16) } else { - COPY9_OR_10TO16(AV_RL16, AV_WL16); + COPY816(AV_WL16) } + dstPtr2 += dstStride[plane]/2; + srcPtr += srcStride[plane]; } - } else if (is9_OR_10BPS(c->dstFormat)) { - uint16_t *dstPtr2 = (uint16_t *) dstPtr; -#define COPY9_OR_10TO9_OR_10(loop) \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < length; j++) { \ - loop; \ - } \ - dstPtr2 += dstStride[plane] / 2; \ - srcPtr2 += srcStride[plane] / 2; \ - } -#define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) \ - if (dst_depth > src_depth) { \ - COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \ - wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 9))); \ - } else if (dst_depth < src_depth) { \ - DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \ - srcPtr2, srcStride[plane] / 2, rfunc, \ - dither_8x8_1, 1, clip9); \ - } else { \ - COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \ - } - if (isBE(c->dstFormat)) { - if (isBE(c->srcFormat)) { - COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16); - } else { - COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16); + } else if (src_depth <= dst_depth) { + for (i = 0; i < height; i++) { + j = 0; + if(isBE(c->srcFormat) == HAVE_BIGENDIAN && + isBE(c->dstFormat) == HAVE_BIGENDIAN && + shiftonly) { + unsigned shift = dst_depth - src_depth; +#if HAVE_FAST_64BIT +#define FAST_COPY_UP(shift) \ + for (; j < length - 3; j += 4) { \ + uint64_t v = AV_RN64A(srcPtr2 + j); \ + AV_WN64A(dstPtr2 + j, v << shift); \ + } +#else +#define FAST_COPY_UP(shift) \ + for (; j < length - 1; j += 2) { \ + uint32_t v = AV_RN32A(srcPtr2 + j); \ + AV_WN32A(dstPtr2 + j, v << shift); \ + } +#endif + switch (shift) + { + case 6: FAST_COPY_UP(6); break; + case 7: FAST_COPY_UP(7); break; + } } - } else { - if (isBE(c->srcFormat)) { - COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16); +#define COPY_UP(r,w) \ + if(shiftonly){\ + for (; j < length; j++){ \ + unsigned int v= r(&srcPtr2[j]);\ + w(&dstPtr2[j], v<<(dst_depth-src_depth));\ + }\ + }else{\ + for (; j < length; j++){ \ + unsigned int v= r(&srcPtr2[j]);\ + w(&dstPtr2[j], (v<<(dst_depth-src_depth)) | \ + (v>>(2*src_depth-dst_depth)));\ + }\ + } + if(isBE(c->srcFormat)){ + if(isBE(c->dstFormat)){ + COPY_UP(AV_RB16, AV_WB16) + } else { + COPY_UP(AV_RB16, AV_WL16) + } } else { - COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16); + if(isBE(c->dstFormat)){ + COPY_UP(AV_RL16, AV_WB16) + } else { + COPY_UP(AV_RL16, AV_WL16) + } } + dstPtr2 += dstStride[plane]/2; + srcPtr2 += srcStride[plane]/2; } } else { -#define W8(a, b) { *(a) = (b); } -#define COPY9_OR_10TO8(rfunc) \ - if (src_depth == 9) { \ - DITHER_COPY(dstPtr, dstStride[plane], W8, \ - srcPtr2, srcStride[plane] / 2, rfunc, \ - dither_8x8_1, 1, av_clip_uint8); \ - } else { \ - DITHER_COPY(dstPtr, dstStride[plane], W8, \ - srcPtr2, srcStride[plane] / 2, rfunc, \ - dither_8x8_3, 2, av_clip_uint8); \ - } - if (isBE(c->srcFormat)) { - COPY9_OR_10TO8(AV_RB16); - } else { - COPY9_OR_10TO8(AV_RL16); - } - } - } else if (is9_OR_10BPS(c->dstFormat)) { - const int dst_depth = desc_dst->comp[plane].depth_minus1 + 1; - uint16_t *dstPtr2 = (uint16_t *) dstPtr; - - if (is16BPS(c->srcFormat)) { - const uint16_t *srcPtr2 = (const uint16_t *) srcPtr; -#define COPY16TO9_OR_10(rfunc, wfunc) \ - if (dst_depth == 9) { \ - DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \ - srcPtr2, srcStride[plane] / 2, rfunc, \ - ff_dither_8x8_128, 7, clip9); \ - } else { \ - DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \ - srcPtr2, srcStride[plane] / 2, rfunc, \ - dither_8x8_64, 6, clip10); \ - } - if (isBE(c->dstFormat)) { - if (isBE(c->srcFormat)) { - COPY16TO9_OR_10(AV_RB16, AV_WB16); + if(isBE(c->srcFormat) == HAVE_BIGENDIAN){ + if(isBE(c->dstFormat) == HAVE_BIGENDIAN){ + DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , ) } else { - COPY16TO9_OR_10(AV_RL16, AV_WB16); + DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , av_bswap16) } - } else { - if (isBE(c->srcFormat)) { - COPY16TO9_OR_10(AV_RB16, AV_WL16); + }else{ + if(isBE(c->dstFormat) == HAVE_BIGENDIAN){ + DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, ) } else { - COPY16TO9_OR_10(AV_RL16, AV_WL16); + DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, av_bswap16) } } - } else /* 8bit */ { -#define COPY8TO9_OR_10(wfunc) \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < length; j++) { \ - const int srcpx = srcPtr[j]; \ - wfunc(&dstPtr2[j], (srcpx << (dst_depth - 8)) | (srcpx >> (16 - dst_depth))); \ - } \ - dstPtr2 += dstStride[plane] / 2; \ - srcPtr += srcStride[plane]; \ - } - if (isBE(c->dstFormat)) { - COPY8TO9_OR_10(AV_WB16); - } else { - COPY8TO9_OR_10(AV_WL16); - } - } - } else if (is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) { - const uint16_t *srcPtr2 = (const uint16_t *) srcPtr; -#define COPY16TO8(rfunc) \ - DITHER_COPY(dstPtr, dstStride[plane], W8, \ - srcPtr2, srcStride[plane] / 2, rfunc, \ - dither_8x8_256, 8, av_clip_uint8); - if (isBE(c->srcFormat)) { - COPY16TO8(AV_RB16); - } else { - COPY16TO8(AV_RL16); - } - } else if (!is16BPS(c->srcFormat) && is16BPS(c->dstFormat)) { - for (i = 0; i < height; i++) { - for (j = 0; j < length; j++) { - dstPtr[ j << 1 ] = srcPtr[j]; - dstPtr[(j << 1) + 1] = srcPtr[j]; - } - srcPtr += srcStride[plane]; - dstPtr += dstStride[plane]; } } else if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat) && isBE(c->srcFormat) != isBE(c->dstFormat)) { @@ -1010,11 +1581,11 @@ void ff_get_unscaled_swscale(SwsContext *c) /* yuv2bgr */ if ((srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUV422P || srcFormat == AV_PIX_FMT_YUVA420P) && isAnyRGB(dstFormat) && - !(flags & SWS_ACCURATE_RND) && !(dstH & 1)) { + !(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) { c->swscale = ff_yuv2rgb_get_func_ptr(c); } - if (srcFormat == AV_PIX_FMT_YUV410P && + if (srcFormat == AV_PIX_FMT_YUV410P && !(dstH & 3) && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && !(flags & SWS_BITEXACT)) { c->swscale = yvu9ToYv12Wrapper; @@ -1031,6 +1602,10 @@ void ff_get_unscaled_swscale(SwsContext *c) && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)))) c->swscale = rgbToRgbWrapper; + if ((srcFormat == AV_PIX_FMT_GBRP && dstFormat == AV_PIX_FMT_GBRAP) || + (srcFormat == AV_PIX_FMT_GBRAP && dstFormat == AV_PIX_FMT_GBRP)) + c->swscale = planarRgbToplanarRgbWrapper; + #define isByteRGB(f) ( \ f == AV_PIX_FMT_RGB32 || \ f == AV_PIX_FMT_RGB32_1 || \ @@ -1042,33 +1617,89 @@ void ff_get_unscaled_swscale(SwsContext *c) if (srcFormat == AV_PIX_FMT_GBRP && isPlanar(srcFormat) && isByteRGB(dstFormat)) c->swscale = planarRgbToRgbWrapper; + if ((srcFormat == AV_PIX_FMT_RGB48LE || srcFormat == AV_PIX_FMT_RGB48BE || + srcFormat == AV_PIX_FMT_BGR48LE || srcFormat == AV_PIX_FMT_BGR48BE || + srcFormat == AV_PIX_FMT_RGBA64LE || srcFormat == AV_PIX_FMT_RGBA64BE || + srcFormat == AV_PIX_FMT_BGRA64LE || srcFormat == AV_PIX_FMT_BGRA64BE) && + (dstFormat == AV_PIX_FMT_GBRP9LE || dstFormat == AV_PIX_FMT_GBRP9BE || + dstFormat == AV_PIX_FMT_GBRP10LE || dstFormat == AV_PIX_FMT_GBRP10BE || + dstFormat == AV_PIX_FMT_GBRP12LE || dstFormat == AV_PIX_FMT_GBRP12BE || + dstFormat == AV_PIX_FMT_GBRP14LE || dstFormat == AV_PIX_FMT_GBRP14BE || + dstFormat == AV_PIX_FMT_GBRP16LE || dstFormat == AV_PIX_FMT_GBRP16BE || + dstFormat == AV_PIX_FMT_GBRAP16LE || dstFormat == AV_PIX_FMT_GBRAP16BE )) + c->swscale = Rgb16ToPlanarRgb16Wrapper; + + if ((srcFormat == AV_PIX_FMT_GBRP9LE || srcFormat == AV_PIX_FMT_GBRP9BE || + srcFormat == AV_PIX_FMT_GBRP16LE || srcFormat == AV_PIX_FMT_GBRP16BE || + srcFormat == AV_PIX_FMT_GBRP10LE || srcFormat == AV_PIX_FMT_GBRP10BE || + srcFormat == AV_PIX_FMT_GBRP12LE || srcFormat == AV_PIX_FMT_GBRP12BE || + srcFormat == AV_PIX_FMT_GBRP14LE || srcFormat == AV_PIX_FMT_GBRP14BE || + srcFormat == AV_PIX_FMT_GBRAP16LE || srcFormat == AV_PIX_FMT_GBRAP16BE) && + (dstFormat == AV_PIX_FMT_RGB48LE || dstFormat == AV_PIX_FMT_RGB48BE || + dstFormat == AV_PIX_FMT_BGR48LE || dstFormat == AV_PIX_FMT_BGR48BE || + dstFormat == AV_PIX_FMT_RGBA64LE || dstFormat == AV_PIX_FMT_RGBA64BE || + dstFormat == AV_PIX_FMT_BGRA64LE || dstFormat == AV_PIX_FMT_BGRA64BE)) + c->swscale = planarRgb16ToRgb16Wrapper; + if (av_pix_fmt_desc_get(srcFormat)->comp[0].depth_minus1 == 7 && isPackedRGB(srcFormat) && dstFormat == AV_PIX_FMT_GBRP) c->swscale = rgbToPlanarRgbWrapper; + if (isBayer(srcFormat)) { + if (dstFormat == AV_PIX_FMT_RGB24) + c->swscale = bayer_to_rgb24_wrapper; + else if (dstFormat == AV_PIX_FMT_YUV420P) + c->swscale = bayer_to_yv12_wrapper; + else if (!isBayer(dstFormat)) { + av_log(c, AV_LOG_ERROR, "unsupported bayer conversion\n"); + av_assert0(0); + } + } + /* bswap 16 bits per pixel/component packed formats */ - if (IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR444) || + if (IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_BGGR16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_RGGB16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_GBRG16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_GRBG16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR444) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR48) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGRA64) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR555) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR565) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGRA64) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GRAY16) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YA16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP9) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP10) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP12) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP14) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRAP16) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB444) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB48) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGBA64) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB555) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB565) || IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGBA64) || - IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_XYZ12)) + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_XYZ12) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P9) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P10) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P12) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P14) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P9) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P10) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P12) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P14) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P16) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P9) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P10) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P12) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P14) || + IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P16)) c->swscale = packed_16bpc_bswap; - if ((usePal(srcFormat) && ( - dstFormat == AV_PIX_FMT_RGB32 || - dstFormat == AV_PIX_FMT_RGB32_1 || - dstFormat == AV_PIX_FMT_RGB24 || - dstFormat == AV_PIX_FMT_BGR32 || - dstFormat == AV_PIX_FMT_BGR32_1 || - dstFormat == AV_PIX_FMT_BGR24))) + if (usePal(srcFormat) && isByteRGB(dstFormat)) c->swscale = palToRgbWrapper; if (srcFormat == AV_PIX_FMT_YUV422P) { @@ -1099,13 +1730,14 @@ void ff_get_unscaled_swscale(SwsContext *c) if (srcFormat == AV_PIX_FMT_UYVY422 && dstFormat == AV_PIX_FMT_YUV422P) c->swscale = uyvyToYuv422Wrapper; +#define isPlanarGray(x) (isGray(x) && (x) != AV_PIX_FMT_YA8 && (x) != AV_PIX_FMT_YA16LE && (x) != AV_PIX_FMT_YA16BE) /* simple copy */ if ( srcFormat == dstFormat || (srcFormat == AV_PIX_FMT_YUVA420P && dstFormat == AV_PIX_FMT_YUV420P) || (srcFormat == AV_PIX_FMT_YUV420P && dstFormat == AV_PIX_FMT_YUVA420P) || - (isPlanarYUV(srcFormat) && isGray(dstFormat)) || - (isPlanarYUV(dstFormat) && isGray(srcFormat)) || - (isGray(dstFormat) && isGray(srcFormat)) || + (isPlanarYUV(srcFormat) && isPlanarGray(dstFormat)) || + (isPlanarYUV(dstFormat) && isPlanarGray(srcFormat)) || + (isPlanarGray(dstFormat) && isPlanarGray(srcFormat)) || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat) && c->chrDstHSubSample == c->chrSrcHSubSample && c->chrDstVSubSample == c->chrSrcVSubSample && @@ -1120,177 +1752,8 @@ void ff_get_unscaled_swscale(SwsContext *c) if (ARCH_PPC) ff_get_unscaled_swscale_ppc(c); -} - -static void reset_ptr(const uint8_t *src[], int format) -{ - if (!isALPHA(format)) - src[3] = NULL; - if (!isPlanar(format)) { - src[3] = src[2] = NULL; - - if (!usePal(format)) - src[1] = NULL; - } -} - -static int check_image_pointers(uint8_t *data[4], enum AVPixelFormat pix_fmt, - const int linesizes[4]) -{ - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); - int i; - - for (i = 0; i < 4; i++) { - int plane = desc->comp[i].plane; - if (!data[plane] || !linesizes[plane]) - return 0; - } - - return 1; -} - -/** - * swscale wrapper, so we don't need to export the SwsContext. - * Assumes planar YUV to be in YUV order instead of YVU. - */ -int attribute_align_arg sws_scale(struct SwsContext *c, - const uint8_t * const srcSlice[], - const int srcStride[], int srcSliceY, - int srcSliceH, uint8_t *const dst[], - const int dstStride[]) -{ - int i; - const uint8_t *src2[4] = { srcSlice[0], srcSlice[1], srcSlice[2], srcSlice[3] }; - uint8_t *dst2[4] = { dst[0], dst[1], dst[2], dst[3] }; - - // do not mess up sliceDir if we have a "trailing" 0-size slice - if (srcSliceH == 0) - return 0; - - if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) { - av_log(c, AV_LOG_ERROR, "bad src image pointers\n"); - return 0; - } - if (!check_image_pointers(dst, c->dstFormat, dstStride)) { - av_log(c, AV_LOG_ERROR, "bad dst image pointers\n"); - return 0; - } - - if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) { - av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n"); - return 0; - } - if (c->sliceDir == 0) { - if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1; - } - - if (usePal(c->srcFormat)) { - for (i = 0; i < 256; i++) { - int r, g, b, y, u, v; - if (c->srcFormat == AV_PIX_FMT_PAL8) { - uint32_t p = ((const uint32_t *)(srcSlice[1]))[i]; - r = (p >> 16) & 0xFF; - g = (p >> 8) & 0xFF; - b = p & 0xFF; - } else if (c->srcFormat == AV_PIX_FMT_RGB8) { - r = ( i >> 5 ) * 36; - g = ((i >> 2) & 7) * 36; - b = ( i & 3) * 85; - } else if (c->srcFormat == AV_PIX_FMT_BGR8) { - b = ( i >> 6 ) * 85; - g = ((i >> 3) & 7) * 36; - r = ( i & 7) * 36; - } else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) { - r = ( i >> 3 ) * 255; - g = ((i >> 1) & 3) * 85; - b = ( i & 1) * 255; - } else if (c->srcFormat == AV_PIX_FMT_GRAY8 || - c->srcFormat == AV_PIX_FMT_YA8) { - r = g = b = i; - } else { - assert(c->srcFormat == AV_PIX_FMT_BGR4_BYTE); - b = ( i >> 3 ) * 255; - g = ((i >> 1) & 3) * 85; - r = ( i & 1) * 255; - } - y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); - u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); - v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); - c->pal_yuv[i] = y + (u << 8) + (v << 16) + (0xFFU << 24); - - switch (c->dstFormat) { - case AV_PIX_FMT_BGR32: -#if !HAVE_BIGENDIAN - case AV_PIX_FMT_RGB24: -#endif - c->pal_rgb[i] = r + (g << 8) + (b << 16) + (0xFFU << 24); - break; - case AV_PIX_FMT_BGR32_1: -#if HAVE_BIGENDIAN - case AV_PIX_FMT_BGR24: -#endif - c->pal_rgb[i] = 0xFF + (r << 8) + (g << 16) + ((unsigned)b << 24); - break; - case AV_PIX_FMT_RGB32_1: -#if HAVE_BIGENDIAN - case AV_PIX_FMT_RGB24: -#endif - c->pal_rgb[i] = 0xFF + (b << 8) + (g << 16) + ((unsigned)r << 24); - break; - case AV_PIX_FMT_RGB32: -#if !HAVE_BIGENDIAN - case AV_PIX_FMT_BGR24: -#endif - default: - c->pal_rgb[i] = b + (g << 8) + (r << 16) + (0xFFU << 24); - } - } - } - - // copy strides, so they can safely be modified - if (c->sliceDir == 1) { - // slices go from top to bottom - int srcStride2[4] = { srcStride[0], srcStride[1], srcStride[2], - srcStride[3] }; - int dstStride2[4] = { dstStride[0], dstStride[1], dstStride[2], - dstStride[3] }; - - reset_ptr(src2, c->srcFormat); - reset_ptr((const uint8_t **) dst2, c->dstFormat); - - /* reset slice direction at end of frame */ - if (srcSliceY + srcSliceH == c->srcH) - c->sliceDir = 0; - - return c->swscale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2, - dstStride2); - } else { - // slices go from bottom to top => we flip the image internally - int srcStride2[4] = { -srcStride[0], -srcStride[1], -srcStride[2], - -srcStride[3] }; - int dstStride2[4] = { -dstStride[0], -dstStride[1], -dstStride[2], - -dstStride[3] }; - - src2[0] += (srcSliceH - 1) * srcStride[0]; - if (!usePal(c->srcFormat)) - src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1]; - src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2]; - src2[3] += (srcSliceH - 1) * srcStride[3]; - dst2[0] += ( c->dstH - 1) * dstStride[0]; - dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1]; - dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2]; - dst2[3] += ( c->dstH - 1) * dstStride[3]; - - reset_ptr(src2, c->srcFormat); - reset_ptr((const uint8_t **) dst2, c->dstFormat); - - /* reset slice direction at end of frame */ - if (!srcSliceY) - c->sliceDir = 0; - - return c->swscale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH, - srcSliceH, dst2, dstStride2); - } +// if (ARCH_ARM) +// ff_get_unscaled_swscale_arm(c); } /* Convert the palette to the same packed 32-bit format as the palette */ diff --git a/libswscale/swscaleres.rc b/libswscale/swscaleres.rc new file mode 100644 index 0000000..5cb8ee7 --- /dev/null +++ b/libswscale/swscaleres.rc @@ -0,0 +1,55 @@ +/* + * Windows resource file for libswscale + * + * Copyright (C) 2012 James Almer + * Copyright (C) 2013 Tiancheng "Timothy" Gu + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <windows.h> +#include "libswscale/version.h" +#include "libavutil/ffversion.h" +#include "config.h" + +1 VERSIONINFO +FILEVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0 +PRODUCTVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0 +FILEFLAGSMASK VS_FFI_FILEFLAGSMASK +FILEOS VOS_NT_WINDOWS32 +FILETYPE VFT_DLL +{ + BLOCK "StringFileInfo" + { + BLOCK "040904B0" + { + VALUE "CompanyName", "FFmpeg Project" + VALUE "FileDescription", "FFmpeg image rescaling library" + VALUE "FileVersion", AV_STRINGIFY(LIBSWSCALE_VERSION) + VALUE "InternalName", "libswscale" + VALUE "LegalCopyright", "Copyright (C) 2000-" AV_STRINGIFY(CONFIG_THIS_YEAR) " FFmpeg Project" + VALUE "OriginalFilename", "swscale" BUILDSUF "-" AV_STRINGIFY(LIBSWSCALE_VERSION_MAJOR) SLIBSUF + VALUE "ProductName", "FFmpeg" + VALUE "ProductVersion", FFMPEG_VERSION + } + } + + BLOCK "VarFileInfo" + { + VALUE "Translation", 0x0409, 0x04B0 + } +} diff --git a/libswscale/utils.c b/libswscale/utils.c index 32e304c..06fd358 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -1,27 +1,27 @@ /* * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "config.h" #define _SVID_SOURCE // needed for MAP_ANONYMOUS -#include <assert.h> +#define _DARWIN_C_SOURCE // needed for MAP_ANON #include <inttypes.h> #include <math.h> #include <stdio.h> @@ -38,6 +38,7 @@ #endif #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/avutil.h" #include "libavutil/bswap.h" #include "libavutil/cpu.h" @@ -52,24 +53,25 @@ #include "swscale.h" #include "swscale_internal.h" +static void handle_formats(SwsContext *c); + unsigned swscale_version(void) { + av_assert0(LIBSWSCALE_VERSION_MICRO >= 100); return LIBSWSCALE_VERSION_INT; } const char *swscale_configuration(void) { - return LIBAV_CONFIGURATION; + return FFMPEG_CONFIGURATION; } const char *swscale_license(void) { #define LICENSE_PREFIX "libswscale license: " - return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1; + return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1; } -#define RET 0xC3 // near return opcode for x86 - typedef struct FormatEntry { uint8_t is_supported_in :1; uint8_t is_supported_out :1; @@ -90,6 +92,7 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_MONOBLACK] = { 1, 1 }, [AV_PIX_FMT_PAL8] = { 1, 0 }, [AV_PIX_FMT_YUVJ420P] = { 1, 1 }, + [AV_PIX_FMT_YUVJ411P] = { 1, 1 }, [AV_PIX_FMT_YUVJ422P] = { 1, 1 }, [AV_PIX_FMT_YUVJ444P] = { 1, 1 }, [AV_PIX_FMT_YVYU422] = { 1, 1 }, @@ -107,6 +110,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_RGBA] = { 1, 1 }, [AV_PIX_FMT_ABGR] = { 1, 1 }, [AV_PIX_FMT_BGRA] = { 1, 1 }, + [AV_PIX_FMT_0RGB] = { 1, 1 }, + [AV_PIX_FMT_RGB0] = { 1, 1 }, + [AV_PIX_FMT_0BGR] = { 1, 1 }, + [AV_PIX_FMT_BGR0] = { 1, 1 }, [AV_PIX_FMT_GRAY16BE] = { 1, 1 }, [AV_PIX_FMT_GRAY16LE] = { 1, 1 }, [AV_PIX_FMT_YUV440P] = { 1, 1 }, @@ -134,8 +141,8 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_YUVA444P16LE]= { 1, 1 }, [AV_PIX_FMT_RGB48BE] = { 1, 1 }, [AV_PIX_FMT_RGB48LE] = { 1, 1 }, - [AV_PIX_FMT_RGBA64BE] = { 0, 0, 1 }, - [AV_PIX_FMT_RGBA64LE] = { 0, 0, 1 }, + [AV_PIX_FMT_RGBA64BE] = { 1, 1, 1 }, + [AV_PIX_FMT_RGBA64LE] = { 1, 1, 1 }, [AV_PIX_FMT_RGB565BE] = { 1, 1 }, [AV_PIX_FMT_RGB565LE] = { 1, 1 }, [AV_PIX_FMT_RGB555BE] = { 1, 1 }, @@ -159,29 +166,60 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_YA16LE] = { 1, 0 }, [AV_PIX_FMT_BGR48BE] = { 1, 1 }, [AV_PIX_FMT_BGR48LE] = { 1, 1 }, - [AV_PIX_FMT_BGRA64BE] = { 0, 0, 1 }, - [AV_PIX_FMT_BGRA64LE] = { 0, 0, 1 }, + [AV_PIX_FMT_BGRA64BE] = { 1, 1, 1 }, + [AV_PIX_FMT_BGRA64LE] = { 1, 1, 1 }, [AV_PIX_FMT_YUV420P9BE] = { 1, 1 }, [AV_PIX_FMT_YUV420P9LE] = { 1, 1 }, [AV_PIX_FMT_YUV420P10BE] = { 1, 1 }, [AV_PIX_FMT_YUV420P10LE] = { 1, 1 }, + [AV_PIX_FMT_YUV420P12BE] = { 1, 1 }, + [AV_PIX_FMT_YUV420P12LE] = { 1, 1 }, + [AV_PIX_FMT_YUV420P14BE] = { 1, 1 }, + [AV_PIX_FMT_YUV420P14LE] = { 1, 1 }, [AV_PIX_FMT_YUV422P9BE] = { 1, 1 }, [AV_PIX_FMT_YUV422P9LE] = { 1, 1 }, [AV_PIX_FMT_YUV422P10BE] = { 1, 1 }, [AV_PIX_FMT_YUV422P10LE] = { 1, 1 }, + [AV_PIX_FMT_YUV422P12BE] = { 1, 1 }, + [AV_PIX_FMT_YUV422P12LE] = { 1, 1 }, + [AV_PIX_FMT_YUV422P14BE] = { 1, 1 }, + [AV_PIX_FMT_YUV422P14LE] = { 1, 1 }, [AV_PIX_FMT_YUV444P9BE] = { 1, 1 }, [AV_PIX_FMT_YUV444P9LE] = { 1, 1 }, [AV_PIX_FMT_YUV444P10BE] = { 1, 1 }, [AV_PIX_FMT_YUV444P10LE] = { 1, 1 }, + [AV_PIX_FMT_YUV444P12BE] = { 1, 1 }, + [AV_PIX_FMT_YUV444P12LE] = { 1, 1 }, + [AV_PIX_FMT_YUV444P14BE] = { 1, 1 }, + [AV_PIX_FMT_YUV444P14LE] = { 1, 1 }, [AV_PIX_FMT_GBRP] = { 1, 1 }, [AV_PIX_FMT_GBRP9LE] = { 1, 1 }, [AV_PIX_FMT_GBRP9BE] = { 1, 1 }, [AV_PIX_FMT_GBRP10LE] = { 1, 1 }, [AV_PIX_FMT_GBRP10BE] = { 1, 1 }, + [AV_PIX_FMT_GBRP12LE] = { 1, 1 }, + [AV_PIX_FMT_GBRP12BE] = { 1, 1 }, + [AV_PIX_FMT_GBRP14LE] = { 1, 1 }, + [AV_PIX_FMT_GBRP14BE] = { 1, 1 }, [AV_PIX_FMT_GBRP16LE] = { 1, 0 }, [AV_PIX_FMT_GBRP16BE] = { 1, 0 }, - [AV_PIX_FMT_XYZ12BE] = { 0, 0, 1 }, - [AV_PIX_FMT_XYZ12LE] = { 0, 0, 1 }, + [AV_PIX_FMT_XYZ12BE] = { 1, 1, 1 }, + [AV_PIX_FMT_XYZ12LE] = { 1, 1, 1 }, + [AV_PIX_FMT_GBRAP] = { 1, 1 }, + [AV_PIX_FMT_GBRAP16LE] = { 1, 0 }, + [AV_PIX_FMT_GBRAP16BE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_BGGR8] = { 1, 0 }, + [AV_PIX_FMT_BAYER_RGGB8] = { 1, 0 }, + [AV_PIX_FMT_BAYER_GBRG8] = { 1, 0 }, + [AV_PIX_FMT_BAYER_GRBG8] = { 1, 0 }, + [AV_PIX_FMT_BAYER_BGGR16LE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_BGGR16BE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_RGGB16LE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_RGGB16BE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_GBRG16LE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_GBRG16BE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_GRBG16LE] = { 1, 0 }, + [AV_PIX_FMT_BAYER_GRBG16BE] = { 1, 0 }, }; int sws_isSupportedInput(enum AVPixelFormat pix_fmt) @@ -202,6 +240,7 @@ int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt) format_entries[pix_fmt].is_supported_endianness : 0; } +#if FF_API_SWS_FORMAT_NAME const char *sws_format_name(enum AVPixelFormat format) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(format); @@ -210,6 +249,7 @@ const char *sws_format_name(enum AVPixelFormat format) else return "Unknown format"; } +#endif static double getSplineCoeff(double a, double b, double c, double d, double dist) @@ -224,12 +264,41 @@ static double getSplineCoeff(double a, double b, double c, double d, dist - 1.0); } +static av_cold int get_local_pos(SwsContext *s, int chr_subsample, int pos, int dir) +{ + if (pos < 0) { + pos = (128 << chr_subsample) - 128; + } + pos += 128; // relative to ideal left edge + return pos >> chr_subsample; +} + +typedef struct { + int flag; ///< flag associated to the algorithm + const char *description; ///< human-readable description + int size_factor; ///< size factor used when initing the filters +} ScaleAlgorithm; + +static const ScaleAlgorithm scale_algorithms[] = { + { SWS_AREA, "area averaging", 1 /* downscale only, for upscale it is bilinear */ }, + { SWS_BICUBIC, "bicubic", 4 }, + { SWS_BICUBLIN, "luma bicubic / chroma bilinear", -1 }, + { SWS_BILINEAR, "bilinear", 2 }, + { SWS_FAST_BILINEAR, "fast bilinear", -1 }, + { SWS_GAUSS, "Gaussian", 8 /* infinite ;) */ }, + { SWS_LANCZOS, "Lanczos", -1 /* custom */ }, + { SWS_POINT, "nearest neighbor / point", -1 }, + { SWS_SINC, "sinc", 20 /* infinite ;) */ }, + { SWS_SPLINE, "bicubic spline", 20 /* infinite :)*/ }, + { SWS_X, "experimental", 8 }, +}; + static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, int *outFilterSize, int xInc, int srcW, int dstW, int filterAlign, int one, int flags, int cpu_flags, SwsVector *srcFilter, SwsVector *dstFilter, - double param[2], int is_horizontal) + double param[2], int srcPos, int dstPos) { int i; int filterSize; @@ -237,19 +306,19 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, int minFilterSize; int64_t *filter = NULL; int64_t *filter2 = NULL; - const int64_t fone = 1LL << 54; + const int64_t fone = 1LL << (54 - FFMIN(av_log2(srcW/dstW), 8)); int ret = -1; emms_c(); // FIXME should not be required but IS (even for non-MMX versions) // NOTE: the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end - FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW + 3) * sizeof(**filterPos), fail); + FF_ALLOC_ARRAY_OR_GOTO(NULL, *filterPos, (dstW + 3), sizeof(**filterPos), fail); - if (FFABS(xInc - 0x10000) < 10) { // unscaled + if (FFABS(xInc - 0x10000) < 10 && srcPos == dstPos) { // unscaled int i; filterSize = 1; - FF_ALLOCZ_OR_GOTO(NULL, filter, - dstW * sizeof(*filter) * filterSize, fail); + FF_ALLOCZ_ARRAY_OR_GOTO(NULL, filter, + dstW, sizeof(*filter) * filterSize, fail); for (i = 0; i < dstW; i++) { filter[i * filterSize] = fone; @@ -257,12 +326,12 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, } } else if (flags & SWS_POINT) { // lame looking point sampling mode int i; - int xDstInSrc; + int64_t xDstInSrc; filterSize = 1; - FF_ALLOC_OR_GOTO(NULL, filter, - dstW * sizeof(*filter) * filterSize, fail); + FF_ALLOC_ARRAY_OR_GOTO(NULL, filter, + dstW, sizeof(*filter) * filterSize, fail); - xDstInSrc = xInc / 2 - 0x8000; + xDstInSrc = ((dstPos*(int64_t)xInc)>>8) - ((srcPos*0x8000LL)>>7); for (i = 0; i < dstW; i++) { int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16; @@ -273,12 +342,12 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, } else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) || (flags & SWS_FAST_BILINEAR)) { // bilinear upscale int i; - int xDstInSrc; + int64_t xDstInSrc; filterSize = 2; - FF_ALLOC_OR_GOTO(NULL, filter, - dstW * sizeof(*filter) * filterSize, fail); + FF_ALLOC_ARRAY_OR_GOTO(NULL, filter, + dstW, sizeof(*filter) * filterSize, fail); - xDstInSrc = xInc / 2 - 0x8000; + xDstInSrc = ((dstPos*(int64_t)xInc)>>8) - ((srcPos*0x8000LL)>>7); for (i = 0; i < dstW; i++) { int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16; int j; @@ -286,8 +355,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, (*filterPos)[i] = xx; // bilinear upscale / linear interpolate / area averaging for (j = 0; j < filterSize; j++) { - int64_t coeff = fone - FFABS((xx << 16) - xDstInSrc) * - (fone >> 16); + int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16); if (coeff < 0) coeff = 0; filter[i * filterSize + j] = coeff; @@ -297,28 +365,17 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, } } else { int64_t xDstInSrc; - int sizeFactor; - - if (flags & SWS_BICUBIC) - sizeFactor = 4; - else if (flags & SWS_X) - sizeFactor = 8; - else if (flags & SWS_AREA) - sizeFactor = 1; // downscale only, for upscale it is bilinear - else if (flags & SWS_GAUSS) - sizeFactor = 8; // infinite ;) - else if (flags & SWS_LANCZOS) - sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6; - else if (flags & SWS_SINC) - sizeFactor = 20; // infinite ;) - else if (flags & SWS_SPLINE) - sizeFactor = 20; // infinite ;) - else if (flags & SWS_BILINEAR) - sizeFactor = 2; - else { - sizeFactor = 0; // GCC warning killer - assert(0); + int sizeFactor = -1; + + for (i = 0; i < FF_ARRAY_ELEMS(scale_algorithms); i++) { + if (flags & scale_algorithms[i].flag && scale_algorithms[i].size_factor > 0) { + sizeFactor = scale_algorithms[i].size_factor; + break; + } } + if (flags & SWS_LANCZOS) + sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6; + av_assert0(sizeFactor > 0); if (xInc <= 1 << 16) filterSize = 1 + sizeFactor; // upscale @@ -328,10 +385,10 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, filterSize = FFMIN(filterSize, srcW - 2); filterSize = FFMAX(filterSize, 1); - FF_ALLOC_OR_GOTO(NULL, filter, - dstW * sizeof(*filter) * filterSize, fail); + FF_ALLOC_ARRAY_OR_GOTO(NULL, filter, + dstW, sizeof(*filter) * filterSize, fail); - xDstInSrc = xInc - 0x10000; + xDstInSrc = ((dstPos*(int64_t)xInc)>>7) - ((srcPos*0x10000LL)>>7); for (i = 0; i < dstW; i++) { int xx = (xDstInSrc - ((int64_t)(filterSize - 2) << 16)) / (1 << 17); int j; @@ -365,7 +422,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, (-12 * B - 48 * C) * d + (8 * B + 24 * C) * (1 << 30); } - coeff *= fone >> (30 + 24); + coeff /= (1LL<<54)/fone; } #if 0 else if (flags & SWS_X) { @@ -416,8 +473,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, double p = -2.196152422706632; coeff = getSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone; } else { - coeff = 0.0; // GCC warning killer - assert(0); + av_assert0(0); } filter[i * filterSize + j] = coeff; @@ -430,14 +486,14 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, /* apply src & dst Filter to filter -> filter2 * av_free(filter); */ - assert(filterSize > 0); + av_assert0(filterSize > 0); filter2Size = filterSize; if (srcFilter) filter2Size += srcFilter->length - 1; if (dstFilter) filter2Size += dstFilter->length - 1; - assert(filter2Size > 0); - FF_ALLOCZ_OR_GOTO(NULL, filter2, filter2Size * dstW * sizeof(*filter2), fail); + av_assert0(filter2Size > 0); + FF_ALLOCZ_ARRAY_OR_GOTO(NULL, filter2, dstW, filter2Size * sizeof(*filter2), fail); for (i = 0; i < dstW; i++) { int j, k; @@ -512,19 +568,24 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, filterAlign = 1; } - if (INLINE_MMX(cpu_flags)) { + if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) { // special case for unscaled vertical filtering if (minFilterSize == 1 && filterAlign == 2) filterAlign = 1; } - assert(minFilterSize > 0); + av_assert0(minFilterSize > 0); filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1)); - assert(filterSize > 0); - filter = av_malloc(filterSize * dstW * sizeof(*filter)); + av_assert0(filterSize > 0); + filter = av_malloc_array(dstW, filterSize * sizeof(*filter)); + if (!filter) + goto fail; if (filterSize >= MAX_FILTER_SIZE * 16 / - ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter) + ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16)) { + av_log(NULL, AV_LOG_ERROR, "sws: filterSize %d is too large, try less extreme scaling or set --sws-max-filter-size and recompile\n", + FF_CEIL_RSHIFT((filterSize+1) * ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16), 4)); goto fail; + } *outFilterSize = filterSize; if (flags & SWS_PRINT_INFO) @@ -548,36 +609,34 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, // FIXME try to align filterPos if possible // fix borders - if (is_horizontal) { - for (i = 0; i < dstW; i++) { - int j; - if ((*filterPos)[i] < 0) { - // move filter coefficients left to compensate for filterPos - for (j = 1; j < filterSize; j++) { - int left = FFMAX(j + (*filterPos)[i], 0); - filter[i * filterSize + left] += filter[i * filterSize + j]; - filter[i * filterSize + j] = 0; - } - (*filterPos)[i] = 0; + for (i = 0; i < dstW; i++) { + int j; + if ((*filterPos)[i] < 0) { + // move filter coefficients left to compensate for filterPos + for (j = 1; j < filterSize; j++) { + int left = FFMAX(j + (*filterPos)[i], 0); + filter[i * filterSize + left] += filter[i * filterSize + j]; + filter[i * filterSize + j] = 0; } + (*filterPos)[i]= 0; + } - if ((*filterPos)[i] + filterSize > srcW) { - int shift = (*filterPos)[i] + filterSize - srcW; - // move filter coefficients right to compensate for filterPos - for (j = filterSize - 2; j >= 0; j--) { - int right = FFMIN(j + shift, filterSize - 1); - filter[i * filterSize + right] += filter[i * filterSize + j]; - filter[i * filterSize + j] = 0; - } - (*filterPos)[i] = srcW - filterSize; + if ((*filterPos)[i] + filterSize > srcW) { + int shift = (*filterPos)[i] + filterSize - srcW; + // move filter coefficients right to compensate for filterPos + for (j = filterSize - 2; j >= 0; j--) { + int right = FFMIN(j + shift, filterSize - 1); + filter[i * filterSize + right] += filter[i * filterSize + j]; + filter[i * filterSize + j] = 0; } + (*filterPos)[i]= srcW - filterSize; } } // Note the +1 is for the MMX scaler which reads over the end /* align at 16 for AltiVec (needed by hScale_altivec_real) */ - FF_ALLOCZ_OR_GOTO(NULL, *outFilter, - *outFilterSize * (dstW + 3) * sizeof(int16_t), fail); + FF_ALLOCZ_ARRAY_OR_GOTO(NULL, *outFilter, + (dstW + 3), *outFilterSize * sizeof(int16_t), fail); /* normalize & store in outFilter */ for (i = 0; i < dstW; i++) { @@ -589,6 +648,10 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, sum += filter[i * filterSize + j]; } sum = (sum + one / 2) / one; + if (!sum) { + av_log(NULL, AV_LOG_WARNING, "SwScaler: zero vector in scaling\n"); + sum = 1; + } for (j = 0; j < *outFilterSize; j++) { int64_t v = filter[i * filterSize + j] + error; int intV = ROUNDED_DIV(v, sum); @@ -611,211 +674,193 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, ret = 0; fail: + if(ret < 0) + av_log(NULL, AV_LOG_ERROR, "sws: initFilter failed\n"); av_free(filter); av_free(filter2); return ret; } -#if HAVE_MMXEXT_INLINE -static av_cold int init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, - int16_t *filter, int32_t *filterPos, - int numSplits) +static void fill_rgb2yuv_table(SwsContext *c, const int table[4], int dstRange) { - uint8_t *fragmentA; - x86_reg imm8OfPShufW1A; - x86_reg imm8OfPShufW2A; - x86_reg fragmentLengthA; - uint8_t *fragmentB; - x86_reg imm8OfPShufW1B; - x86_reg imm8OfPShufW2B; - x86_reg fragmentLengthB; - int fragmentPos; - - int xpos, i; - - // create an optimized horizontal scaling routine - /* This scaler is made of runtime-generated MMXEXT code using specially tuned - * pshufw instructions. For every four output pixels, if four input pixels - * are enough for the fast bilinear scaling, then a chunk of fragmentB is - * used. If five input pixels are needed, then a chunk of fragmentA is used. - */ - - // code fragment - - __asm__ volatile ( - "jmp 9f \n\t" - // Begin - "0: \n\t" - "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" - "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" - "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "pshufw $0xFF, %%mm1, %%mm1 \n\t" - "1: \n\t" - "pshufw $0xFF, %%mm0, %%mm0 \n\t" - "2: \n\t" - "psubw %%mm1, %%mm0 \n\t" - "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" - "pmullw %%mm3, %%mm0 \n\t" - "psllw $7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - - "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" - - "add $8, %%"REG_a" \n\t" - // End - "9: \n\t" - // "int $3 \n\t" - "lea " LOCAL_MANGLE(0b) ", %0 \n\t" - "lea " LOCAL_MANGLE(1b) ", %1 \n\t" - "lea " LOCAL_MANGLE(2b) ", %2 \n\t" - "dec %1 \n\t" - "dec %2 \n\t" - "sub %0, %1 \n\t" - "sub %0, %2 \n\t" - "lea " LOCAL_MANGLE(9b) ", %3 \n\t" - "sub %0, %3 \n\t" - - - : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), - "=r" (fragmentLengthA) - ); - - __asm__ volatile ( - "jmp 9f \n\t" - // Begin - "0: \n\t" - "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" - "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "pshufw $0xFF, %%mm0, %%mm1 \n\t" - "1: \n\t" - "pshufw $0xFF, %%mm0, %%mm0 \n\t" - "2: \n\t" - "psubw %%mm1, %%mm0 \n\t" - "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" - "pmullw %%mm3, %%mm0 \n\t" - "psllw $7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - - "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" - - "add $8, %%"REG_a" \n\t" - // End - "9: \n\t" - // "int $3 \n\t" - "lea " LOCAL_MANGLE(0b) ", %0 \n\t" - "lea " LOCAL_MANGLE(1b) ", %1 \n\t" - "lea " LOCAL_MANGLE(2b) ", %2 \n\t" - "dec %1 \n\t" - "dec %2 \n\t" - "sub %0, %1 \n\t" - "sub %0, %2 \n\t" - "lea " LOCAL_MANGLE(9b) ", %3 \n\t" - "sub %0, %3 \n\t" - - - : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), - "=r" (fragmentLengthB) - ); - - xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers - fragmentPos = 0; - - for (i = 0; i < dstW / numSplits; i++) { - int xx = xpos >> 16; - - if ((i & 3) == 0) { - int a = 0; - int b = ((xpos + xInc) >> 16) - xx; - int c = ((xpos + xInc * 2) >> 16) - xx; - int d = ((xpos + xInc * 3) >> 16) - xx; - int inc = (d + 1 < 4); - uint8_t *fragment = (d + 1 < 4) ? fragmentB : fragmentA; - x86_reg imm8OfPShufW1 = (d + 1 < 4) ? imm8OfPShufW1B : imm8OfPShufW1A; - x86_reg imm8OfPShufW2 = (d + 1 < 4) ? imm8OfPShufW2B : imm8OfPShufW2A; - x86_reg fragmentLength = (d + 1 < 4) ? fragmentLengthB : fragmentLengthA; - int maxShift = 3 - (d + inc); - int shift = 0; - - if (filterCode) { - filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9; - filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9; - filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9; - filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9; - filterPos[i / 2] = xx; - - memcpy(filterCode + fragmentPos, fragment, fragmentLength); - - filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) | - ((b + inc) << 2) | - ((c + inc) << 4) | - ((d + inc) << 6); - filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) | - (c << 4) | - (d << 6); - - if (i + 4 - inc >= dstW) - shift = maxShift; // avoid overread - else if ((filterPos[i / 2] & 3) <= maxShift) - shift = filterPos[i / 2] & 3; // align - - if (shift && i >= shift) { - filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift; - filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift; - filterPos[i / 2] -= shift; - } - } - - fragmentPos += fragmentLength; - - if (filterCode) - filterCode[fragmentPos] = RET; - } - xpos += xInc; + int64_t W, V, Z, Cy, Cu, Cv; + int64_t vr = table[0]; + int64_t ub = table[1]; + int64_t ug = -table[2]; + int64_t vg = -table[3]; + int64_t ONE = 65536; + int64_t cy = ONE; + uint8_t *p = (uint8_t*)c->input_rgb2yuv_table; + int i; + static const int8_t map[] = { + BY_IDX, GY_IDX, -1 , BY_IDX, BY_IDX, GY_IDX, -1 , BY_IDX, + RY_IDX, -1 , GY_IDX, RY_IDX, RY_IDX, -1 , GY_IDX, RY_IDX, + RY_IDX, GY_IDX, -1 , RY_IDX, RY_IDX, GY_IDX, -1 , RY_IDX, + BY_IDX, -1 , GY_IDX, BY_IDX, BY_IDX, -1 , GY_IDX, BY_IDX, + BU_IDX, GU_IDX, -1 , BU_IDX, BU_IDX, GU_IDX, -1 , BU_IDX, + RU_IDX, -1 , GU_IDX, RU_IDX, RU_IDX, -1 , GU_IDX, RU_IDX, + RU_IDX, GU_IDX, -1 , RU_IDX, RU_IDX, GU_IDX, -1 , RU_IDX, + BU_IDX, -1 , GU_IDX, BU_IDX, BU_IDX, -1 , GU_IDX, BU_IDX, + BV_IDX, GV_IDX, -1 , BV_IDX, BV_IDX, GV_IDX, -1 , BV_IDX, + RV_IDX, -1 , GV_IDX, RV_IDX, RV_IDX, -1 , GV_IDX, RV_IDX, + RV_IDX, GV_IDX, -1 , RV_IDX, RV_IDX, GV_IDX, -1 , RV_IDX, + BV_IDX, -1 , GV_IDX, BV_IDX, BV_IDX, -1 , GV_IDX, BV_IDX, + RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX, + BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX, + GY_IDX, -1 , GY_IDX, -1 , GY_IDX, -1 , GY_IDX, -1 , + -1 , GY_IDX, -1 , GY_IDX, -1 , GY_IDX, -1 , GY_IDX, + RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX, + BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX, + GU_IDX, -1 , GU_IDX, -1 , GU_IDX, -1 , GU_IDX, -1 , + -1 , GU_IDX, -1 , GU_IDX, -1 , GU_IDX, -1 , GU_IDX, + RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX, + BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX, + GV_IDX, -1 , GV_IDX, -1 , GV_IDX, -1 , GV_IDX, -1 , + -1 , GV_IDX, -1 , GV_IDX, -1 , GV_IDX, -1 , GV_IDX, //23 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //24 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //25 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //26 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //27 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //28 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //29 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //30 + -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //31 + BY_IDX, GY_IDX, RY_IDX, -1 , -1 , -1 , -1 , -1 , //32 + BU_IDX, GU_IDX, RU_IDX, -1 , -1 , -1 , -1 , -1 , //33 + BV_IDX, GV_IDX, RV_IDX, -1 , -1 , -1 , -1 , -1 , //34 + }; + + dstRange = 0; //FIXME range = 1 is handled elsewhere + + if (!dstRange) { + cy = cy * 255 / 219; + } else { + vr = vr * 224 / 255; + ub = ub * 224 / 255; + ug = ug * 224 / 255; + vg = vg * 224 / 255; } - if (filterCode) - filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part - - return fragmentPos + 1; + W = ROUNDED_DIV(ONE*ONE*ug, ub); + V = ROUNDED_DIV(ONE*ONE*vg, vr); + Z = ONE*ONE-W-V; + + Cy = ROUNDED_DIV(cy*Z, ONE); + Cu = ROUNDED_DIV(ub*Z, ONE); + Cv = ROUNDED_DIV(vr*Z, ONE); + + c->input_rgb2yuv_table[RY_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*V , Cy); + c->input_rgb2yuv_table[GY_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*ONE*ONE , Cy); + c->input_rgb2yuv_table[BY_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*W , Cy); + + c->input_rgb2yuv_table[RU_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*V , Cu); + c->input_rgb2yuv_table[GU_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*ONE*ONE , Cu); + c->input_rgb2yuv_table[BU_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*(Z+W) , Cu); + + c->input_rgb2yuv_table[RV_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*(V+Z) , Cv); + c->input_rgb2yuv_table[GV_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*ONE*ONE , Cv); + c->input_rgb2yuv_table[BV_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*W , Cv); + + if(/*!dstRange && */!memcmp(table, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], sizeof(ff_yuv2rgb_coeffs[SWS_CS_DEFAULT]))) { + c->input_rgb2yuv_table[BY_IDX] = ((int)(0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[BV_IDX] = (-(int)(0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[BU_IDX] = ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[GY_IDX] = ((int)(0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[GV_IDX] = (-(int)(0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[GU_IDX] = (-(int)(0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[RY_IDX] = ((int)(0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[RV_IDX] = ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + c->input_rgb2yuv_table[RU_IDX] = (-(int)(0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5)); + } + for(i=0; i<FF_ARRAY_ELEMS(map); i++) + AV_WL16(p + 16*4 + 2*i, map[i] >= 0 ? c->input_rgb2yuv_table[map[i]] : 0); } -#endif /* HAVE_MMXEXT_INLINE */ -static void getSubSampleFactors(int *h, int *v, enum AVPixelFormat format) +static void fill_xyztables(struct SwsContext *c) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(format); - *h = desc->log2_chroma_w; - *v = desc->log2_chroma_h; + int i; + double xyzgamma = XYZ_GAMMA; + double rgbgamma = 1.0 / RGB_GAMMA; + double xyzgammainv = 1.0 / XYZ_GAMMA; + double rgbgammainv = RGB_GAMMA; + static const int16_t xyz2rgb_matrix[3][4] = { + {13270, -6295, -2041}, + {-3969, 7682, 170}, + { 228, -835, 4329} }; + static const int16_t rgb2xyz_matrix[3][4] = { + {1689, 1464, 739}, + { 871, 2929, 296}, + { 79, 488, 3891} }; + static int16_t xyzgamma_tab[4096], rgbgamma_tab[4096], xyzgammainv_tab[4096], rgbgammainv_tab[4096]; + + memcpy(c->xyz2rgb_matrix, xyz2rgb_matrix, sizeof(c->xyz2rgb_matrix)); + memcpy(c->rgb2xyz_matrix, rgb2xyz_matrix, sizeof(c->rgb2xyz_matrix)); + c->xyzgamma = xyzgamma_tab; + c->rgbgamma = rgbgamma_tab; + c->xyzgammainv = xyzgammainv_tab; + c->rgbgammainv = rgbgammainv_tab; + + if (rgbgamma_tab[4095]) + return; + + /* set gamma vectors */ + for (i = 0; i < 4096; i++) { + xyzgamma_tab[i] = lrint(pow(i / 4095.0, xyzgamma) * 4095.0); + rgbgamma_tab[i] = lrint(pow(i / 4095.0, rgbgamma) * 4095.0); + xyzgammainv_tab[i] = lrint(pow(i / 4095.0, xyzgammainv) * 4095.0); + rgbgammainv_tab[i] = lrint(pow(i / 4095.0, rgbgammainv) * 4095.0); + } } int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation) { - const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat); - const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat); - memcpy(c->srcColorspaceTable, inv_table, sizeof(int) * 4); - memcpy(c->dstColorspaceTable, table, sizeof(int) * 4); + const AVPixFmtDescriptor *desc_dst; + const AVPixFmtDescriptor *desc_src; + int need_reinit = 0; + memmove(c->srcColorspaceTable, inv_table, sizeof(int) * 4); + memmove(c->dstColorspaceTable, table, sizeof(int) * 4); + + handle_formats(c); + desc_dst = av_pix_fmt_desc_get(c->dstFormat); + desc_src = av_pix_fmt_desc_get(c->srcFormat); + + if(!isYUV(c->dstFormat) && !isGray(c->dstFormat)) + dstRange = 0; + if(!isYUV(c->srcFormat) && !isGray(c->srcFormat)) + srcRange = 0; c->brightness = brightness; c->contrast = contrast; c->saturation = saturation; + if (c->srcRange != srcRange || c->dstRange != dstRange) + need_reinit = 1; c->srcRange = srcRange; c->dstRange = dstRange; - if (isYUV(c->dstFormat) || isGray(c->dstFormat)) + + //The srcBpc check is possibly wrong but we seem to lack a definitive reference to test this + //and what we have in ticket 2939 looks better with this check + if (need_reinit && (c->srcBpc == 8 || !isYUV(c->srcFormat))) + ff_sws_init_range_convert(c); + + if ((isYUV(c->dstFormat) || isGray(c->dstFormat)) && (isYUV(c->srcFormat) || isGray(c->srcFormat))) return -1; c->dstFormatBpp = av_get_bits_per_pixel(desc_dst); c->srcFormatBpp = av_get_bits_per_pixel(desc_src); - ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, - contrast, saturation); - // FIXME factorize + if (!isYUV(c->dstFormat) && !isGray(c->dstFormat)) { + ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, + contrast, saturation); + // FIXME factorize + + if (ARCH_PPC) + ff_yuv2rgb_init_tables_ppc(c, inv_table, brightness, + contrast, saturation); + } + + fill_rgb2yuv_table(c, table, dstRange); - if (ARCH_PPC) - ff_yuv2rgb_init_tables_ppc(c, inv_table, brightness, - contrast, saturation); return 0; } @@ -823,7 +868,7 @@ int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation) { - if (isYUV(c->dstFormat) || isGray(c->dstFormat)) + if (!c ) return -1; *inv_table = c->srcColorspaceTable; @@ -843,6 +888,9 @@ static int handle_jpeg(enum AVPixelFormat *format) case AV_PIX_FMT_YUVJ420P: *format = AV_PIX_FMT_YUV420P; return 1; + case AV_PIX_FMT_YUVJ411P: + *format = AV_PIX_FMT_YUV411P; + return 1; case AV_PIX_FMT_YUVJ422P: *format = AV_PIX_FMT_YUV422P; return 1; @@ -852,15 +900,51 @@ static int handle_jpeg(enum AVPixelFormat *format) case AV_PIX_FMT_YUVJ440P: *format = AV_PIX_FMT_YUV440P; return 1; + case AV_PIX_FMT_GRAY8: + case AV_PIX_FMT_GRAY16LE: + case AV_PIX_FMT_GRAY16BE: + return 1; default: return 0; } } +static int handle_0alpha(enum AVPixelFormat *format) +{ + switch (*format) { + case AV_PIX_FMT_0BGR : *format = AV_PIX_FMT_ABGR ; return 1; + case AV_PIX_FMT_BGR0 : *format = AV_PIX_FMT_BGRA ; return 4; + case AV_PIX_FMT_0RGB : *format = AV_PIX_FMT_ARGB ; return 1; + case AV_PIX_FMT_RGB0 : *format = AV_PIX_FMT_RGBA ; return 4; + default: return 0; + } +} + +static int handle_xyz(enum AVPixelFormat *format) +{ + switch (*format) { + case AV_PIX_FMT_XYZ12BE : *format = AV_PIX_FMT_RGB48BE; return 1; + case AV_PIX_FMT_XYZ12LE : *format = AV_PIX_FMT_RGB48LE; return 1; + default: return 0; + } +} + +static void handle_formats(SwsContext *c) +{ + c->src0Alpha |= handle_0alpha(&c->srcFormat); + c->dst0Alpha |= handle_0alpha(&c->dstFormat); + c->srcXYZ |= handle_xyz(&c->srcFormat); + c->dstXYZ |= handle_xyz(&c->dstFormat); + if (c->srcXYZ || c->dstXYZ) + fill_xyztables(c); +} + SwsContext *sws_alloc_context(void) { SwsContext *c = av_mallocz(sizeof(SwsContext)); + av_assert0(offsetof(SwsContext, redDither) + DITHER32_INT == offsetof(SwsContext, dither32)); + if (c) { c->av_class = &sws_context_class; av_opt_set_defaults(c); @@ -872,7 +956,7 @@ SwsContext *sws_alloc_context(void) av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) { - int i; + int i, j; int usesVFilter, usesHFilter; int unscaled; SwsFilter dummyFilter = { NULL, NULL, NULL, NULL }; @@ -880,13 +964,12 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, int srcH = c->srcH; int dstW = c->dstW; int dstH = c->dstH; - int dst_stride = FFALIGN(dstW * sizeof(int16_t) + 16, 16); - int dst_stride_px = dst_stride >> 1; + int dst_stride = FFALIGN(dstW * sizeof(int16_t) + 66, 16); int flags, cpu_flags; enum AVPixelFormat srcFormat = c->srcFormat; enum AVPixelFormat dstFormat = c->dstFormat; - const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat); - const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat); + const AVPixFmtDescriptor *desc_src; + const AVPixFmtDescriptor *desc_dst; cpu_flags = av_get_cpu_flags(); flags = c->flags; @@ -896,16 +979,33 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, unscaled = (srcW == dstW && srcH == dstH); + c->srcRange |= handle_jpeg(&c->srcFormat); + c->dstRange |= handle_jpeg(&c->dstFormat); + + if(srcFormat!=c->srcFormat || dstFormat!=c->dstFormat) + av_log(c, AV_LOG_WARNING, "deprecated pixel format used, make sure you did set range correctly\n"); + + if (!c->contrast && !c->saturation && !c->dstFormatBpp) + sws_setColorspaceDetails(c, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], c->srcRange, + ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], + c->dstRange, 0, 1 << 16, 1 << 16); + + handle_formats(c); + srcFormat = c->srcFormat; + dstFormat = c->dstFormat; + desc_src = av_pix_fmt_desc_get(srcFormat); + desc_dst = av_pix_fmt_desc_get(dstFormat); + if (!(unscaled && sws_isSupportedEndiannessConversion(srcFormat) && av_pix_fmt_swap_endianness(srcFormat) == dstFormat)) { if (!sws_isSupportedInput(srcFormat)) { av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n", - sws_format_name(srcFormat)); + av_get_pix_fmt_name(srcFormat)); return AVERROR(EINVAL); } if (!sws_isSupportedOutput(dstFormat)) { av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n", - sws_format_name(dstFormat)); + av_get_pix_fmt_name(dstFormat)); return AVERROR(EINVAL); } } @@ -925,19 +1025,19 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, /* provide a default scaler if not set by caller */ if (!i) { if (dstW < srcW && dstH < srcH) - flags |= SWS_GAUSS; + flags |= SWS_BICUBIC; else if (dstW > srcW && dstH > srcH) - flags |= SWS_SINC; + flags |= SWS_BICUBIC; else - flags |= SWS_LANCZOS; + flags |= SWS_BICUBIC; c->flags = flags; } else if (i & (i - 1)) { av_log(c, AV_LOG_ERROR, - "Exactly one scaler algorithm must be chosen\n"); + "Exactly one scaler algorithm must be chosen, got %X\n", i); return AVERROR(EINVAL); } /* sanity check */ - if (srcW < 4 || srcH < 1 || dstW < 8 || dstH < 1) { + if (srcW < 1 || srcH < 1 || dstW < 1 || dstH < 1) { /* FIXME check if these are enough and try to lower them after * fixing the relevant parts of the code */ av_log(c, AV_LOG_ERROR, "%dx%d -> %dx%d is invalid scaling dimension\n", @@ -965,9 +1065,56 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, (dstFilter->lumH && dstFilter->lumH->length > 1) || (dstFilter->chrH && dstFilter->chrH->length > 1); - getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat); - getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat); + av_pix_fmt_get_chroma_sub_sample(srcFormat, &c->chrSrcHSubSample, &c->chrSrcVSubSample); + av_pix_fmt_get_chroma_sub_sample(dstFormat, &c->chrDstHSubSample, &c->chrDstVSubSample); + + if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) { + if (dstW&1) { + av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to odd output size\n"); + flags |= SWS_FULL_CHR_H_INT; + c->flags = flags; + } + + if ( c->chrSrcHSubSample == 0 + && c->chrSrcVSubSample == 0 + && c->dither != SWS_DITHER_BAYER //SWS_FULL_CHR_H_INT is currently not supported with SWS_DITHER_BAYER + && !(c->flags & SWS_FAST_BILINEAR) + ) { + av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to input having non subsampled chroma\n"); + flags |= SWS_FULL_CHR_H_INT; + c->flags = flags; + } + } + + if (c->dither == SWS_DITHER_AUTO) { + if (flags & SWS_ERROR_DIFFUSION) + c->dither = SWS_DITHER_ED; + } + if(dstFormat == AV_PIX_FMT_BGR4_BYTE || + dstFormat == AV_PIX_FMT_RGB4_BYTE || + dstFormat == AV_PIX_FMT_BGR8 || + dstFormat == AV_PIX_FMT_RGB8) { + if (c->dither == SWS_DITHER_AUTO) + c->dither = (flags & SWS_FULL_CHR_H_INT) ? SWS_DITHER_ED : SWS_DITHER_BAYER; + if (!(flags & SWS_FULL_CHR_H_INT)) { + if (c->dither == SWS_DITHER_ED || c->dither == SWS_DITHER_A_DITHER || c->dither == SWS_DITHER_X_DITHER) { + av_log(c, AV_LOG_DEBUG, + "Desired dithering only supported in full chroma interpolation for destination format '%s'\n", + av_get_pix_fmt_name(dstFormat)); + flags |= SWS_FULL_CHR_H_INT; + c->flags = flags; + } + } + if (flags & SWS_FULL_CHR_H_INT) { + if (c->dither == SWS_DITHER_BAYER) { + av_log(c, AV_LOG_DEBUG, + "Ordered dither is not supported in full chroma interpolation for destination format '%s'\n", + av_get_pix_fmt_name(dstFormat)); + c->dither = SWS_DITHER_ED; + } + } + } if (isPlanarRGB(dstFormat)) { if (!(flags & SWS_FULL_CHR_H_INT)) { av_log(c, AV_LOG_DEBUG, @@ -988,10 +1135,15 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, dstFormat != AV_PIX_FMT_BGRA && dstFormat != AV_PIX_FMT_ABGR && dstFormat != AV_PIX_FMT_RGB24 && - dstFormat != AV_PIX_FMT_BGR24) { - av_log(c, AV_LOG_ERROR, + dstFormat != AV_PIX_FMT_BGR24 && + dstFormat != AV_PIX_FMT_BGR4_BYTE && + dstFormat != AV_PIX_FMT_RGB4_BYTE && + dstFormat != AV_PIX_FMT_BGR8 && + dstFormat != AV_PIX_FMT_RGB8 + ) { + av_log(c, AV_LOG_WARNING, "full chroma interpolation for destination format '%s' not yet implemented\n", - sws_format_name(dstFormat)); + av_get_pix_fmt_name(dstFormat)); flags &= ~SWS_FULL_CHR_H_INT; c->flags = flags; } @@ -1011,30 +1163,20 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, srcFormat != AV_PIX_FMT_RGB4_BYTE && srcFormat != AV_PIX_FMT_BGR4_BYTE && srcFormat != AV_PIX_FMT_GBRP9BE && srcFormat != AV_PIX_FMT_GBRP9LE && srcFormat != AV_PIX_FMT_GBRP10BE && srcFormat != AV_PIX_FMT_GBRP10LE && + srcFormat != AV_PIX_FMT_GBRP12BE && srcFormat != AV_PIX_FMT_GBRP12LE && + srcFormat != AV_PIX_FMT_GBRP14BE && srcFormat != AV_PIX_FMT_GBRP14LE && srcFormat != AV_PIX_FMT_GBRP16BE && srcFormat != AV_PIX_FMT_GBRP16LE && ((dstW >> c->chrDstHSubSample) <= (srcW >> 1) || (flags & SWS_FAST_BILINEAR))) c->chrSrcHSubSample = 1; - // Note the -((-x)>>y) is so that we always round toward +inf. - c->chrSrcW = -((-srcW) >> c->chrSrcHSubSample); - c->chrSrcH = -((-srcH) >> c->chrSrcVSubSample); - c->chrDstW = -((-dstW) >> c->chrDstHSubSample); - c->chrDstH = -((-dstH) >> c->chrDstVSubSample); + // Note the FF_CEIL_RSHIFT is so that we always round toward +inf. + c->chrSrcW = FF_CEIL_RSHIFT(srcW, c->chrSrcHSubSample); + c->chrSrcH = FF_CEIL_RSHIFT(srcH, c->chrSrcVSubSample); + c->chrDstW = FF_CEIL_RSHIFT(dstW, c->chrDstHSubSample); + c->chrDstH = FF_CEIL_RSHIFT(dstH, c->chrDstVSubSample); - /* unscaled special cases */ - if (unscaled && !usesHFilter && !usesVFilter && - (c->srcRange == c->dstRange || isAnyRGB(dstFormat))) { - ff_get_unscaled_swscale(c); - - if (c->swscale) { - if (flags & SWS_PRINT_INFO) - av_log(c, AV_LOG_INFO, - "using unscaled %s -> %s special converter\n", - sws_format_name(srcFormat), sws_format_name(dstFormat)); - return 0; - } - } + FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail); c->srcBpc = 1 + desc_src->comp[0].depth_minus1; if (c->srcBpc < 8) @@ -1042,21 +1184,23 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, c->dstBpc = 1 + desc_dst->comp[0].depth_minus1; if (c->dstBpc < 8) c->dstBpc = 8; + if (isAnyRGB(srcFormat) || srcFormat == AV_PIX_FMT_PAL8) + c->srcBpc = 16; if (c->dstBpc == 16) dst_stride <<= 1; - FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, - (FFALIGN(srcW, 16) * 2 * FFALIGN(c->srcBpc, 8) >> 3) + 16, - fail); - if (INLINE_MMXEXT(cpu_flags) && c->srcBpc == 8 && c->dstBpc <= 10) { - c->canMMXEXTBeUsed = (dstW >= srcW && (dstW & 31) == 0 && - (srcW & 15) == 0) ? 1 : 0; - if (!c->canMMXEXTBeUsed && dstW >= srcW && (srcW & 15) == 0 + + if (INLINE_MMXEXT(cpu_flags) && c->srcBpc == 8 && c->dstBpc <= 14) { + c->canMMXEXTBeUsed = dstW >= srcW && (dstW & 31) == 0 && + c->chrDstW >= c->chrSrcW && + (srcW & 15) == 0; + if (!c->canMMXEXTBeUsed && dstW >= srcW && c->chrDstW >= c->chrSrcW && (srcW & 15) == 0 + && (flags & SWS_FAST_BILINEAR)) { if (flags & SWS_PRINT_INFO) av_log(c, AV_LOG_INFO, "output width is not a multiple of 32 -> no MMXEXT scaler\n"); } - if (usesHFilter) + if (usesHFilter || isNBPS(c->srcFormat) || is16BPS(c->srcFormat) || isAnyRGB(c->srcFormat)) c->canMMXEXTBeUsed = 0; } else c->canMMXEXTBeUsed = 0; @@ -1077,7 +1221,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, c->chrXInc += 20; } // we don't use the x86 asm scaler if MMX is available - else if (INLINE_MMX(cpu_flags)) { + else if (INLINE_MMX(cpu_flags) && c->dstBpc <= 14) { c->lumXInc = ((int64_t)(srcW - 2) << 16) / (dstW - 2) - 20; c->chrXInc = ((int64_t)(c->chrSrcW - 2) << 16) / (c->chrDstW - 2) - 20; } @@ -1090,9 +1234,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, #if HAVE_MMXEXT_INLINE // can't downscale !!! if (c->canMMXEXTBeUsed && (flags & SWS_FAST_BILINEAR)) { - c->lumMmxextFilterCodeSize = init_hscaler_mmxext(dstW, c->lumXInc, NULL, + c->lumMmxextFilterCodeSize = ff_init_hscaler_mmxext(dstW, c->lumXInc, NULL, NULL, NULL, 8); - c->chrMmxextFilterCodeSize = init_hscaler_mmxext(c->chrDstW, c->chrXInc, + c->chrMmxextFilterCodeSize = ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc, NULL, NULL, NULL, 4); #if USE_MMAP @@ -1118,21 +1262,32 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, c->chrMmxextFilterCode = av_malloc(c->chrMmxextFilterCodeSize); #endif +#ifdef MAP_ANONYMOUS + if (c->lumMmxextFilterCode == MAP_FAILED || c->chrMmxextFilterCode == MAP_FAILED) +#else if (!c->lumMmxextFilterCode || !c->chrMmxextFilterCode) +#endif + { + av_log(c, AV_LOG_ERROR, "Failed to allocate MMX2FilterCode\n"); return AVERROR(ENOMEM); + } + FF_ALLOCZ_OR_GOTO(c, c->hLumFilter, (dstW / 8 + 8) * sizeof(int16_t), fail); FF_ALLOCZ_OR_GOTO(c, c->hChrFilter, (c->chrDstW / 4 + 8) * sizeof(int16_t), fail); FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW / 2 / 8 + 8) * sizeof(int32_t), fail); FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail); - init_hscaler_mmxext(dstW, c->lumXInc, c->lumMmxextFilterCode, - c->hLumFilter, c->hLumFilterPos, 8); - init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode, - c->hChrFilter, c->hChrFilterPos, 4); + ff_init_hscaler_mmxext( dstW, c->lumXInc, c->lumMmxextFilterCode, + c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8); + ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode, + c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4); #if USE_MMAP - mprotect(c->lumMmxextFilterCode, c->lumMmxextFilterCodeSize, PROT_EXEC | PROT_READ); - mprotect(c->chrMmxextFilterCode, c->chrMmxextFilterCodeSize, PROT_EXEC | PROT_READ); + if ( mprotect(c->lumMmxextFilterCode, c->lumMmxextFilterCodeSize, PROT_EXEC | PROT_READ) == -1 + || mprotect(c->chrMmxextFilterCode, c->chrMmxextFilterCodeSize, PROT_EXEC | PROT_READ) == -1) { + av_log(c, AV_LOG_ERROR, "mprotect failed, cannot use fast bilinear scaler\n"); + goto fail; + } #endif } else #endif /* HAVE_MMXEXT_INLINE */ @@ -1145,14 +1300,18 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, srcW, dstW, filterAlign, 1 << 14, (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags, cpu_flags, srcFilter->lumH, dstFilter->lumH, - c->param, 1) < 0) + c->param, + get_local_pos(c, 0, 0, 0), + get_local_pos(c, 0, 0, 0)) < 0) goto fail; if (initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags, cpu_flags, srcFilter->chrH, dstFilter->chrH, - c->param, 1) < 0) + c->param, + get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), + get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0)) < 0) goto fail; } } // initialize horizontal stuff @@ -1166,14 +1325,19 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, c->lumYInc, srcH, dstH, filterAlign, (1 << 12), (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags, cpu_flags, srcFilter->lumV, dstFilter->lumV, - c->param, 0) < 0) + c->param, + get_local_pos(c, 0, 0, 1), + get_local_pos(c, 0, 0, 1)) < 0) goto fail; if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc, c->chrSrcH, c->chrDstH, filterAlign, (1 << 12), (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags, cpu_flags, srcFilter->chrV, dstFilter->chrV, - c->param, 0) < 0) + c->param, + get_local_pos(c, c->chrSrcVSubSample, c->src_v_chr_pos, 1), + get_local_pos(c, c->chrDstVSubSample, c->dst_v_chr_pos, 1)) < 0) + goto fail; #if HAVE_ALTIVEC @@ -1215,6 +1379,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, c->vChrFilterPos[chrI]; } + for (i = 0; i < 4; i++) + FF_ALLOCZ_OR_GOTO(c, c->dither_error[i], (c->dstW+2) * sizeof(int), fail); + /* Allocate pixbufs (we use dynamic allocation because otherwise we would * need to allocate several megabytes to handle all possible cases) */ FF_ALLOC_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize * 3 * sizeof(int16_t *), fail); @@ -1230,9 +1397,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, dst_stride + 16, fail); c->lumPixBuf[i] = c->lumPixBuf[i + c->vLumBufSize]; } - // 64 / (c->dstBpc & ~7) is the same as 16 / sizeof(scaling_intermediate) - c->uv_off_px = dst_stride_px + 64 / (c->dstBpc & ~7); - c->uv_off_byte = dst_stride + 16; + // 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate) + c->uv_off = (dst_stride>>1) + 64 / (c->dstBpc &~ 7); + c->uv_offx2 = dst_stride + 16; for (i = 0; i < c->vChrBufSize; i++) { FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i + c->vChrBufSize], dst_stride * 2 + 32, fail); @@ -1249,38 +1416,30 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, // try to avoid drawing green stuff between the right end and the stride end for (i = 0; i < c->vChrBufSize; i++) - memset(c->chrUPixBuf[i], 64, dst_stride * 2 + 1); + if(desc_dst->comp[0].depth_minus1 == 15){ + av_assert0(c->dstBpc > 14); + for(j=0; j<dst_stride/2+1; j++) + ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18; + } else + for(j=0; j<dst_stride+1; j++) + ((int16_t*)(c->chrUPixBuf[i]))[j] = 1<<14; - assert(c->chrDstH <= dstH); + av_assert0(c->chrDstH <= dstH); if (flags & SWS_PRINT_INFO) { - if (flags & SWS_FAST_BILINEAR) - av_log(c, AV_LOG_INFO, "FAST_BILINEAR scaler, "); - else if (flags & SWS_BILINEAR) - av_log(c, AV_LOG_INFO, "BILINEAR scaler, "); - else if (flags & SWS_BICUBIC) - av_log(c, AV_LOG_INFO, "BICUBIC scaler, "); - else if (flags & SWS_X) - av_log(c, AV_LOG_INFO, "Experimental scaler, "); - else if (flags & SWS_POINT) - av_log(c, AV_LOG_INFO, "Nearest Neighbor / POINT scaler, "); - else if (flags & SWS_AREA) - av_log(c, AV_LOG_INFO, "Area Averaging scaler, "); - else if (flags & SWS_BICUBLIN) - av_log(c, AV_LOG_INFO, "luma BICUBIC / chroma BILINEAR scaler, "); - else if (flags & SWS_GAUSS) - av_log(c, AV_LOG_INFO, "Gaussian scaler, "); - else if (flags & SWS_SINC) - av_log(c, AV_LOG_INFO, "Sinc scaler, "); - else if (flags & SWS_LANCZOS) - av_log(c, AV_LOG_INFO, "Lanczos scaler, "); - else if (flags & SWS_SPLINE) - av_log(c, AV_LOG_INFO, "Bicubic spline scaler, "); - else - av_log(c, AV_LOG_INFO, "ehh flags invalid?! "); + const char *scaler = NULL, *cpucaps; - av_log(c, AV_LOG_INFO, "from %s to %s%s ", - sws_format_name(srcFormat), + for (i = 0; i < FF_ARRAY_ELEMS(scale_algorithms); i++) { + if (flags & scale_algorithms[i].flag) { + scaler = scale_algorithms[i].description; + break; + } + } + if (!scaler) + scaler = "ehh flags invalid?!"; + av_log(c, AV_LOG_INFO, "%s scaler, from %s to %s%s ", + scaler, + av_get_pix_fmt_name(srcFormat), #ifdef DITHER1XBPP dstFormat == AV_PIX_FMT_BGR555 || dstFormat == AV_PIX_FMT_BGR565 || dstFormat == AV_PIX_FMT_RGB444BE || dstFormat == AV_PIX_FMT_RGB444LE || @@ -1289,18 +1448,20 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, #else "", #endif - sws_format_name(dstFormat)); + av_get_pix_fmt_name(dstFormat)); if (INLINE_MMXEXT(cpu_flags)) - av_log(c, AV_LOG_INFO, "using MMXEXT\n"); + cpucaps = "MMXEXT"; else if (INLINE_AMD3DNOW(cpu_flags)) - av_log(c, AV_LOG_INFO, "using 3DNOW\n"); + cpucaps = "3DNOW"; else if (INLINE_MMX(cpu_flags)) - av_log(c, AV_LOG_INFO, "using MMX\n"); + cpucaps = "MMX"; else if (PPC_ALTIVEC(cpu_flags)) - av_log(c, AV_LOG_INFO, "using AltiVec\n"); + cpucaps = "AltiVec"; else - av_log(c, AV_LOG_INFO, "using C\n"); + cpucaps = "C"; + + av_log(c, AV_LOG_INFO, "using %s\n", cpucaps); av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH); av_log(c, AV_LOG_DEBUG, @@ -1312,6 +1473,20 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, c->chrXInc, c->chrYInc); } + /* unscaled special cases */ + if (unscaled && !usesHFilter && !usesVFilter && + (c->srcRange == c->dstRange || isAnyRGB(dstFormat))) { + ff_get_unscaled_swscale(c); + + if (c->swscale) { + if (flags & SWS_PRINT_INFO) + av_log(c, AV_LOG_INFO, + "using unscaled %s -> %s special converter\n", + av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat)); + return 0; + } + } + c->swscale = ff_getSwsFunc(c); return 0; fail: // FIXME replace things by appropriate error codes @@ -1333,8 +1508,6 @@ SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat, c->srcH = srcH; c->dstW = dstW; c->dstH = dstH; - c->srcRange = handle_jpeg(&srcFormat); - c->dstRange = handle_jpeg(&dstFormat); c->srcFormat = srcFormat; c->dstFormat = dstFormat; @@ -1342,9 +1515,6 @@ SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat, c->param[0] = param[0]; c->param[1] = param[1]; } - sws_setColorspaceDetails(c, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], c->srcRange, - ff_yuv2rgb_coeffs[SWS_CS_DEFAULT] /* FIXME*/, - c->dstRange, 0, 1 << 16, 1 << 16); if (sws_init_context(c, srcFilter, dstFilter) < 0) { sws_freeContext(c); @@ -1418,7 +1588,12 @@ SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur, SwsVector *sws_allocVec(int length) { - SwsVector *vec = av_malloc(sizeof(SwsVector)); + SwsVector *vec; + + if(length <= 0 || length > INT_MAX/ sizeof(double)) + return NULL; + + vec = av_malloc(sizeof(SwsVector)); if (!vec) return NULL; vec->length = length; @@ -1433,7 +1608,12 @@ SwsVector *sws_getGaussianVec(double variance, double quality) const int length = (int)(variance * quality + 0.5) | 1; int i; double middle = (length - 1) * 0.5; - SwsVector *vec = sws_allocVec(length); + SwsVector *vec; + + if(variance < 0 || quality < 0) + return NULL; + + vec = sws_allocVec(length); if (!vec) return NULL; @@ -1600,14 +1780,12 @@ void sws_convVec(SwsVector *a, SwsVector *b) SwsVector *sws_cloneVec(SwsVector *a) { - int i; SwsVector *vec = sws_allocVec(a->length); if (!vec) return NULL; - for (i = 0; i < a->length; i++) - vec->coeff[i] = a->coeff[i]; + memcpy(vec->coeff, a->coeff, a->length * sizeof(*a->coeff)); return vec; } @@ -1652,14 +1830,10 @@ void sws_freeFilter(SwsFilter *filter) if (!filter) return; - if (filter->lumH) - sws_freeVec(filter->lumH); - if (filter->lumV) - sws_freeVec(filter->lumV); - if (filter->chrH) - sws_freeVec(filter->chrH); - if (filter->chrV) - sws_freeVec(filter->chrV); + sws_freeVec(filter->lumH); + sws_freeVec(filter->lumV); + sws_freeVec(filter->chrH); + sws_freeVec(filter->chrV); av_free(filter); } @@ -1688,6 +1862,9 @@ void sws_freeContext(SwsContext *c) av_freep(&c->alpPixBuf); } + for (i = 0; i < 4; i++) + av_freep(&c->dither_error[i]); + av_freep(&c->vLumFilter); av_freep(&c->vChrFilter); av_freep(&c->hLumFilter); @@ -1722,7 +1899,7 @@ void sws_freeContext(SwsContext *c) #endif /* HAVE_MMX_INLINE */ av_freep(&c->yuvTable); - av_free(c->formatConvBuffer); + av_freep(&c->formatConvBuffer); av_free(c); } @@ -1760,19 +1937,13 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context, int srcW, return NULL; context->srcW = srcW; context->srcH = srcH; - context->srcRange = handle_jpeg(&srcFormat); context->srcFormat = srcFormat; context->dstW = dstW; context->dstH = dstH; - context->dstRange = handle_jpeg(&dstFormat); context->dstFormat = dstFormat; context->flags = flags; context->param[0] = param[0]; context->param[1] = param[1]; - sws_setColorspaceDetails(context, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], - context->srcRange, - ff_yuv2rgb_coeffs[SWS_CS_DEFAULT] /* FIXME*/, - context->dstRange, 0, 1 << 16, 1 << 16); if (sws_init_context(context, srcFilter, dstFilter) < 0) { sws_freeContext(context); return NULL; diff --git a/libswscale/version.h b/libswscale/version.h index 7213ab3..09da910 100644 --- a/libswscale/version.h +++ b/libswscale/version.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,8 +27,8 @@ #include "libavutil/version.h" #define LIBSWSCALE_VERSION_MAJOR 2 -#define LIBSWSCALE_VERSION_MINOR 1 -#define LIBSWSCALE_VERSION_MICRO 3 +#define LIBSWSCALE_VERSION_MINOR 6 +#define LIBSWSCALE_VERSION_MICRO 101 #define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \ LIBSWSCALE_VERSION_MINOR, \ @@ -49,6 +49,9 @@ #ifndef FF_API_SWS_CPU_CAPS #define FF_API_SWS_CPU_CAPS (LIBSWSCALE_VERSION_MAJOR < 3) #endif +#ifndef FF_API_SWS_FORMAT_NAME +#define FF_API_SWS_FORMAT_NAME (LIBSWSCALE_VERSION_MAJOR < 3) +#endif #ifndef FF_API_ARCH_BFIN #define FF_API_ARCH_BFIN (LIBSWSCALE_VERSION_MAJOR < 3) #endif diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index b94b14a..6901207 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -1,7 +1,11 @@ +$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) + OBJS += x86/rgb2rgb.o \ x86/swscale.o \ x86/yuv2rgb.o \ +MMX-OBJS += x86/hscale_fast_bilinear_simd.o \ + OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o YASM-OBJS += x86/input.o \ diff --git a/libswscale/x86/hscale_fast_bilinear_simd.c b/libswscale/x86/hscale_fast_bilinear_simd.c new file mode 100644 index 0000000..103793d --- /dev/null +++ b/libswscale/x86/hscale_fast_bilinear_simd.c @@ -0,0 +1,374 @@ +/* + * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "../swscale_internal.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" + +#define RET 0xC3 // near return opcode for x86 +#define PREFETCH "prefetchnta" + +#if HAVE_INLINE_ASM +av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, + int16_t *filter, int32_t *filterPos, + int numSplits) +{ + uint8_t *fragmentA; + x86_reg imm8OfPShufW1A; + x86_reg imm8OfPShufW2A; + x86_reg fragmentLengthA; + uint8_t *fragmentB; + x86_reg imm8OfPShufW1B; + x86_reg imm8OfPShufW2B; + x86_reg fragmentLengthB; + int fragmentPos; + + int xpos, i; + + // create an optimized horizontal scaling routine + /* This scaler is made of runtime-generated MMXEXT code using specially tuned + * pshufw instructions. For every four output pixels, if four input pixels + * are enough for the fast bilinear scaling, then a chunk of fragmentB is + * used. If five input pixels are needed, then a chunk of fragmentA is used. + */ + + // code fragment + + __asm__ volatile ( + "jmp 9f \n\t" + // Begin + "0: \n\t" + "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" + "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" + "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "pshufw $0xFF, %%mm1, %%mm1 \n\t" + "1: \n\t" + "pshufw $0xFF, %%mm0, %%mm0 \n\t" + "2: \n\t" + "psubw %%mm1, %%mm0 \n\t" + "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "psllw $7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + + "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" + + "add $8, %%"REG_a" \n\t" + // End + "9: \n\t" + // "int $3 \n\t" + "lea " LOCAL_MANGLE(0b) ", %0 \n\t" + "lea " LOCAL_MANGLE(1b) ", %1 \n\t" + "lea " LOCAL_MANGLE(2b) ", %2 \n\t" + "dec %1 \n\t" + "dec %2 \n\t" + "sub %0, %1 \n\t" + "sub %0, %2 \n\t" + "lea " LOCAL_MANGLE(9b) ", %3 \n\t" + "sub %0, %3 \n\t" + + + : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), + "=r" (fragmentLengthA) + ); + + __asm__ volatile ( + "jmp 9f \n\t" + // Begin + "0: \n\t" + "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" + "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "pshufw $0xFF, %%mm0, %%mm1 \n\t" + "1: \n\t" + "pshufw $0xFF, %%mm0, %%mm0 \n\t" + "2: \n\t" + "psubw %%mm1, %%mm0 \n\t" + "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "psllw $7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + + "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" + + "add $8, %%"REG_a" \n\t" + // End + "9: \n\t" + // "int $3 \n\t" + "lea " LOCAL_MANGLE(0b) ", %0 \n\t" + "lea " LOCAL_MANGLE(1b) ", %1 \n\t" + "lea " LOCAL_MANGLE(2b) ", %2 \n\t" + "dec %1 \n\t" + "dec %2 \n\t" + "sub %0, %1 \n\t" + "sub %0, %2 \n\t" + "lea " LOCAL_MANGLE(9b) ", %3 \n\t" + "sub %0, %3 \n\t" + + + : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), + "=r" (fragmentLengthB) + ); + + xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers + fragmentPos = 0; + + for (i = 0; i < dstW / numSplits; i++) { + int xx = xpos >> 16; + + if ((i & 3) == 0) { + int a = 0; + int b = ((xpos + xInc) >> 16) - xx; + int c = ((xpos + xInc * 2) >> 16) - xx; + int d = ((xpos + xInc * 3) >> 16) - xx; + int inc = (d + 1 < 4); + uint8_t *fragment = inc ? fragmentB : fragmentA; + x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A; + x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A; + x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA; + int maxShift = 3 - (d + inc); + int shift = 0; + + if (filterCode) { + filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9; + filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9; + filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9; + filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9; + filterPos[i / 2] = xx; + + memcpy(filterCode + fragmentPos, fragment, fragmentLength); + + filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) | + ((b + inc) << 2) | + ((c + inc) << 4) | + ((d + inc) << 6); + filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) | + (c << 4) | + (d << 6); + + if (i + 4 - inc >= dstW) + shift = maxShift; // avoid overread + else if ((filterPos[i / 2] & 3) <= maxShift) + shift = filterPos[i / 2] & 3; // align + + if (shift && i >= shift) { + filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift; + filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift; + filterPos[i / 2] -= shift; + } + } + + fragmentPos += fragmentLength; + + if (filterCode) + filterCode[fragmentPos] = RET; + } + xpos += xInc; + } + if (filterCode) + filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part + + return fragmentPos + 1; +} + +void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, + int dstWidth, const uint8_t *src, + int srcW, int xInc) +{ + int32_t *filterPos = c->hLumFilterPos; + int16_t *filter = c->hLumFilter; + void *mmxextFilterCode = c->lumMmxextFilterCode; + int i; +#if defined(PIC) + uint64_t ebxsave; +#endif +#if ARCH_X86_64 + uint64_t retsave; +#endif + + __asm__ volatile( +#if defined(PIC) + "mov %%"REG_b", %5 \n\t" +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %6 \n\t" +#endif +#else +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %5 \n\t" +#endif +#endif + "pxor %%mm7, %%mm7 \n\t" + "mov %0, %%"REG_c" \n\t" + "mov %1, %%"REG_D" \n\t" + "mov %2, %%"REG_d" \n\t" + "mov %3, %%"REG_b" \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" // i + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + +#if ARCH_X86_64 +#define CALL_MMXEXT_FILTER_CODE \ + "movl (%%"REG_b"), %%esi \n\t"\ + "call *%4 \n\t"\ + "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ + "add %%"REG_S", %%"REG_c" \n\t"\ + "add %%"REG_a", %%"REG_D" \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + +#else +#define CALL_MMXEXT_FILTER_CODE \ + "movl (%%"REG_b"), %%esi \n\t"\ + "call *%4 \n\t"\ + "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ + "add %%"REG_a", %%"REG_D" \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + +#endif /* ARCH_X86_64 */ + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + +#if defined(PIC) + "mov %5, %%"REG_b" \n\t" +#if ARCH_X86_64 + "mov %6, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#else +#if ARCH_X86_64 + "mov %5, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#endif + :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), + "m" (mmxextFilterCode) +#if defined(PIC) + ,"m" (ebxsave) +#endif +#if ARCH_X86_64 + ,"m"(retsave) +#endif + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D +#if !defined(PIC) + ,"%"REG_b +#endif + ); + + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) + dst[i] = src[srcW-1]*128; +} + +void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, + int dstWidth, const uint8_t *src1, + const uint8_t *src2, int srcW, int xInc) +{ + int32_t *filterPos = c->hChrFilterPos; + int16_t *filter = c->hChrFilter; + void *mmxextFilterCode = c->chrMmxextFilterCode; + int i; +#if defined(PIC) + DECLARE_ALIGNED(8, uint64_t, ebxsave); +#endif +#if ARCH_X86_64 + DECLARE_ALIGNED(8, uint64_t, retsave); +#endif + + __asm__ volatile( +#if defined(PIC) + "mov %%"REG_b", %7 \n\t" +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %8 \n\t" +#endif +#else +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %7 \n\t" +#endif +#endif + "pxor %%mm7, %%mm7 \n\t" + "mov %0, %%"REG_c" \n\t" + "mov %1, %%"REG_D" \n\t" + "mov %2, %%"REG_d" \n\t" + "mov %3, %%"REG_b" \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" // i + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + "xor %%"REG_a", %%"REG_a" \n\t" // i + "mov %5, %%"REG_c" \n\t" // src + "mov %6, %%"REG_D" \n\t" // buf2 + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + +#if defined(PIC) + "mov %7, %%"REG_b" \n\t" +#if ARCH_X86_64 + "mov %8, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#else +#if ARCH_X86_64 + "mov %7, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#endif + :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), + "m" (mmxextFilterCode), "m" (src2), "m"(dst2) +#if defined(PIC) + ,"m" (ebxsave) +#endif +#if ARCH_X86_64 + ,"m"(retsave) +#endif + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D +#if !defined(PIC) + ,"%"REG_b +#endif + ); + + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { + dst1[i] = src1[srcW-1]*128; + dst2[i] = src2[srcW-1]*128; + } +} +#endif //HAVE_INLINE_ASM diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 6f5677e..af9afca 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -4,20 +4,20 @@ ;* into YUV planes also. ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -35,33 +35,59 @@ SECTION_RODATA %define GV 0xD0E3 %define BV 0xF6E4 -rgb_Yrnd: times 4 dd 0x84000 ; 16.5 << 15 -rgb_UVrnd: times 4 dd 0x404000 ; 128.5 << 15 -bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY -bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY -rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY -rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY -bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU -bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU -rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU -rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU -bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV -bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV -rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV -rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV - -rgba_Ycoeff_rb: times 4 dw RY, BY -rgba_Ycoeff_br: times 4 dw BY, RY -rgba_Ycoeff_ga: times 4 dw GY, 0 -rgba_Ycoeff_ag: times 4 dw 0, GY -rgba_Ucoeff_rb: times 4 dw RU, BU -rgba_Ucoeff_br: times 4 dw BU, RU -rgba_Ucoeff_ga: times 4 dw GU, 0 -rgba_Ucoeff_ag: times 4 dw 0, GU -rgba_Vcoeff_rb: times 4 dw RV, BV -rgba_Vcoeff_br: times 4 dw BV, RV -rgba_Vcoeff_ga: times 4 dw GV, 0 -rgba_Vcoeff_ag: times 4 dw 0, GV +rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15 +rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15 +%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq +%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq +%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq +%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq +%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq +%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq +%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq +%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq +%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq +%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq +%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq +%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq + +%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq +%define rgba_Ycoeff_br 16*4 + 16*13 + tableq +%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq +%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq +%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq +%define rgba_Ucoeff_br 16*4 + 16*17 + tableq +%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq +%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq +%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq +%define rgba_Vcoeff_br 16*4 + 16*21 + tableq +%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq +%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq + +; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY +; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY +; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY +; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY +; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU +; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU +; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU +; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU +; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV +; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV +; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV +; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV + +; rgba_Ycoeff_rb: times 4 dw RY, BY +; rgba_Ycoeff_br: times 4 dw BY, RY +; rgba_Ycoeff_ga: times 4 dw GY, 0 +; rgba_Ycoeff_ag: times 4 dw 0, GY +; rgba_Ucoeff_rb: times 4 dw RU, BU +; rgba_Ucoeff_br: times 4 dw BU, RU +; rgba_Ucoeff_ga: times 4 dw GU, 0 +; rgba_Ucoeff_ag: times 4 dw 0, GU +; rgba_Vcoeff_rb: times 4 dw RV, BV +; rgba_Vcoeff_br: times 4 dw BV, RV +; rgba_Vcoeff_ga: times 4 dw GV, 0 +; rgba_Vcoeff_ag: times 4 dw 0, GV shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 @@ -82,7 +108,7 @@ SECTION .text ; %1 = nr. of XMM registers ; %2 = rgb or bgr %macro RGB24_TO_Y_FN 2-3 -cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w +cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table %if mmsize == 8 mova m5, [%2_Ycoeff_12x4] mova m6, [%2_Ycoeff_3x56] @@ -114,6 +140,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w %if ARCH_X86_64 movsxd wq, wd %endif + add wq, wq add dstq, wq neg wq %if notcpuflag(ssse3) @@ -157,12 +184,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7] paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] } paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] } - psrad m0, 15 - psrad m2, 15 + psrad m0, 9 + psrad m2, 9 packssdw m0, m2 ; (word) { Y[0-7] } - packuswb m0, m0 ; (byte) { Y[0-7] } - movh [dstq+wq], m0 - add wq, mmsize / 2 + mova [dstq+wq], m0 + add wq, mmsize jl .loop REP_RET %endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 @@ -171,7 +197,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w ; %1 = nr. of XMM registers ; %2 = rgb or bgr %macro RGB24_TO_UV_FN 2-3 -cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table %if ARCH_X86_64 mova m8, [%2_Ucoeff_12x4] mova m9, [%2_Ucoeff_3x56] @@ -202,10 +228,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w %endif ; x86-32/64 %endif ; cpuflag(ssse3) %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif + add wq, wq add dstUq, wq add dstVq, wq neg wq @@ -263,23 +290,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] } paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] } paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] } - psrad m0, 15 - psrad m2, 15 - psrad m1, 15 - psrad m4, 15 + psrad m0, 9 + psrad m2, 9 + psrad m1, 9 + psrad m4, 9 packssdw m0, m1 ; (word) { U[0-7] } packssdw m2, m4 ; (word) { V[0-7] } %if mmsize == 8 - packuswb m0, m0 ; (byte) { U[0-3] } - packuswb m2, m2 ; (byte) { V[0-3] } - movh [dstUq+wq], m0 - movh [dstVq+wq], m2 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %else ; mmsize == 16 - packuswb m0, m2 ; (byte) { U[0-7], V[0-7] } - movh [dstUq+wq], m0 - movhps [dstVq+wq], m0 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %endif ; mmsize == 8/16 - add wq, mmsize / 2 + add wq, mmsize jl .loop REP_RET %endif ; ARCH_X86_64 && %0 == 3 @@ -305,13 +329,15 @@ RGB24_FUNCS 10, 12 INIT_XMM ssse3 RGB24_FUNCS 11, 13 +%if HAVE_AVX_EXTERNAL INIT_XMM avx RGB24_FUNCS 11, 13 +%endif ; %1 = nr. of XMM registers ; %2-5 = rgba, bgra, argb or abgr (in individual characters) %macro RGB32_TO_Y_FN 5-6 -cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w +cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table mova m5, [rgba_Ycoeff_%2%4] mova m6, [rgba_Ycoeff_%3%5] %if %0 == 6 @@ -321,7 +347,9 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w %if ARCH_X86_64 movsxd wq, wd %endif - lea srcq, [srcq+wq*4] + add wq, wq + sub wq, mmsize - 1 + lea srcq, [srcq+wq*2] add dstq, wq neg wq mova m4, [rgb_Yrnd] @@ -329,8 +357,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w psrlw m7, 8 ; (word) { 0x00ff } x4 .loop: ; FIXME check alignment and use mova - movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] @@ -340,13 +368,29 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w paddd m2, m4 ; += rgb_Yrnd paddd m0, m1 ; (dword) { Y[0-3] } paddd m2, m3 ; (dword) { Y[4-7] } - psrad m0, 15 - psrad m2, 15 + psrad m0, 9 + psrad m2, 9 packssdw m0, m2 ; (word) { Y[0-7] } - packuswb m0, m0 ; (byte) { Y[0-7] } - movh [dstq+wq], m0 - add wq, mmsize / 2 + mova [dstq+wq], m0 + add wq, mmsize jl .loop + sub wq, mmsize - 1 + jz .end + add srcq, 2*mmsize - 2 + add dstq, mmsize - 1 +.loop2: + movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] + pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] + pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] + paddd m0, m4 ; += rgb_Yrnd + paddd m0, m1 ; (dword) { Y[0-3] } + psrad m0, 9 + packssdw m0, m0 ; (word) { Y[0-7] } + movd [dstq+wq], m0 + add wq, 2 + jl .loop2 +.end: REP_RET %endif ; %0 == 3 %endmacro @@ -354,7 +398,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w ; %1 = nr. of XMM registers ; %2-5 = rgba, bgra, argb or abgr (in individual characters) %macro RGB32_TO_UV_FN 5-6 -cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table %if ARCH_X86_64 mova m8, [rgba_Ucoeff_%2%4] mova m9, [rgba_Ucoeff_%3%5] @@ -375,21 +419,23 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w %else ; ARCH_X86_64 && %0 == 6 .body: %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif + add wq, wq + sub wq, mmsize - 1 add dstUq, wq add dstVq, wq - lea srcq, [srcq+wq*4] + lea srcq, [srcq+wq*2] neg wq pcmpeqb m7, m7 psrlw m7, 8 ; (word) { 0x00ff } x4 mova m6, [rgb_UVrnd] .loop: ; FIXME check alignment and use mova - movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] @@ -405,26 +451,48 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] paddd m3, m6 ; += rgb_UVrnd paddd m5, m6 ; += rgb_UVrnd - psrad m0, 15 + psrad m0, 9 paddd m1, m3 ; (dword) { V[4-7] } paddd m4, m5 ; (dword) { U[4-7] } - psrad m2, 15 - psrad m4, 15 - psrad m1, 15 + psrad m2, 9 + psrad m4, 9 + psrad m1, 9 packssdw m0, m4 ; (word) { U[0-7] } packssdw m2, m1 ; (word) { V[0-7] } %if mmsize == 8 - packuswb m0, m0 ; (byte) { U[0-7] } - packuswb m2, m2 ; (byte) { V[0-7] } - movh [dstUq+wq], m0 - movh [dstVq+wq], m2 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %else ; mmsize == 16 - packuswb m0, m2 ; (byte) { U[0-7], V[0-7] } - movh [dstUq+wq], m0 - movhps [dstVq+wq], m0 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %endif ; mmsize == 8/16 - add wq, mmsize / 2 + add wq, mmsize jl .loop + sub wq, mmsize - 1 + jz .end + add srcq , 2*mmsize - 2 + add dstUq, mmsize - 1 + add dstVq, mmsize - 1 +.loop2: + movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3] + DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] + pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] + pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] + pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] + pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] + paddd m3, m6 ; += rgb_UVrnd + paddd m1, m6 ; += rgb_UVrnd + paddd m2, m3 ; (dword) { V[0-3] } + paddd m0, m1 ; (dword) { U[0-3] } + psrad m0, 9 + psrad m2, 9 + packssdw m0, m0 ; (word) { U[0-7] } + packssdw m2, m2 ; (word) { V[0-7] } + movd [dstUq+wq], m0 + movd [dstVq+wq], m2 + add wq, 2 + jl .loop2 +.end: REP_RET %endif ; ARCH_X86_64 && %0 == 3 %endmacro @@ -451,8 +519,10 @@ RGB32_FUNCS 0, 0 INIT_XMM sse2 RGB32_FUNCS 8, 12 +%if HAVE_AVX_EXTERNAL INIT_XMM avx RGB32_FUNCS 8, 12 +%endif ;----------------------------------------------------------------------------- ; YUYV/UYVY/NV12/NV21 packed pixel shuffling. @@ -489,7 +559,7 @@ RGB32_FUNCS 8, 12 ; will be the same (i.e. YUYV+AVX), and thus we don't need to ; split the loop in an aligned and unaligned case %macro YUYV_TO_Y_FN 2-3 -cglobal %2ToY, 3, 3, %1, dst, src, w +cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w %if ARCH_X86_64 movsxd wq, wd %endif @@ -559,11 +629,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w ; will be the same (i.e. UYVY+AVX), and thus we don't need to ; split the loop in an aligned and unaligned case %macro YUYV_TO_UV_FN 2-3 -cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif add dstUq, wq add dstVq, wq @@ -593,8 +663,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w .loop_%1: mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... } mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... } - pand m2, m0, m4 ; (word) { U0, U1, ..., U7 } - pand m3, m1, m4 ; (word) { U8, U9, ..., U15 } + pand m2, m0, m5 ; (word) { U0, U1, ..., U7 } + pand m3, m1, m5 ; (word) { U8, U9, ..., U15 } psrlw m0, 8 ; (word) { V0, V1, ..., V7 } psrlw m1, 8 ; (word) { V8, V9, ..., V15 } packuswb m2, m3 ; (byte) { U0, ..., U15 } @@ -614,11 +684,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w ; %1 = nr. of XMM registers ; %2 = nv12 or nv21 %macro NVXX_TO_UV_FN 2 -cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif add dstUq, wq add dstVq, wq @@ -626,8 +696,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w test srcq, 15 %endif lea srcq, [srcq+wq*2] - pcmpeqb m4, m4 ; (byte) { 0xff } x 16 - psrlw m4, 8 ; (word) { 0x00ff } x 8 + pcmpeqb m5, m5 ; (byte) { 0xff } x 16 + psrlw m5, 8 ; (word) { 0x00ff } x 8 %if mmsize == 16 jnz .loop_u_start neg wq @@ -659,6 +729,7 @@ YUYV_TO_UV_FN 3, uyvy NVXX_TO_UV_FN 5, nv12 NVXX_TO_UV_FN 5, nv21 +%if HAVE_AVX_EXTERNAL INIT_XMM avx ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but ; that's not faster in practice @@ -666,3 +737,4 @@ YUYV_TO_UV_FN 3, yuyv YUYV_TO_UV_FN 3, uyvy, 1 NVXX_TO_UV_FN 5, nv12 NVXX_TO_UV_FN 5, nv21 +%endif diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index e1ceded..9ea4af9 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* Kieran Kunhya <kieran@kunhya.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -264,10 +264,12 @@ yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 yuv2planeX_fn 16, 8, 5 +%if HAVE_AVX_EXTERNAL INIT_XMM avx yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 +%endif ; %1=outout-bpc, %2=alignment (u/a) %macro yuv2plane1_mainloop 2 @@ -402,8 +404,10 @@ yuv2plane1_fn 16, 6, 3 INIT_XMM sse4 yuv2plane1_fn 16, 5, 3 +%if HAVE_AVX_EXTERNAL INIT_XMM avx yuv2plane1_fn 8, 5, 5 yuv2plane1_fn 9, 5, 3 yuv2plane1_fn 10, 5, 3 yuv2plane1_fn 16, 5, 3 +%endif diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 9cfe831..b80e869 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -6,20 +6,20 @@ * Written by Nick Kurshev. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -72,8 +72,14 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; +DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; + +DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset); +DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111); +DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); -#define RGB2YUV_SHIFT 8 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) @@ -125,6 +131,7 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; #undef COMPILE_TEMPLATE_AMD3DNOW #define COMPILE_TEMPLATE_MMXEXT 0 #define COMPILE_TEMPLATE_SSE2 0 +#define COMPILE_TEMPLATE_AVX 0 #define COMPILE_TEMPLATE_AMD3DNOW 1 #define RENAME(a) a ## _3dnow #include "rgb2rgb_template.c" diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c index 5d34c21..3899d0a 100644 --- a/libswscale/x86/rgb2rgb_template.c +++ b/libswscale/x86/rgb2rgb_template.c @@ -7,20 +7,20 @@ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * lot of big-endian byte order fixes by Alex Beregszaszi * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -131,14 +131,11 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int sr "movq %%mm4, %%mm3 \n\t" \ "psllq $48, %%mm2 \n\t" \ "psllq $32, %%mm3 \n\t" \ - "pand "MANGLE(mask24hh)", %%mm2\n\t" \ - "pand "MANGLE(mask24hhh)", %%mm3\n\t" \ "por %%mm2, %%mm0 \n\t" \ "psrlq $16, %%mm1 \n\t" \ "psrlq $32, %%mm4 \n\t" \ "psllq $16, %%mm5 \n\t" \ "por %%mm3, %%mm1 \n\t" \ - "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \ "por %%mm5, %%mm4 \n\t" \ \ MOVNTQ" %%mm0, (%0) \n\t" \ @@ -168,6 +165,7 @@ static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int sr "movq %%mm5, %%mm7 \n\t" STORE_BGR24_MMX :: "r"(dest), "r"(s) + NAMED_CONSTRAINTS_ADD(mask24l,mask24h) :"memory"); dest += 24; s += 32; @@ -717,27 +715,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s } } -/* - I use less accurate approximation here by simply left-shifting the input - value and filling the low order bits with zeroes. This method improves PNG - compression but this scheme cannot reproduce white exactly, since it does - not generate an all-ones maximum value; the net effect is to darken the - image slightly. - - The better method should be "left bit replication": - - 4 3 2 1 0 - --------- - 1 1 0 1 1 - - 7 6 5 4 3 2 1 0 - ---------------- - 1 1 0 1 1 1 1 0 - |=======| |===| - | leftmost bits repeated to fill open bits - | - original bits -*/ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) { const uint16_t *end; @@ -756,9 +733,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -786,9 +764,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -809,6 +788,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr :"=m"(*d) :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) + NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi) :"memory"); /* borrowed 32 to 24 */ __asm__ volatile( @@ -825,6 +805,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr STORE_BGR24_MMX :: "r"(d), "m"(*s) + NAMED_CONSTRAINTS_ADD(mask24l,mask24h) :"memory"); d += 24; s += 8; @@ -834,9 +815,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); } } @@ -858,9 +839,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -888,9 +871,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -910,6 +895,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "por %%mm5, %%mm3 \n\t" :"=m"(*d) :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) + NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi) :"memory"); /* borrowed 32 to 24 */ __asm__ volatile( @@ -926,6 +912,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr STORE_BGR24_MMX :: "r"(d), "m"(*s) + NAMED_CONSTRAINTS_ADD(mask24l,mask24h) :"memory"); d += 24; s += 8; @@ -935,9 +922,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); } } @@ -980,11 +967,13 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw %5, %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) + ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) + NAMED_CONSTRAINTS_ADD(mul15_hi) :"memory"); d += 16; s += 4; @@ -994,9 +983,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); *d++ = 255; } } @@ -1021,11 +1010,14 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r) + ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) + NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi) :"memory"); d += 16; s += 4; @@ -1035,9 +1027,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); *d++ = 255; } } @@ -1150,6 +1142,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int sr "2: \n\t" : "+a" (mmx_size) : "r" (src-mmx_size), "r"(dst-mmx_size) + NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b) ); __asm__ volatile(SFENCE:::"memory"); @@ -1485,6 +1478,7 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), "g" (-mmxSize) + NAMED_CONSTRAINTS_ADD(mmx_ff) : "%"REG_a ); @@ -1629,10 +1623,15 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t * others are ignored in the C version. * FIXME: Write HQ version. */ +#if HAVE_7REGS static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, - int lumStride, int chromStride, int srcStride) + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv) { +#define BGR2Y_IDX "16*4+16*32" +#define BGR2U_IDX "16*4+16*33" +#define BGR2V_IDX "16*4+16*34" int y; const x86_reg chromWidth= width>>1; for (y=0; y<height-2; y+=2) { @@ -1640,7 +1639,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ for (i=0; i<2; i++) { __asm__ volatile( "mov %2, %%"REG_a" \n\t" - "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" + "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" "movq "MANGLE(ff_w1111)", %%mm5 \n\t" "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" @@ -1659,12 +1658,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "pmaddwd %%mm6, %%mm1 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm0 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm1, %%mm0 \n\t" "packssdw %%mm3, %%mm2 \n\t" "pmaddwd %%mm5, %%mm0 \n\t" @@ -1684,12 +1681,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "pmaddwd %%mm6, %%mm1 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm4 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm1, %%mm4 \n\t" "packssdw %%mm3, %%mm2 \n\t" "pmaddwd %%mm5, %%mm4 \n\t" @@ -1704,7 +1699,8 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" "add $8, %%"REG_a" \n\t" " js 1b \n\t" - : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) + : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) + NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset) : "%"REG_a, "%"REG_d ); ydst += lumStride; @@ -1714,7 +1710,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ __asm__ volatile( "mov %4, %%"REG_a" \n\t" "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" + "movq "BGR2U_IDX"(%5), %%mm6 \n\t" "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" "add %%"REG_d", %%"REG_d" \n\t" @@ -1763,19 +1759,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "psrlw $2, %%mm0 \n\t" "psrlw $2, %%mm2 \n\t" #endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + "movq "BGR2V_IDX"(%5), %%mm1 \n\t" + "movq "BGR2V_IDX"(%5), %%mm3 \n\t" "pmaddwd %%mm0, %%mm1 \n\t" "pmaddwd %%mm2, %%mm3 \n\t" "pmaddwd %%mm6, %%mm0 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm0 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm2, %%mm0 \n\t" "packssdw %%mm3, %%mm1 \n\t" "pmaddwd %%mm5, %%mm0 \n\t" @@ -1825,19 +1819,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "psrlw $2, %%mm4 \n\t" "psrlw $2, %%mm2 \n\t" #endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + "movq "BGR2V_IDX"(%5), %%mm1 \n\t" + "movq "BGR2V_IDX"(%5), %%mm3 \n\t" "pmaddwd %%mm4, %%mm1 \n\t" "pmaddwd %%mm2, %%mm3 \n\t" "pmaddwd %%mm6, %%mm4 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm4 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm2, %%mm4 \n\t" "packssdw %%mm3, %%mm1 \n\t" "pmaddwd %%mm5, %%mm4 \n\t" @@ -1856,7 +1848,8 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "movd %%mm0, (%3, %%"REG_a") \n\t" "add $4, %%"REG_a" \n\t" " js 1b \n\t" - : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) + : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) + NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset) : "%"REG_a, "%"REG_d ); @@ -1869,8 +1862,9 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ SFENCE" \n\t" :::"memory"); - rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride); + ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); } +#endif /* HAVE_7REGS */ #endif /* !COMPILE_TEMPLATE_SSE2 */ #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX @@ -1945,9 +1939,13 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui } #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */ +#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src, const uint8_t *unused, int w, + const uint8_t *unused, + const uint8_t *src1, + const uint8_t *src2, + int w, uint32_t *unused2); static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, @@ -1956,7 +1954,7 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t int h; for (h = 0; h < height; h++) { - RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL); + RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL); src += srcStride; dst1 += dst1Stride; dst2 += dst2Stride; @@ -1968,6 +1966,7 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t ); } #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ +#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */ #if !COMPILE_TEMPLATE_SSE2 #if !COMPILE_TEMPLATE_AMD3DNOW @@ -2187,6 +2186,44 @@ static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count } } +static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count) +{ + src ++; + dst += count; + src += 2*count; + count= - count; + + if(count < -16) { + count += 16; + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" + "1: \n\t" + "movq -32(%1, %0, 2), %%mm0 \n\t" + "movq -24(%1, %0, 2), %%mm1 \n\t" + "movq -16(%1, %0, 2), %%mm2 \n\t" + "movq -8(%1, %0, 2), %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + MOVNTQ" %%mm0,-16(%2, %0) \n\t" + MOVNTQ" %%mm2,- 8(%2, %0) \n\t" + "add $16, %0 \n\t" + " js 1b \n\t" + : "+r"(count) + : "r"(src), "r"(dst) + ); + count -= 16; + } + while(count<0) { + dst[count]= src[2*count]; + count++; + } +} + #if !COMPILE_TEMPLATE_AMD3DNOW static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) { @@ -2397,7 +2434,7 @@ static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src, ydst, width); @@ -2423,7 +2460,7 @@ static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src, ydst, width); @@ -2447,10 +2484,10 @@ static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { - RENAME(extract_even)(src+1, ydst, width); + RENAME(extract_odd)(src, ydst, width); if(y&1) { RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth); udst+= chromStride; @@ -2473,10 +2510,10 @@ static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { - RENAME(extract_even)(src+1, ydst, width); + RENAME(extract_odd)(src, ydst, width); RENAME(extract_even2)(src, udst, vdst, chromWidth); src += srcStride; @@ -2529,7 +2566,9 @@ static av_cold void RENAME(rgb2rgb_init)(void) #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW planar2x = RENAME(planar2x); #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */ - rgb24toyv12 = RENAME(rgb24toyv12); +#if HAVE_7REGS + ff_rgb24toyv12 = RENAME(rgb24toyv12); +#endif /* HAVE_7REGS */ yuyvtoyuv420 = RENAME(yuyvtoyuv420); uyvytoyuv420 = RENAME(uyvytoyuv420); @@ -2538,7 +2577,9 @@ static av_cold void RENAME(rgb2rgb_init)(void) #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX interleaveBytes = RENAME(interleaveBytes); #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */ +#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM deinterleaveBytes = RENAME(deinterleaveBytes); #endif +#endif } diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 440a27b..7af92f7 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -2,20 +2,20 @@ ;* x86-optimized horizontal line scaling functions ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -407,11 +407,15 @@ SCALE_FUNC %1, %2, X, X8, 7, %3 SCALE_FUNCS 8, 15, %1 SCALE_FUNCS 9, 15, %2 SCALE_FUNCS 10, 15, %2 +SCALE_FUNCS 12, 15, %2 +SCALE_FUNCS 14, 15, %2 SCALE_FUNCS 16, 15, %3 %endif ; !sse4 SCALE_FUNCS 8, 19, %1 SCALE_FUNCS 9, 19, %2 SCALE_FUNCS 10, 19, %2 +SCALE_FUNCS 12, 19, %2 +SCALE_FUNCS 14, 19, %2 SCALE_FUNCS 16, 19, %3 %endmacro @@ -420,7 +424,7 @@ INIT_MMX mmx SCALE_FUNCS2 0, 0, 0 %endif INIT_XMM sse2 -SCALE_FUNCS2 6, 7, 8 +SCALE_FUNCS2 7, 6, 8 INIT_XMM ssse3 SCALE_FUNCS2 6, 6, 8 INIT_XMM sse4 diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index f310a75..c4c0e28 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,6 +23,7 @@ #include "libswscale/swscale.h" #include "libswscale/swscale_internal.h" #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/intreadwrite.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -57,19 +58,11 @@ DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL; DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL; DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL; -#ifdef FAST_BGR2YV12 -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL; -#else -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL; -#endif /* FAST_BGR2YV12 */ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; + //MMX versions #if HAVE_MMX_INLINE #undef RENAME @@ -117,9 +110,9 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI c->greenDither= ff_dither4[dstY&1]; c->redDither= ff_dither8[(dstY+1)&1]; if (dstY < dstH - 2) { - const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; - const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; - const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; + const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; + const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; + const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; int i; if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) { @@ -186,7 +179,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i]; lumMmxFilter[4*i+2]= lumMmxFilter[4*i+3]= - ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; + ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U; if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i]; alpMmxFilter[4*i+2]= @@ -197,12 +190,85 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i]; chrMmxFilter[4*i+2]= chrMmxFilter[4*i+3]= - ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; + ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U; } } } } +#if HAVE_MMXEXT +static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + if(((int)dest) & 15){ + yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); + return; + } + if (offset) { + __asm__ volatile("movq (%0), %%xmm3\n\t" + "movdqa %%xmm3, %%xmm4\n\t" + "psrlq $24, %%xmm3\n\t" + "psllq $40, %%xmm4\n\t" + "por %%xmm4, %%xmm3\n\t" + :: "r"(dither) + ); + } else { + __asm__ volatile("movq (%0), %%xmm3\n\t" + :: "r"(dither) + ); + } + filterSize--; + __asm__ volatile( + "pxor %%xmm0, %%xmm0\n\t" + "punpcklbw %%xmm0, %%xmm3\n\t" + "movd %0, %%xmm1\n\t" + "punpcklwd %%xmm1, %%xmm1\n\t" + "punpckldq %%xmm1, %%xmm1\n\t" + "punpcklqdq %%xmm1, %%xmm1\n\t" + "psllw $3, %%xmm1\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "psraw $4, %%xmm3\n\t" + ::"m"(filterSize) + ); + __asm__ volatile( + "movdqa %%xmm3, %%xmm4\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\ + "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\ + "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%xmm0, %%xmm2 \n\t"\ + "pmulhw %%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm2, %%xmm3 \n\t"\ + "paddw %%xmm5, %%xmm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%xmm3 \n\t"\ + "psraw $3, %%xmm4 \n\t"\ + "packuswb %%xmm4, %%xmm3 \n\t" + "movntdq %%xmm3, (%1, %%"REG_c")\n\t" + "add $16, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movdqa %%xmm7, %%xmm3\n\t" + "movdqa %%xmm7, %%xmm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) + "%"REG_d, "%"REG_S, "%"REG_c + ); +} +#endif + #endif /* HAVE_INLINE_ASM */ #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ @@ -216,10 +282,14 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ SCALE_FUNC(filter_n, 8, 15, opt); \ SCALE_FUNC(filter_n, 9, 15, opt); \ SCALE_FUNC(filter_n, 10, 15, opt); \ + SCALE_FUNC(filter_n, 12, 15, opt); \ + SCALE_FUNC(filter_n, 14, 15, opt); \ SCALE_FUNC(filter_n, 16, 15, opt); \ SCALE_FUNC(filter_n, 8, 19, opt); \ SCALE_FUNC(filter_n, 9, 19, opt); \ SCALE_FUNC(filter_n, 10, 19, opt); \ + SCALE_FUNC(filter_n, 12, 19, opt); \ + SCALE_FUNC(filter_n, 14, 19, opt); \ SCALE_FUNC(filter_n, 16, 19, opt) #define SCALE_FUNCS_MMX(opt) \ @@ -275,11 +345,14 @@ VSCALE_FUNCS(avx, avx); #define INPUT_Y_FUNC(fmt, opt) \ void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \ + const uint8_t *unused1, const uint8_t *unused2, \ int w, uint32_t *unused) #define INPUT_UV_FUNC(fmt, opt) \ void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ - const uint8_t *src, const uint8_t *unused1, \ - int w, uint32_t *unused2) + const uint8_t *unused0, \ + const uint8_t *src1, \ + const uint8_t *src2, \ + int w, uint32_t *unused) #define INPUT_FUNC(fmt, opt) \ INPUT_Y_FUNC(fmt, opt); \ INPUT_UV_FUNC(fmt, opt) @@ -313,20 +386,31 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) #if HAVE_MMXEXT_INLINE if (INLINE_MMXEXT(cpu_flags)) sws_init_swscale_mmxext(c); + if (cpu_flags & AV_CPU_FLAG_SSE3){ + if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) + c->yuv2planeX = yuv2yuvX_sse3; + } #endif #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \ if (c->srcBpc == 8) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ ff_hscale8to19_ ## filtersize ## _ ## opt1; \ } else if (c->srcBpc == 9) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ ff_hscale9to19_ ## filtersize ## _ ## opt1; \ } else if (c->srcBpc == 10) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ ff_hscale10to19_ ## filtersize ## _ ## opt1; \ - } else /* c->srcBpc == 16 */ { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ + } else if (c->srcBpc == 12) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale12to19_ ## filtersize ## _ ## opt1; \ + } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale14to19_ ## filtersize ## _ ## opt1; \ + } else { /* c->srcBpc == 16 */ \ + av_assert0(c->srcBpc == 16);\ + hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ ff_hscale16to19_ ## filtersize ## _ ## opt1; \ } \ } while (0) @@ -341,14 +425,15 @@ switch(c->dstBpc){ \ case 16: do_16_case; break; \ case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \ case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \ - default: if (condition_8bit) vscalefn = ff_yuv2planeX_8_ ## opt; break; \ + default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \ } #define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \ switch(c->dstBpc){ \ case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \ case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \ case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \ - default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + default: av_assert0(c->dstBpc>8); \ } #define case_rgb(x, X, opt) \ case AV_PIX_FMT_ ## X: \ diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 1e42ec5..36a606c 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -25,21 +25,101 @@ #undef REAL_MOVNTQ #undef MOVNTQ +#undef MOVNTQ2 #undef PREFETCH -#if COMPILE_TEMPLATE_MMXEXT -#define PREFETCH "prefetchnta" -#else -#define PREFETCH " # nop" -#endif #if COMPILE_TEMPLATE_MMXEXT #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" +#define MOVNTQ2 "movntq " #else #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" +#define MOVNTQ2 "movq " #endif #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) +#if !COMPILE_TEMPLATE_MMXEXT +static av_always_inline void +dither_8to16(const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $24, %%mm3\n\t" + "psllq $40, %%mm4\n\t" + "por %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } +} +#endif + +static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + dither_8to16(dither, offset); + filterSize--; + __asm__ volatile( + "movd %0, %%mm1\n\t" + "punpcklwd %%mm1, %%mm1\n\t" + "punpckldq %%mm1, %%mm1\n\t" + "psllw $3, %%mm1\n\t" + "paddw %%mm1, %%mm3\n\t" + "paddw %%mm1, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + ::"m"(filterSize) + ); + + __asm__ volatile(\ + "movq %%mm3, %%mm6\n\t" + "movq %%mm4, %%mm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ + "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%mm3 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm3 \n\t" + MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" + "add $8, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movq %%mm6, %%mm3\n\t" + "movq %%mm7, %%mm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); +} + #define YSCALEYUV2PACKEDX_UV \ __asm__ volatile(\ "xor %%"REG_a", %%"REG_a" \n\t"\ @@ -92,6 +172,7 @@ :: "r" (&c->redDither), \ "m" (dummy), "m" (dummy), "m" (dummy),\ "r" (dest), "m" (dstW_reg), "m"(uv_off) \ + NAMED_CONSTRAINTS_ADD(bF8,bFC) \ : "%"REG_a, "%"REG_d, "%"REG_S \ ); @@ -252,7 +333,7 @@ MOVNTQ( q3, 24(dst, index, 4))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) @@ -265,7 +346,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE @@ -278,13 +359,13 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) YSCALEYUV2PACKEDX_END } else { YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } } @@ -298,7 +379,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX @@ -307,13 +388,13 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } else { YSCALEYUV2PACKEDX YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } } @@ -342,7 +423,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm1, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) @@ -355,7 +436,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -366,7 +447,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" "paddusb "RED_DITHER"(%0), %%mm5\n\t" #endif - WRITERGB16(%4, %5, %%REGa) + WRITERGB16(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -379,7 +460,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -390,7 +471,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" "paddusb "RED_DITHER"(%0), %%mm5 \n\t" #endif - WRITERGB16(%4, %5, %%REGa) + WRITERGB16(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -419,7 +500,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm1, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) @@ -432,7 +513,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -443,7 +524,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" "paddusb "RED_DITHER"(%0), %%mm5\n\t" #endif - WRITERGB15(%4, %5, %%REGa) + WRITERGB15(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -456,7 +537,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -467,7 +548,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" "paddusb "RED_DITHER"(%0), %%mm5 \n\t" #endif - WRITERGB15(%4, %5, %%REGa) + WRITERGB15(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -521,7 +602,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "add $24, "#dst" \n\t"\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEBGR24MMXEXT(dst, dstw, index) \ @@ -569,7 +650,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "add $24, "#dst" \n\t"\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #if COMPILE_TEMPLATE_MMXEXT @@ -580,6 +661,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) #endif +#if HAVE_6REGS static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, @@ -589,17 +671,18 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) + WRITEBGR24(%%REGc, "%5", %%REGa) :: "r" (&c->redDither), "m" (dummy), "m" (dummy), "m" (dummy), "r" (dest), "m" (dstW_reg), "m"(uv_off) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S ); } @@ -613,20 +696,22 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) + WRITEBGR24(%%REGc, "%5", %%REGa) :: "r" (&c->redDither), "m" (dummy), "m" (dummy), "m" (dummy), "r" (dest), "m" (dstW_reg), "m"(uv_off) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S ); } +#endif /* HAVE_6REGS */ #define REAL_WRITEYUY2(dst, dstw, index) \ "packuswb %%mm3, %%mm3 \n\t"\ @@ -641,7 +726,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm7, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) @@ -654,7 +739,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -662,7 +747,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm4 \n\t" "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) + WRITEYUY2(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -675,7 +760,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -683,7 +768,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm4 \n\t" "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) + WRITEYUY2(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -784,15 +869,15 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), "a" (&c->redDither), "r" (abuf0), "r" (abuf1) : "%r8" ); #else - *(const uint16_t **)(&c->u_temp)=abuf0; - *(const uint16_t **)(&c->v_temp)=abuf1; + c->u_temp=(intptr_t)abuf0; + c->v_temp=(intptr_t)abuf1; __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -808,7 +893,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "packuswb %%mm7, %%mm1 \n\t" "pop %1 \n\t" "pop %0 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -822,7 +907,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "push %%"REG_BP" \n\t" YSCALEYUV2RGB(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -839,18 +924,18 @@ static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2RGB(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } @@ -862,7 +947,6 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -875,11 +959,12 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } @@ -891,7 +976,6 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -904,11 +988,12 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } @@ -960,13 +1045,12 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1109,7 +1193,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1122,7 +1206,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1138,7 +1222,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1151,7 +1235,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1177,11 +1261,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1191,11 +1276,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } } @@ -1222,11 +1308,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1242,11 +1329,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } } @@ -1273,11 +1361,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1293,11 +1382,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } } @@ -1354,7 +1444,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED1(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1367,7 +1457,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED1b(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1375,203 +1465,20 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, ); } } - -#if COMPILE_TEMPLATE_MMXEXT -static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, - int dstWidth, const uint8_t *src, - int srcW, int xInc) -{ - int32_t *filterPos = c->hLumFilterPos; - int16_t *filter = c->hLumFilter; - void *mmxextFilterCode = c->lumMmxextFilterCode; - int i; -#if defined(PIC) - uint64_t ebxsave; -#endif -#if ARCH_X86_64 - uint64_t retsave; -#endif - - __asm__ volatile( -#if defined(PIC) - "mov %%"REG_b", %5 \n\t" -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %6 \n\t" -#endif -#else -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %5 \n\t" -#endif -#endif - "pxor %%mm7, %%mm7 \n\t" - "mov %0, %%"REG_c" \n\t" - "mov %1, %%"REG_D" \n\t" - "mov %2, %%"REG_d" \n\t" - "mov %3, %%"REG_b" \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" // i - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - -#if ARCH_X86_64 -#define CALL_MMXEXT_FILTER_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ - "add %%"REG_S", %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#else -#define CALL_MMXEXT_FILTER_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#endif /* ARCH_X86_64 */ - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - -#if defined(PIC) - "mov %5, %%"REG_b" \n\t" -#if ARCH_X86_64 - "mov %6, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#else -#if ARCH_X86_64 - "mov %5, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#endif - :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), - "m" (mmxextFilterCode) -#if defined(PIC) - ,"m" (ebxsave) -#endif -#if ARCH_X86_64 - ,"m"(retsave) -#endif - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D -#if !defined(PIC) - ,"%"REG_b -#endif - ); - - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) - dst[i] = src[srcW-1]*128; -} - -static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, - int dstWidth, const uint8_t *src1, - const uint8_t *src2, int srcW, int xInc) -{ - int32_t *filterPos = c->hChrFilterPos; - int16_t *filter = c->hChrFilter; - void *mmxextFilterCode = c->chrMmxextFilterCode; - int i; -#if defined(PIC) - DECLARE_ALIGNED(8, uint64_t, ebxsave); -#endif -#if ARCH_X86_64 - DECLARE_ALIGNED(8, uint64_t, retsave); -#endif - - __asm__ volatile( -#if defined(PIC) - "mov %%"REG_b", %7 \n\t" -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %8 \n\t" -#endif -#else -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %7 \n\t" -#endif -#endif - "pxor %%mm7, %%mm7 \n\t" - "mov %0, %%"REG_c" \n\t" - "mov %1, %%"REG_D" \n\t" - "mov %2, %%"REG_d" \n\t" - "mov %3, %%"REG_b" \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" // i - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - "xor %%"REG_a", %%"REG_a" \n\t" // i - "mov %5, %%"REG_c" \n\t" // src - "mov %6, %%"REG_D" \n\t" // buf2 - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - -#if defined(PIC) - "mov %7, %%"REG_b" \n\t" -#if ARCH_X86_64 - "mov %8, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#else -#if ARCH_X86_64 - "mov %7, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#endif - :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), - "m" (mmxextFilterCode), "m" (src2), "m"(dst2) -#if defined(PIC) - ,"m" (ebxsave) -#endif -#if ARCH_X86_64 - ,"m"(retsave) -#endif - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D -#if !defined(PIC) - ,"%"REG_b -#endif - ); - - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { - dst1[i] = src1[srcW-1]*128; - dst2[i] = src2[srcW-1]*128; - } -} -#endif /* COMPILE_TEMPLATE_MMXEXT */ - static av_cold void RENAME(sws_init_swscale)(SwsContext *c) { enum AVPixelFormat dstFormat = c->dstFormat; - if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && - dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) { - if (!(c->flags & SWS_BITEXACT)) { + c->use_mmx_vfilter= 0; + if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 + && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { if (c->flags & SWS_ACCURATE_RND) { if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; +#if HAVE_6REGS case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; +#endif case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; @@ -1579,10 +1486,14 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } else { + c->use_mmx_vfilter= 1; + c->yuv2planeX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; +#if HAVE_6REGS case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; +#endif case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; @@ -1590,7 +1501,6 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } - } if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: @@ -1619,12 +1529,12 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } - if (c->srcBpc == 8 && c->dstBpc <= 10) { + if (c->srcBpc == 8 && c->dstBpc <= 14) { // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). #if COMPILE_TEMPLATE_MMXEXT if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { - c->hyscale_fast = RENAME(hyscale_fast); - c->hcscale_fast = RENAME(hcscale_fast); + c->hyscale_fast = ff_hyscale_fast_mmxext; + c->hcscale_fast = ff_hcscale_fast_mmxext; } else { #endif /* COMPILE_TEMPLATE_MMXEXT */ c->hyscale_fast = NULL; diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c index dd9a2a4..88143d9 100644 --- a/libswscale/x86/w64xmmtest.c +++ b/libswscale/x86/w64xmmtest.c @@ -2,20 +2,20 @@ * check XMM registers for clobbers on Win64 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c index bacc87f..5e2f77c 100644 --- a/libswscale/x86/yuv2rgb.c +++ b/libswscale/x86/yuv2rgb.c @@ -7,27 +7,26 @@ * 1,4,8bpp support and context / deglobalize stuff * by Michael Niedermayer (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdio.h> #include <stdlib.h> #include <inttypes.h> -#include <assert.h> #include "config.h" #include "libswscale/rgb2rgb.h" @@ -51,34 +50,30 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL; DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL; //MMX versions -#if HAVE_MMX_INLINE +#if HAVE_MMX_INLINE && HAVE_6REGS #undef RENAME #undef COMPILE_TEMPLATE_MMXEXT #define COMPILE_TEMPLATE_MMXEXT 0 #define RENAME(a) a ## _mmx #include "yuv2rgb_template.c" -#endif /* HAVE_MMX_INLINE */ +#endif /* HAVE_MMX_INLINE && HAVE_6REGS */ // MMXEXT versions -#if HAVE_MMXEXT_INLINE +#if HAVE_MMXEXT_INLINE && HAVE_6REGS #undef RENAME #undef COMPILE_TEMPLATE_MMXEXT #define COMPILE_TEMPLATE_MMXEXT 1 #define RENAME(a) a ## _mmxext #include "yuv2rgb_template.c" -#endif /* HAVE_MMXEXT_INLINE */ +#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */ #endif /* HAVE_INLINE_ASM */ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) { -#if HAVE_MMX_INLINE +#if HAVE_MMX_INLINE && HAVE_6REGS int cpu_flags = av_get_cpu_flags(); - if (c->srcFormat != AV_PIX_FMT_YUV420P && - c->srcFormat != AV_PIX_FMT_YUVA420P) - return NULL; - #if HAVE_MMXEXT_INLINE if (INLINE_MMXEXT(cpu_flags)) { switch (c->dstFormat) { @@ -118,7 +113,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) return yuv420_rgb15_mmx; } } -#endif /* HAVE_MMX_INLINE */ +#endif /* HAVE_MMX_INLINE && HAVE_6REGS */ return NULL; } diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c index 0b97516..acb78f5 100644 --- a/libswscale/x86/yuv2rgb_template.c +++ b/libswscale/x86/yuv2rgb_template.c @@ -4,20 +4,20 @@ * Copyright (C) 2001-2007 Michael Niedermayer * (c) 2010 Konstantin Shishkov * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -48,17 +48,14 @@ if (h_size * depth > FFABS(dstStride[0])) \ h_size -= 8; \ \ - if (c->srcFormat == AV_PIX_FMT_YUV422P) { \ - srcStride[1] *= 2; \ - srcStride[2] *= 2; \ - } \ + vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \ \ __asm__ volatile ("pxor %mm4, %mm4\n\t"); \ for (y = 0; y < srcSliceH; y++) { \ uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \ const uint8_t *py = src[0] + y * srcStride[0]; \ - const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \ - const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \ + const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \ + const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \ x86_reg index = -h_size / 2; \ #define YUV2RGB_INITIAL_LOAD \ @@ -142,10 +139,21 @@ "add $4, %0\n\t" \ "js 1b\n\t" \ +#if COMPILE_TEMPLATE_MMXEXT +#undef RGB_PACK24_B_OPERANDS +#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask0010,mask1001) +#else +#undef RGB_PACK24_B_OPERANDS +#define RGB_PACK24_B_OPERANDS +#endif + #define YUV2RGB_OPERANDS \ : "+r" (index), "+r" (image) \ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ "r" (py - 2*index) \ + NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e0) \ + RGB_PACK24_B_OPERANDS \ + : "memory" \ ); \ } \ @@ -153,6 +161,8 @@ : "+r" (index), "+r" (image) \ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ "r" (py - 2*index), "r" (pa - 2*index) \ + NAMED_CONSTRAINTS_ADD(mmx_00ffw) \ + : "memory" \ ); \ } \ @@ -193,7 +203,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(2) @@ -221,7 +231,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(2) @@ -311,7 +321,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(3) @@ -329,7 +339,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(3) @@ -373,7 +383,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -394,7 +404,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -416,7 +426,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -437,7 +447,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c index 480fbe3..8e92e6d 100644 --- a/libswscale/yuv2rgb.c +++ b/libswscale/yuv2rgb.c @@ -6,27 +6,26 @@ * 1,4,8bpp support and context / deglobalize stuff * by Michael Niedermayer (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdio.h> #include <stdlib.h> #include <inttypes.h> -#include <assert.h> #include "libavutil/cpu.h" #include "libavutil/bswap.h" @@ -34,6 +33,7 @@ #include "rgb2rgb.h" #include "swscale.h" #include "swscale_internal.h" +#include "libavutil/pixdesc.h" const int32_t ff_yuv2rgb_coeffs[8][4] = { { 117504, 138453, 13954, 34903 }, /* no sequence_display_extension */ @@ -56,9 +56,9 @@ const int *sws_getCoefficients(int colorspace) #define LOADCHROMA(i) \ U = pu[i]; \ V = pv[i]; \ - r = (void *)c->table_rV[V]; \ - g = (void *)(c->table_gU[U] + c->table_gV[V]); \ - b = (void *)c->table_bU[U]; + r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \ + g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \ + b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM]; #define PUTRGB(dst, src, i) \ Y = src[2 * i]; \ @@ -382,24 +382,65 @@ ENDYUV2RGBLINE(24, 1) PUTBGR24(dst_2, py_2, 0); ENDYUV2RGBFUNC() -// This is exactly the same code as yuv2rgb_c_32 except for the types of -// r, g, b, dst_1, dst_2 -YUV2RGBFUNC(yuv2rgb_c_16, uint16_t, 0) +YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0) + const uint8_t *d16 = ff_dither_2x2_8[y & 1]; + const uint8_t *e16 = ff_dither_2x2_4[y & 1]; + const uint8_t *f16 = ff_dither_2x2_8[(y & 1)^1]; + +#define PUTRGB16(dst, src, i, o) \ + Y = src[2 * i]; \ + dst[2 * i] = r[Y + d16[0 + o]] + \ + g[Y + e16[0 + o]] + \ + b[Y + f16[0 + o]]; \ + Y = src[2 * i + 1]; \ + dst[2 * i + 1] = r[Y + d16[1 + o]] + \ + g[Y + e16[1 + o]] + \ + b[Y + f16[1 + o]]; LOADCHROMA(0); - PUTRGB(dst_1, py_1, 0); - PUTRGB(dst_2, py_2, 0); + PUTRGB16(dst_1, py_1, 0, 0); + PUTRGB16(dst_2, py_2, 0, 0 + 8); LOADCHROMA(1); - PUTRGB(dst_2, py_2, 1); - PUTRGB(dst_1, py_1, 1); + PUTRGB16(dst_2, py_2, 1, 2 + 8); + PUTRGB16(dst_1, py_1, 1, 2); LOADCHROMA(2); - PUTRGB(dst_1, py_1, 2); - PUTRGB(dst_2, py_2, 2); + PUTRGB16(dst_1, py_1, 2, 4); + PUTRGB16(dst_2, py_2, 2, 4 + 8); LOADCHROMA(3); - PUTRGB(dst_2, py_2, 3); - PUTRGB(dst_1, py_1, 3); + PUTRGB16(dst_2, py_2, 3, 6 + 8); + PUTRGB16(dst_1, py_1, 3, 6); +CLOSEYUV2RGBFUNC(8) + +YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0) + const uint8_t *d16 = ff_dither_2x2_8[y & 1]; + const uint8_t *e16 = ff_dither_2x2_8[(y & 1)^1]; + +#define PUTRGB15(dst, src, i, o) \ + Y = src[2 * i]; \ + dst[2 * i] = r[Y + d16[0 + o]] + \ + g[Y + d16[1 + o]] + \ + b[Y + e16[0 + o]]; \ + Y = src[2 * i + 1]; \ + dst[2 * i + 1] = r[Y + d16[1 + o]] + \ + g[Y + d16[0 + o]] + \ + b[Y + e16[1 + o]]; + LOADCHROMA(0); + PUTRGB15(dst_1, py_1, 0, 0); + PUTRGB15(dst_2, py_2, 0, 0 + 8); + + LOADCHROMA(1); + PUTRGB15(dst_2, py_2, 1, 2 + 8); + PUTRGB15(dst_1, py_1, 1, 2); + + LOADCHROMA(2); + PUTRGB15(dst_1, py_1, 2, 4); + PUTRGB15(dst_2, py_2, 2, 4 + 8); + + LOADCHROMA(3); + PUTRGB15(dst_2, py_2, 3, 6 + 8); + PUTRGB15(dst_1, py_1, 3, 6); CLOSEYUV2RGBFUNC(8) // r, g, b, dst_1, dst_2 @@ -532,7 +573,7 @@ CLOSEYUV2RGBFUNC(8) YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0) const uint8_t *d128 = ff_dither_8x8_220[y & 7]; char out_1 = 0, out_2 = 0; - g = c->table_gU[128] + c->table_gV[128]; + g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM]; #define PUTRGB1(out, src, i, o) \ Y = src[2 * i]; \ @@ -570,7 +611,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c) av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found from %s to %s.\n", - sws_format_name(c->srcFormat), sws_format_name(c->dstFormat)); + av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat)); switch (c->dstFormat) { case AV_PIX_FMT_BGR48BE: @@ -581,23 +622,21 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c) return yuv2rgb_c_48; case AV_PIX_FMT_ARGB: case AV_PIX_FMT_ABGR: - if (CONFIG_SWSCALE_ALPHA && c->srcFormat == AV_PIX_FMT_YUVA420P) + if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) return yuva2argb_c; case AV_PIX_FMT_RGBA: case AV_PIX_FMT_BGRA: - if (CONFIG_SWSCALE_ALPHA && c->srcFormat == AV_PIX_FMT_YUVA420P) - return yuva2rgba_c; - else - return yuv2rgb_c_32; + return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32; case AV_PIX_FMT_RGB24: return yuv2rgb_c_24_rgb; case AV_PIX_FMT_BGR24: return yuv2rgb_c_24_bgr; case AV_PIX_FMT_RGB565: case AV_PIX_FMT_BGR565: + return yuv2rgb_c_16_ordered_dither; case AV_PIX_FMT_RGB555: case AV_PIX_FMT_BGR555: - return yuv2rgb_c_16; + return yuv2rgb_c_15_ordered_dither; case AV_PIX_FMT_RGB444: case AV_PIX_FMT_BGR444: return yuv2rgb_c_12_ordered_dither; @@ -612,36 +651,32 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c) return yuv2rgb_c_4b_ordered_dither; case AV_PIX_FMT_MONOBLACK: return yuv2rgb_c_1_ordered_dither; - default: - assert(0); } return NULL; } -static void fill_table(uint8_t *table[256], const int elemsize, - const int inc, void *y_tab) +static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, + const int64_t inc, void *y_tab) { int i; - int64_t cb = 0; uint8_t *y_table = y_tab; y_table -= elemsize * (inc >> 9); - for (i = 0; i < 256; i++) { + for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) { + int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc; table[i] = y_table + elemsize * (cb >> 16); - cb += inc; } } -static void fill_gv_table(int table[256], const int elemsize, const int inc) +static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc) { int i; - int64_t cb = 0; int off = -(inc >> 9); - for (i = 0; i < 256; i++) { + for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) { + int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc; table[i] = elemsize * (off + (cb >> 16)); - cb += inc; } } @@ -684,7 +719,7 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], uint8_t *y_table; uint16_t *y_table16; uint32_t *y_table32; - int i, base, rbase, gbase, bbase, abase, needAlpha; + int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha; const int yoffs = fullRange ? 384 : 326; int64_t crv = inv_table[0]; @@ -729,12 +764,12 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], c->yuv2rgb_u2b_coeff = (int16_t)roundToInt16(cbu << 13); //scale coefficients by cy - crv = ((crv << 16) + 0x8000) / cy; - cbu = ((cbu << 16) + 0x8000) / cy; - cgu = ((cgu << 16) + 0x8000) / cy; - cgv = ((cgv << 16) + 0x8000) / cy; + crv = ((crv << 16) + 0x8000) / FFMAX(cy, 1); + cbu = ((cbu << 16) + 0x8000) / FFMAX(cy, 1); + cgu = ((cgu << 16) + 0x8000) / FFMAX(cy, 1); + cgv = ((cgv << 16) + 0x8000) / FFMAX(cy, 1); - av_free(c->yuvTable); + av_freep(&c->yuvTable); switch (bpp) { case 1: @@ -847,6 +882,7 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], fill_gv_table(c->table_gV, 1, cgv); break; case 32: + case 64: base = (c->dstFormat == AV_PIX_FMT_RGB32_1 || c->dstFormat == AV_PIX_FMT_BGR32_1) ? 8 : 0; rbase = base + (isRgb ? 16 : 0); @@ -872,7 +908,6 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], fill_gv_table(c->table_gV, 4, cgv); break; default: - c->yuvTable = NULL; if(!isPlanar(c->dstFormat) || bpp <= 24) av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp); return -1; |