32 files changed, 1322 insertions, 911 deletions
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 36c2bef..e78ec42 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -1,3 +1,5 @@
+include $(SUBDIR)../config.mak
+
 NAME = swscale
 FFLIBS = avutil
 
@@ -20,6 +22,8 @@ MMX-OBJS-$(HAVE_YASM)      +=  x86/input.o              \
                                x86/output.o             \
                                x86/scale.o
 
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
 OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 
 TESTPROGS = colorspace swscale
diff --git a/libswscale/bfin/internal_bfin.S b/libswscale/bfin/internal_bfin.S
index b007f07..eab30aa 100644
--- a/libswscale/bfin/internal_bfin.S
+++ b/libswscale/bfin/internal_bfin.S
@@ -5,20 +5,20 @@
  * Blackfin video color space converter operations
  * convert I420 YV12 to RGB in various formats
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c
index f9eba1e..3cd4f28 100644
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -3,20 +3,20 @@
  *
  * Blackfin software video scaler operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c
index 91a7aee..e7f657f 100644
--- a/libswscale/bfin/yuv2rgb_bfin.c
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -4,20 +4,20 @@
  * Blackfin video color space converter operations
  * convert I420 YV12 to RGB in various formats
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include <inttypes.h>
 #include <assert.h>
 #include <unistd.h>
+#include "libavutil/pixdesc.h"
 
 #include "config.h"
 #include "libswscale/rgb2rgb.h"
@@ -198,7 +199,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
     }
 
     av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
-           sws_format_name(c->dstFormat));
+           av_get_pix_fmt_name(c->dstFormat));
 
     return f;
 }
diff --git a/libswscale/colorspace-test.c b/libswscale/colorspace-test.c
index 135924c..89713a8 100644
--- a/libswscale/colorspace-test.c
+++ b/libswscale/colorspace-test.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 #define FUNC(s, d, n) { s, d, #n, n }
 
-int main(void)
+int main(int argc, char **argv)
 {
     int i, funcNum;
     uint8_t *srcBuffer = av_malloc(SIZE);
diff --git a/libswscale/input.c b/libswscale/input.c
index e636eac..9838677 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -104,7 +104,7 @@ rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
 #undef input_pixel
 
 #define rgb48funcs(pattern, BE_LE, origin) \
-static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
+static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
                                     int width, uint32_t *unused) \
 { \
     const uint16_t *src = (const uint16_t *) _src; \
@@ -113,7 +113,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src,
 } \
  \
 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
-                                    const uint8_t *_src1, const uint8_t *_src2, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
                                     int width, uint32_t *unused) \
 { \
     const uint16_t *src1 = (const uint16_t *) _src1, \
@@ -123,7 +123,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
 } \
  \
 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
-                                    const uint8_t *_src1, const uint8_t *_src2, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
                                     int width, uint32_t *unused) \
 { \
     const uint16_t *src1 = (const uint16_t *) _src1, \
@@ -142,14 +142,14 @@ rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
 
 static av_always_inline void
-rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
+rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
                        int width, enum PixelFormat origin,
                        int shr,   int shg,   int shb, int shp,
                        int maskr, int maskg, int maskb,
                        int rsh,   int gsh,   int bsh, int S)
 {
     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
-    const unsigned rnd = 33u << (S - 1);
+    const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
     int i;
 
     for (i = 0; i < width; i++) {
@@ -158,12 +158,12 @@ rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
         int g = (px & maskg) >> shg;
         int r = (px & maskr) >> shr;
 
-        dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
+        dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
     }
 }
 
 static av_always_inline void
-rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
+rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
                         const uint8_t *src, int width,
                         enum PixelFormat origin,
                         int shr,   int shg,   int shb, int shp,
@@ -172,7 +172,7 @@ rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
 {
     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
-    const unsigned rnd = 257u << (S - 1);
+    const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
     int i;
 
     for (i = 0; i < width; i++) {
@@ -181,13 +181,13 @@ rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
         int g = (px & maskg) >> shg;
         int r = (px & maskr) >> shr;
 
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
+        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
     }
 }
 
 static av_always_inline void
-rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
+rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
                              const uint8_t *src, int width,
                              enum PixelFormat origin,
                              int shr,   int shg,   int shb, int shp,
@@ -197,7 +197,7 @@ rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
               maskgx = ~(maskr | maskb);
-    const unsigned rnd = 257u << S;
+    const unsigned rnd = (256U<<(S)) + (1<<(S-6));
     int i;
 
     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
@@ -216,8 +216,8 @@ rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
         }
         r = (rb & maskr) >> shr;
 
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
+        dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+        dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
     }
 }
 
@@ -225,26 +225,29 @@ rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
 
 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
                          maskg, maskb, rsh, gsh, bsh, S) \
-static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
+static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
                           int width, uint32_t *unused) \
 { \
-    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
+    rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, \
+                           shr, shg, shb, shp, \
                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
 } \
  \
 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
-                           const uint8_t *src, const uint8_t *dummy, \
+                           const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
                            int width, uint32_t *unused) \
 { \
-    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
+    rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,  \
+                            shr, shg, shb, shp, \
                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
 } \
  \
 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
-                                const uint8_t *src, const uint8_t *dummy, \
+                                const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
                                 int width, uint32_t *unused) \
 { \
-    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
+    rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
+                                 shr, shg, shb, shp, \
                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
 }
 
@@ -265,34 +268,59 @@ rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x0
 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
 rgb16_32_wrapper(PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT+4)
 
-static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
+static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
+                         const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
+                         int width, enum PixelFormat origin)
 {
     int i;
     for (i=0; i<width; i++) {
-        dst[i]= src[4*i];
+        unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
+        unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
+        unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
+
+        dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+        dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+    }
+}
+
+static void abgrToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i]<<6;
+    }
+}
+
+static void rgbaToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i+3]<<6;
     }
 }
 
-static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
+static void palToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
 {
     int i;
     for (i=0; i<width; i++) {
-        dst[i]= src[4*i+3];
+        int d= src[i];
+
+        dst[i]= (pal[d] >> 24)<<6;
     }
 }
 
-static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
+static void palToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, long width, uint32_t *pal)
 {
     int i;
     for (i=0; i<width; i++) {
         int d= src[i];
 
-        dst[i]= pal[d] & 0xFF;
+        dst[i]= (pal[d] & 0xFF)<<6;
     }
 }
 
-static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
-                      const uint8_t *src1, const uint8_t *src2,
+static void palToUV_c(uint16_t *dstU, int16_t *dstV,
+                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *pal)
 {
     int i;
@@ -300,34 +328,42 @@ static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
     for (i=0; i<width; i++) {
         int p= pal[src1[i]];
 
-        dstU[i]= p>>8;
-        dstV[i]= p>>16;
+        dstU[i]= (uint8_t)(p>> 8)<<6;
+        dstV[i]= (uint8_t)(p>>16)<<6;
     }
 }
 
-static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
+static void monowhite2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
 {
     int i, j;
     for (i=0; i<width/8; i++) {
         int d= ~src[i];
         for(j=0; j<8; j++)
-            dst[8*i+j]= ((d>>(7-j))&1)*255;
+            dst[8*i+j]= ((d>>(7-j))&1)*16383;
+    }
+    if(width&7){
+        int d= ~src[i];
+        for(j=0; j<(width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1)*16383;
     }
 }
 
-static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
+static void monoblack2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
 {
     int i, j;
     for (i=0; i<width/8; i++) {
         int d= src[i];
         for(j=0; j<8; j++)
-            dst[8*i+j]= ((d>>(7-j))&1)*255;
+            dst[8*i+j]= ((d>>(7-j))&1)*16383;
+    }
+    if(width&7){
+        int d= src[i];
+        for(j=0; j<(width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1)*16383;
     }
 }
 
-static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                       uint32_t *unused)
 {
     int i;
@@ -335,7 +371,7 @@ static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
         dst[i]= src[2*i];
 }
 
-static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                        const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -346,7 +382,7 @@ static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
     assert(src1 == src2);
 }
 
-static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
+static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
 {
     int i;
     const uint16_t *src = (const uint16_t *) _src;
@@ -356,7 +392,7 @@ static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *
     }
 }
 
-static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
+static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
                         const uint8_t *_src2, int width, uint32_t *unused)
 {
     int i;
@@ -371,7 +407,7 @@ static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
 
 /* This is almost identical to the previous, end exists only because
  * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                       uint32_t *unused)
 {
     int i;
@@ -379,7 +415,7 @@ static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
         dst[i]= src[2*i+1];
 }
 
-static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                        const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -401,14 +437,14 @@ static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
 }
 
 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                        int width, uint32_t *unused)
 {
     nvXXtoUV_c(dstU, dstV, src1, width);
 }
 
 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                        int width, uint32_t *unused)
 {
     nvXXtoUV_c(dstV, dstU, src1, width);
@@ -416,7 +452,7 @@ static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
 
 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
 
-static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
+static void bgr24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                        int width, uint32_t *unused)
 {
     int i;
@@ -425,11 +461,11 @@ static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
         int g= src[i*3+1];
         int r= src[i*3+2];
 
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
+        dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
     }
 }
 
-static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                         const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -438,13 +474,13 @@ static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g= src1[3*i + 1];
         int r= src1[3*i + 2];
 
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+        dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
     }
     assert(src1 == src2);
 }
 
-static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                              const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -453,13 +489,13 @@ static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g= src1[6*i + 1] + src1[6*i + 4];
         int r= src1[6*i + 2] + src1[6*i + 5];
 
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
+        dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+        dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
     }
     assert(src1 == src2);
 }
 
-static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void rgb24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
                        uint32_t *unused)
 {
     int i;
@@ -468,11 +504,11 @@ static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
         int g= src[i*3+1];
         int b= src[i*3+2];
 
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
+        dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
     }
 }
 
-static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                         const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -482,12 +518,12 @@ static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g= src1[3*i + 1];
         int b= src1[3*i + 2];
 
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+        dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
     }
 }
 
-static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                              const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -497,12 +533,12 @@ static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g= src1[6*i + 1] + src1[6*i + 4];
         int b= src1[6*i + 2] + src1[6*i + 5];
 
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
+        dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+        dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
     }
 }
 
-static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
+static void planar_rgb_to_y(uint16_t *dst, const uint8_t *src[4], int width)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -510,7 +546,7 @@ static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
         int b = src[1][i];
         int r = src[2][i];
 
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+        dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
     }
 }
 
@@ -542,7 +578,7 @@ static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width
     }
 }
 
-static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width)
+static void planar_rgb_to_uv(uint16_t *dstU, uint16_t *dstV, const uint8_t *src[4], int width)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -550,8 +586,8 @@ static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4]
         int b = src[1][i];
         int r = src[2][i];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+        dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
     }
 }
 
@@ -653,6 +689,7 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
+        case PIX_FMT_GBR24P  : c->chrToYV12 = gbr24pToUV_half_c;  break;
         case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_half_c; break;
         case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_half_c; break;
         }
@@ -756,6 +793,7 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case PIX_FMT_ABGR:
         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
+        case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
         }
     }
 }
diff --git a/libswscale/options.c b/libswscale/options.c
index 9ba6e5e..65095d5 100644
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/output.c b/libswscale/output.c
index 8263da1..cae2c31 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 #include <string.h>
 
 #include "libavutil/avutil.h"
+#include "libavutil/avassert.h"
 #include "libavutil/bswap.h"
 #include "libavutil/cpu.h"
 #include "libavutil/intreadwrite.h"
@@ -135,7 +136,8 @@ yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
                          int big_endian, int output_bits)
 {
     int i;
-    int shift = 19 - output_bits;
+    int shift = 3;
+    av_assert0(output_bits == 16);
 
     for (i = 0; i < dstW; i++) {
         int val = src[i] + (1 << (shift - 1));
@@ -149,10 +151,11 @@ yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
                          int big_endian, int output_bits)
 {
     int i;
-    int shift = 15 + 16 - output_bits;
+    int shift = 15;
+    av_assert0(output_bits == 16);
 
     for (i = 0; i < dstW; i++) {
-        int val = 1 << (30-output_bits);
+        int val = 1 << (shift - 1);
         int j;
 
         /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
@@ -199,7 +202,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
     int shift = 11 + 16 - output_bits;
 
     for (i = 0; i < dstW; i++) {
-        int val = 1 << (26-output_bits);
+        int val = 1 << (shift - 1);
         int j;
 
         for (j = 0; j < filterSize; j++)
@@ -388,14 +391,14 @@ yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
     for (i = 0; i < dstW - 7; i += 8) {
         int acc = 0;
 
-        accumulate_bit(acc, (buf0[i + 0] >> 7) + d128[0]);
-        accumulate_bit(acc, (buf0[i + 1] >> 7) + d128[1]);
-        accumulate_bit(acc, (buf0[i + 2] >> 7) + d128[2]);
-        accumulate_bit(acc, (buf0[i + 3] >> 7) + d128[3]);
-        accumulate_bit(acc, (buf0[i + 4] >> 7) + d128[4]);
-        accumulate_bit(acc, (buf0[i + 5] >> 7) + d128[5]);
-        accumulate_bit(acc, (buf0[i + 6] >> 7) + d128[6]);
-        accumulate_bit(acc, (buf0[i + 7] >> 7) + d128[7]);
+        accumulate_bit(acc, ((buf0[i + 0] + 64) >> 7) + d128[0]);
+        accumulate_bit(acc, ((buf0[i + 1] + 64) >> 7) + d128[1]);
+        accumulate_bit(acc, ((buf0[i + 2] + 64) >> 7) + d128[2]);
+        accumulate_bit(acc, ((buf0[i + 3] + 64) >> 7) + d128[3]);
+        accumulate_bit(acc, ((buf0[i + 4] + 64) >> 7) + d128[4]);
+        accumulate_bit(acc, ((buf0[i + 5] + 64) >> 7) + d128[5]);
+        accumulate_bit(acc, ((buf0[i + 6] + 64) >> 7) + d128[6]);
+        accumulate_bit(acc, ((buf0[i + 7] + 64) >> 7) + d128[7]);
 
         output_pixel(*dest++, acc);
     }
@@ -511,6 +514,13 @@ yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 
+        if ((Y1 | Y2 | U | V) & 0x100) {
+            Y1 = av_clip_uint8(Y1);
+            Y2 = av_clip_uint8(Y2);
+            U  = av_clip_uint8(U);
+            V  = av_clip_uint8(V);
+        }
+
         output_pixels(i * 4, Y1, U, Y2, V);
     }
 }
@@ -526,20 +536,34 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 
     if (uvalpha < 2048) {
         for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf0[i]        >> 7;
-            int V  = vbuf0[i]        >> 7;
+            int Y1 = (buf0[i * 2    ]+64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]+64) >> 7;
+            int U  = (ubuf0[i]       +64) >> 7;
+            int V  = (vbuf0[i]       +64) >> 7;
+
+            if ((Y1 | Y2 | U | V) & 0x100) {
+                Y1 = av_clip_uint8(Y1);
+                Y2 = av_clip_uint8(Y2);
+                U  = av_clip_uint8(U);
+                V  = av_clip_uint8(V);
+            }
 
             output_pixels(i * 4, Y1, U, Y2, V);
         }
     } else {
         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
         for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+            int Y1 = (buf0[i * 2    ]    + 64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]    + 64) >> 7;
+            int U  = (ubuf0[i] + ubuf1[i]+128) >> 8;
+            int V  = (vbuf0[i] + vbuf1[i]+128) >> 8;
+
+            if ((Y1 | Y2 | U | V) & 0x100) {
+                Y1 = av_clip_uint8(Y1);
+                Y2 = av_clip_uint8(Y2);
+                U  = av_clip_uint8(U);
+                V  = av_clip_uint8(V);
+            }
 
             output_pixels(i * 4, Y1, U, Y2, V);
         }
@@ -792,7 +816,7 @@ YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
  * correct RGB values into the destination buffer.
  */
 static av_always_inline void
-yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
               unsigned A1, unsigned A2,
               const void *_r, const void *_g, const void *_b, int y,
               enum PixelFormat target, int hasAlpha)
@@ -828,6 +852,7 @@ yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
 
 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
+
         dest[i * 6 + 0] = r_b[Y1];
         dest[i * 6 + 1] =   g[Y1];
         dest[i * 6 + 2] = b_r[Y1];
@@ -933,12 +958,6 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
         Y2 >>= 19;
         U  >>= 19;
         V  >>= 19;
-        if ((Y1 | Y2 | U | V) & 0x100) {
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-        }
         if (hasAlpha) {
             A1 = 1 << 18;
             A2 = 1 << 18;
@@ -954,10 +973,9 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
             }
         }
 
-        /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
-        r =  c->table_rV[V];
-        g = (c->table_gU[U] + c->table_gV[V]);
-        b =  c->table_bU[U];
+        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
         yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
                       r, g, b, y, target, hasAlpha);
@@ -986,9 +1004,9 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
         int A1, A2;
-        const void *r =  c->table_rV[V],
-                   *g = (c->table_gU[U] + c->table_gV[V]),
-                   *b =  c->table_bU[U];
+        const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                   *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                   *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
         if (hasAlpha) {
             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
@@ -1012,18 +1030,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
 
     if (uvalpha < 2048) {
         for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf0[i]        >> 7;
-            int V  = vbuf0[i]        >> 7;
+            int Y1 = (buf0[i * 2    ] + 64) >> 7;
+            int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
+            int U  = (ubuf0[i]        + 64) >> 7;
+            int V  = (vbuf0[i]        + 64) >> 7;
             int A1, A2;
-            const void *r =  c->table_rV[V],
-                       *g = (c->table_gU[U] + c->table_gV[V]),
-                       *b =  c->table_bU[U];
+            const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                       *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                       *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
             if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
+                A1 = (abuf0[i * 2    ] + 64) >> 7;
+                A2 = (abuf0[i * 2 + 1] + 64) >> 7;
             }
 
             yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
@@ -1032,18 +1050,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
     } else {
         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
         for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+            int Y1 = (buf0[i * 2    ]     +  64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]     +  64) >> 7;
+            int U  = (ubuf0[i] + ubuf1[i] + 128) >> 8;
+            int V  = (vbuf0[i] + vbuf1[i] + 128) >> 8;
             int A1, A2;
-            const void *r =  c->table_rV[V],
-                       *g = (c->table_gU[U] + c->table_gV[V]),
-                       *b =  c->table_bU[U];
+            const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                       *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                       *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
             if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
+                A1 = (abuf0[i * 2    ] + 64) >> 7;
+                A2 = (abuf0[i * 2 + 1] + 64) >> 7;
             }
 
             yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
@@ -1117,9 +1135,9 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
 
     for (i = 0; i < dstW; i++) {
         int j;
-        int Y = 0;
-        int U = -128 << 19;
-        int V = -128 << 19;
+        int Y = 1<<9;
+        int U = (1<<9)-(128 << 19);
+        int V = (1<<9)-(128 << 19);
         int R, G, B, A;
 
         for (j = 0; j < lumFilterSize; j++) {
@@ -1133,7 +1151,7 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
         U >>= 10;
         V >>= 10;
         if (hasAlpha) {
-            A = 1 << 21;
+            A = 1 << 18;
             for (j = 0; j < lumFilterSize; j++) {
                 A += alpSrc[j][i] * lumFilter[j];
             }
@@ -1176,7 +1194,6 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
             dest[1] = B >> 22;
             dest[2] = G >> 22;
             dest[3] = R >> 22;
-            dest += 4;
             break;
         case PIX_FMT_BGR24:
             dest[0] = B >> 22;
@@ -1307,7 +1324,10 @@ void ff_sws_init_output_funcs(SwsContext *c,
             *yuv2packedX = yuv2bgr24_full_X_c;
             break;
         }
+        if(!*yuv2packedX)
+            goto YUV_PACKED;
     } else {
+        YUV_PACKED:
         switch (dstFormat) {
         case PIX_FMT_RGB48LE:
             *yuv2packed1 = yuv2rgb48le_1_c;
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 13c63e8..f8f109e 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
index 73c02e9..8e84c26 100644
--- a/libswscale/ppc/yuv2rgb_altivec.c
+++ b/libswscale/ppc/yuv2rgb_altivec.c
@@ -3,20 +3,20 @@
  *
  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,6 +95,7 @@ adjustment.
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
 #include "yuv2rgb_altivec.h"
 
 #undef PROFILE_THE_BEAST
@@ -298,7 +299,7 @@ static int altivec_##name (SwsContext *c,                               \
     vector signed short R1,G1,B1;                                       \
     vector unsigned char R,G,B;                                         \
                                                                         \
-    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
+    const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;            \
     vector unsigned char align_perm;                                    \
                                                                         \
     vector signed short                                                 \
@@ -335,10 +336,10 @@ static int altivec_##name (SwsContext *c,                               \
                                                                         \
         for (j=0;j<w/16;j++) {                                          \
                                                                         \
-            y1ivP = (vector unsigned char *)y1i;                        \
-            y2ivP = (vector unsigned char *)y2i;                        \
-            uivP  = (vector unsigned char *)ui;                         \
-            vivP  = (vector unsigned char *)vi;                         \
+            y1ivP = (const vector unsigned char *)y1i;                  \
+            y2ivP = (const vector unsigned char *)y2i;                  \
+            uivP  = (const vector unsigned char *)ui;                   \
+            vivP  = (const vector unsigned char *)vi;                   \
                                                                         \
             align_perm = vec_lvsl (0, y1i);                             \
             y0 = (vector unsigned char)                                 \
@@ -720,7 +721,7 @@ ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
                 static int printed_error_message;
                 if (!printed_error_message) {
                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
-                           sws_format_name(c->dstFormat));
+                           av_get_pix_fmt_name(c->dstFormat));
                     printed_error_message=1;
                 }
                 return;
@@ -795,7 +796,7 @@ ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
         default:
             /* Unreachable, I think. */
             av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
-                   sws_format_name(c->dstFormat));
+                   av_get_pix_fmt_name(c->dstFormat));
             return;
         }
 
diff --git a/libswscale/ppc/yuv2rgb_altivec.h b/libswscale/ppc/yuv2rgb_altivec.h
index 2c5e7ed..aa52a47 100644
--- a/libswscale/ppc/yuv2rgb_altivec.h
+++ b/libswscale/ppc/yuv2rgb_altivec.h
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/ppc/yuv2yuv_altivec.c b/libswscale/ppc/yuv2yuv_altivec.c
index 4cd02ff..82c265a 100644
--- a/libswscale/ppc/yuv2yuv_altivec.c
+++ b/libswscale/ppc/yuv2yuv_altivec.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 5ae0703..3ef0925 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -6,20 +6,20 @@
  * Written by Nick Kurshev.
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <inttypes.h>
@@ -171,13 +171,13 @@ void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
         bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
 #else
-        *d++ = (bgr&0xF800)>>8;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0x1F)<<3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
         *d++ = 255;
 #endif
     }
@@ -211,9 +211,9 @@ void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0xF800)>>8;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0x1F)<<3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
     }
 }
 
@@ -250,13 +250,13 @@ void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
         bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
 #else
-        *d++ = (bgr&0x7C00)>>7;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x1F)<<3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
         *d++ = 255;
 #endif
     }
@@ -271,9 +271,9 @@ void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x7C00)>>7;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x1F)<<3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
     }
 }
 
@@ -314,21 +314,6 @@ void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
     }
 }
 
-void bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size)
-{
-    int i;
-    int num_pixels = src_size;
-    for (i=0; i<num_pixels; i++) {
-        unsigned b,g,r;
-        register uint8_t rgb;
-        rgb = src[i];
-        r = (rgb&0x07);
-        g = (rgb&0x38)>>3;
-        b = (rgb&0xC0)>>6;
-        dst[i] = ((b<<1)&0x07) | ((g&0x07)<<3) | ((r&0x03)<<6);
-    }
-}
-
 #define DEFINE_SHUFFLE_BYTES(a, b, c, d)                                \
 void shuffle_bytes_##a##b##c##d(const uint8_t *src, uint8_t *dst, int src_size) \
 {                                                                       \
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 42f468f..c447986 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -6,20 +6,20 @@
  *  Written by Nick Kurshev.
  *  YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,6 @@ void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
-void   bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size);
 
 void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size);
 void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index c02015e..6363bc1 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -7,20 +7,20 @@
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  * lot of big-endian byte order fixes by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -227,27 +227,6 @@ static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
     }
 }
 
-/*
-  I use less accurate approximation here by simply left-shifting the input
-  value and filling the low order bits with zeroes. This method improves PNG
-  compression but this scheme cannot reproduce white exactly, since it does
-  not generate an all-ones maximum value; the net effect is to darken the
-  image slightly.
-
-  The better method should be "left bit replication":
-
-   4 3 2 1 0
-   ---------
-   1 1 0 1 1
-
-   7 6 5 4 3  2 1 0
-   ----------------
-   1 1 0 1 1  1 1 0
-   |=======|  |===|
-       |      leftmost bits repeated to fill open bits
-       |
-   original bits
-*/
 static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
 {
     const uint16_t *end;
@@ -257,9 +236,9 @@ static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
     }
 }
 
@@ -272,9 +251,9 @@ static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
     }
 }
 
@@ -289,13 +268,13 @@ static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
         bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr&0x7C00)>>7;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x1F)<<3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
 #else
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
         *d++ = 255;
 #endif
     }
@@ -312,13 +291,13 @@ static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
         bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr&0xF800)>>8;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0x1F)<<3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
 #else
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
         *d++ = 255;
 #endif
     }
@@ -658,6 +637,9 @@ void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
         ydst += lumStride;
         src  += srcStride;
 
+        if(y+1 == height)
+            break;
+
         for (i=0; i<chromWidth; i++) {
             unsigned int b = src[6*i+0];
             unsigned int g = src[6*i+1];
diff --git a/libswscale/sparc/yuv2rgb_vis.c b/libswscale/sparc/yuv2rgb_vis.c
index bcd2081..62f5026 100644
--- a/libswscale/sparc/yuv2rgb_vis.c
+++ b/libswscale/sparc/yuv2rgb_vis.c
@@ -2,20 +2,20 @@
  * VIS optimized software YUV to RGB converter
  * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/swscale-test.c b/libswscale/swscale-test.c
index 3497dff..ef6c55c 100644
--- a/libswscale/swscale-test.c
+++ b/libswscale/swscale-test.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -103,6 +103,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
 
         av_image_fill_linesizes(srcStride, srcFormat, srcW);
         for (p = 0; p < 4; p++) {
+            srcStride[p] = FFALIGN(srcStride[p], 16);
             if (srcStride[p])
                 src[p] = av_mallocz(srcStride[p] * srcH + 16);
             if (srcStride[p] && !src[p]) {
@@ -136,6 +137,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
          * allocated with av_malloc). */
         /* An extra 16 bytes is being allocated because some scalers may write
          * out of bounds. */
+        dstStride[i] = FFALIGN(dstStride[i], 16);
         if (dstStride[i])
             dst[i] = av_mallocz(dstStride[i] * dstH + 16);
         if (dstStride[i] && !dst[i]) {
@@ -175,6 +177,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
         ssdA = r->ssdA;
     } else {
         for (i = 0; i < 4; i++) {
+            refStride[i] = FFALIGN(refStride[i], 16);
             if (refStride[i])
                 out[i] = av_mallocz(refStride[i] * h);
             if (refStride[i] && !out[i]) {
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index b231302..0578e2d 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "rgb2rgb.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/cpu.h"
 #include "libavutil/avutil.h"
@@ -47,6 +48,7 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 {  64, 64, 64, 64, 64, 64, 64, 64 };
 
+
 static av_always_inline void fillPlane(uint8_t* plane, int stride,
                                        int width, int height,
                                        int y, uint8_t val)
@@ -69,6 +71,9 @@ static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t
     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
     int sh = bits - 4;
 
+    if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
+        sh= 9;
+
     for (i = 0; i < dstW; i++) {
         int j;
         int srcPos = filterPos[i];
@@ -90,6 +95,9 @@ static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t
     const uint16_t *src = (const uint16_t *) _src;
     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
 
+    if(sh<15)
+        sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+
     for (i = 0; i < dstW; i++) {
         int j;
         int srcPos = filterPos[i];
@@ -204,7 +212,7 @@ static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
     int i;
     int32_t *dst = (int32_t *) _dst;
     for (i = 0; i < width; i++)
-        dst[i] = (dst[i]*14071 + (33561947<<4))>>14;
+        dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
 }
 
 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
@@ -218,6 +226,8 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
         xpos+=xInc;
     }
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+        dst[i] = src[srcW-1]*128;
 }
 
 // *** horizontal scale Y line to temp buffer
@@ -228,12 +238,12 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
                                      uint8_t *formatConvBuffer,
                                      uint32_t *pal, int isAlpha)
 {
-    void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
+    void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
     const uint8_t *src = src_in[isAlpha ? 3 : 0];
 
     if (toYV12) {
-        toYV12(formatConvBuffer, src, srcW, pal);
+        toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
         src= formatConvBuffer;
     } else if (c->readLumPlanar && !isAlpha) {
         c->readLumPlanar(formatConvBuffer, src_in, srcW);
@@ -263,6 +273,10 @@ static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
         xpos+=xInc;
     }
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+        dst1[i] = src1[srcW-1]*128;
+        dst2[i] = src2[srcW-1]*128;
+    }
 }
 
 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
@@ -273,12 +287,12 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2
 {
     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
     if (c->chrToYV12) {
-        uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
-        c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
+        uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
+        c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= buf2;
     } else if (c->readChrPlanar) {
-        uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
+        uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
         src1= formatConvBuffer;
         src2= buf2;
@@ -316,8 +330,6 @@ static int swScale(SwsContext *c, const uint8_t* src[],
     int16_t *vChrFilterPos= c->vChrFilterPos;
     int16_t *hLumFilterPos= c->hLumFilterPos;
     int16_t *hChrFilterPos= c->hChrFilterPos;
-    int16_t *vLumFilter= c->vLumFilter;
-    int16_t *vChrFilter= c->vChrFilter;
     int16_t *hLumFilter= c->hLumFilter;
     int16_t *hChrFilter= c->hChrFilter;
     int32_t *lumMmxFilter= c->lumMmxFilter;
@@ -337,13 +349,14 @@ static int swScale(SwsContext *c, const uint8_t* src[],
     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
     int lastDstY;
     uint32_t *pal=c->pal_yuv;
+    int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
+
     yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
     yuv2planarX_fn yuv2planeX = c->yuv2planeX;
     yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
-    int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);
 
     /* vars which will change and which we need to store back in the context */
     int dstY= c->dstY;
@@ -373,7 +386,7 @@ static int swScale(SwsContext *c, const uint8_t* src[],
     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
 
-    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
+    if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 || dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
         static int warnedAlready=0; //FIXME move this into the context perhaps
         if (flags & SWS_PRINT_INFO && !warnedAlready) {
             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
@@ -382,6 +395,18 @@ static int swScale(SwsContext *c, const uint8_t* src[],
         }
     }
 
+    if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
+        || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
+        || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
+    ) {
+        static int warnedAlready=0;
+        int cpu_flags = av_get_cpu_flags();
+        if (HAVE_MMX2 && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
+            av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
+            warnedAlready=1;
+        }
+    }
+
     /* Note the user might start scaling the picture in the middle so this
        will not get executed. This is not really intended but works
        currently, so people might do it. */
@@ -406,15 +431,14 @@ static int swScale(SwsContext *c, const uint8_t* src[],
             dst[2] + dstStride[2] * chrDstY,
             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
         };
+        int use_mmx_vfilter= c->use_mmx_vfilter;
 
         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
-
-        // Last line needed as input
-        int lastLumSrcY  = FFMIN(c->srcH,    firstLumSrcY  + vLumFilterSize) - 1;
-        int lastLumSrcY2 = FFMIN(c->srcH,    firstLumSrcY2 + vLumFilterSize) - 1;
-        int lastChrSrcY  = FFMIN(c->chrSrcH, firstChrSrcY  + vChrFilterSize) - 1;
+        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
+        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
+        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
         int enough_lines;
 
         //handle holes (FAST_BILINEAR & weird filters)
@@ -503,85 +527,66 @@ static int swScale(SwsContext *c, const uint8_t* src[],
             // hmm looks like we can't use MMX here without overwriting this array's tail
             ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
                                      &yuv2packed1, &yuv2packed2, &yuv2packedX);
+            use_mmx_vfilter= 0;
         }
 
         {
-            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-            const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-
-            if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
-                const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
-                int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
-                for (i = 0; i < neg;            i++)
-                    tmpY[i] = lumSrcPtr[neg];
-                for (     ; i < end;            i++)
-                    tmpY[i] = lumSrcPtr[i];
-                for (     ; i < vLumFilterSize; i++)
-                    tmpY[i] = tmpY[i-1];
-                lumSrcPtr = tmpY;
-
-                if (alpSrcPtr) {
-                    const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
-                    for (i = 0; i < neg;            i++)
-                        tmpA[i] = alpSrcPtr[neg];
-                    for (     ; i < end;            i++)
-                        tmpA[i] = alpSrcPtr[i];
-                    for (     ; i < vLumFilterSize; i++)
-                        tmpA[i] = tmpA[i - 1];
-                    alpSrcPtr = tmpA;
-                }
-            }
-            if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
-                const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize,
-                              **tmpV = (const int16_t **) chrVPixBuf + 2 * vChrBufSize;
-                int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
-                for (i = 0; i < neg;            i++) {
-                    tmpU[i] = chrUSrcPtr[neg];
-                    tmpV[i] = chrVSrcPtr[neg];
-                }
-                for (     ; i < end;            i++) {
-                    tmpU[i] = chrUSrcPtr[i];
-                    tmpV[i] = chrVSrcPtr[i];
-                }
-                for (     ; i < vChrFilterSize; i++) {
-                    tmpU[i] = tmpU[i - 1];
-                    tmpV[i] = tmpV[i - 1];
-                }
-                chrUSrcPtr = tmpU;
-                chrVSrcPtr = tmpV;
-            }
+            const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+            const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+            const int16_t **chrVSrcPtr= (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+            int16_t *vLumFilter= c->vLumFilter;
+            int16_t *vChrFilter= c->vChrFilter;
 
             if (isPlanarYUV(dstFormat) || (isGray(dstFormat) && !isALPHA(dstFormat))) { //YV12 like
                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
 
+                vLumFilter +=    dstY * vLumFilterSize;
+                vChrFilter += chrDstY * vChrFilterSize;
+
+//                 av_assert0(use_mmx_vfilter != (
+//                                yuv2planeX == yuv2planeX_10BE_c
+//                             || yuv2planeX == yuv2planeX_10LE_c
+//                             || yuv2planeX == yuv2planeX_9BE_c
+//                             || yuv2planeX == yuv2planeX_9LE_c
+//                             || yuv2planeX == yuv2planeX_16BE_c
+//                             || yuv2planeX == yuv2planeX_16LE_c
+//                             || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
+
+                if(use_mmx_vfilter){
+                    vLumFilter= c->lumMmxFilter;
+                    vChrFilter= c->chrMmxFilter;
+                }
+
                 if (vLumFilterSize == 1) {
                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
                 } else {
-                    yuv2planeX(vLumFilter + dstY * vLumFilterSize, vLumFilterSize,
+                    yuv2planeX(vLumFilter, vLumFilterSize,
                                lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
                 }
 
                 if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
                     if (yuv2nv12cX) {
-                        yuv2nv12cX(c, vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
+                        yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
                     } else if (vChrFilterSize == 1) {
                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
                     } else {
-                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize,
+                        yuv2planeX(vChrFilter, vChrFilterSize,
                                    chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
-                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize,
-                                   chrVSrcPtr, dest[2], chrDstW, c->chrDither8, 3);
+                        yuv2planeX(vChrFilter, vChrFilterSize,
+                                   chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
                     }
                 }
 
                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
+                    if(use_mmx_vfilter){
+                        vLumFilter= c->alpMmxFilter;
+                    }
                     if (vLumFilterSize == 1) {
                         yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
                     } else {
-                        yuv2planeX(vLumFilter + dstY * vLumFilterSize, vLumFilterSize,
+                        yuv2planeX(vLumFilter, vLumFilterSize,
                                    alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
                     }
                 }
@@ -643,6 +648,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
 
     ff_sws_init_input_funcs(c);
 
+
     if (c->srcBpc == 8) {
         if (c->dstBpc <= 10) {
             c->hyScale = c->hcScale = hScale8To15_c;
diff --git a/libswscale/swscale.h b/libswscale/swscale.h
index b5a6a57..fa7100c 100644
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 #define LIBSWSCALE_VERSION_MAJOR 2
 #define LIBSWSCALE_VERSION_MINOR 1
-#define LIBSWSCALE_VERSION_MICRO 0
+#define LIBSWSCALE_VERSION_MICRO 100
 
 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
                                                LIBSWSCALE_VERSION_MINOR, \
@@ -55,6 +55,9 @@
 #ifndef FF_API_SWS_CPU_CAPS
 #define FF_API_SWS_CPU_CAPS    (LIBSWSCALE_VERSION_MAJOR < 3)
 #endif
+#ifndef FF_API_SWS_FORMAT_NAME
+#define FF_API_SWS_FORMAT_NAME  (LIBSWSCALE_VERSION_MAJOR < 3)
+#endif
 
 /**
  * Return the LIBSWSCALE_VERSION_INT constant.
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 8da1201..18ec4d9 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,10 +34,14 @@
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
+#define YUVRGB_TABLE_HEADROOM 128
+
 #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
 
 #define MAX_FILTER_SIZE 256
 
+#define DITHER1XBPP
+
 #if HAVE_BIGENDIAN
 #define ALT32_CORR (-1)
 #else
@@ -315,10 +319,10 @@ typedef struct SwsContext {
     int dstY;                     ///< Last destination vertical line output from last slice.
     int flags;                    ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
     void *yuvTable;             // pointer to the yuv->rgb table start so it can be freed()
-    uint8_t *table_rV[256];
-    uint8_t *table_gU[256];
-    int table_gV[256];
-    uint8_t *table_bU[256];
+    uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
+    int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
 
     //Colorspace stuff
     int contrast, brightness, saturation;    // for sws_getColorspaceDetails
@@ -326,6 +330,8 @@ typedef struct SwsContext {
     int dstColorspaceTable[4];
     int srcRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (source      image).
     int dstRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
+    int src0Alpha;
+    int dst0Alpha;
     int yuv2rgb_y_offset;
     int yuv2rgb_y_coeff;
     int yuv2rgb_v2r_coeff;
@@ -380,8 +386,8 @@ typedef struct SwsContext {
     // alignment of these values is not necessary, but merely here
     // to maintain the same offset across x8632 and x86-64. Once we
     // use proper offset macros in the asm, they can be removed.
-    DECLARE_ALIGNED(8, ptrdiff_t, uv_off_px);   ///< offset (in pixels) between u and v planes
-    DECLARE_ALIGNED(8, ptrdiff_t, uv_off_byte); ///< offset (in bytes) between u and v planes
+    DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
+    DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
     DECLARE_ALIGNED(8, uint16_t, dither16)[8];
     DECLARE_ALIGNED(8, uint32_t, dither32)[8];
 
@@ -415,6 +421,8 @@ typedef struct SwsContext {
 #if HAVE_VIS
     DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
 #endif
+    int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
+    int use_mmx_vfilter;
 
     /* function pointers for swScale() */
     yuv2planar1_fn yuv2plane1;
@@ -425,14 +433,14 @@ typedef struct SwsContext {
     yuv2packedX_fn yuv2packedX;
 
     /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
-    void (*lumToYV12)(uint8_t *dst, const uint8_t *src,
+    void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
     /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
-    void (*alpToYV12)(uint8_t *dst, const uint8_t *src,
+    void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
     /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
     void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
-                      const uint8_t *src1, const uint8_t *src2,
+                      const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
 
     /**
@@ -537,7 +545,13 @@ SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c);
 SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c);
 void ff_bfin_get_unscaled_swscale(SwsContext *c);
 
+#if FF_API_SWS_FORMAT_NAME
+/**
+ * @deprecated Use av_get_pix_fmt_name() instead.
+ */
+attribute_deprecated
 const char *sws_format_name(enum PixelFormat format);
+#endif
 
 #define is16BPS(x) \
     (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 15)
@@ -546,6 +560,8 @@ const char *sws_format_name(enum PixelFormat format);
     (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 8 || \
      av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 9)
 
+#define isNBPS(x) is9_OR_10BPS(x)
+
 #define isBE(x) \
     (av_pix_fmt_descriptors[x].flags & PIX_FMT_BE)
 
@@ -559,7 +575,6 @@ const char *sws_format_name(enum PixelFormat format);
 
 #define isRGB(x) \
     (av_pix_fmt_descriptors[x].flags & PIX_FMT_RGB)
-
 #if 0 // FIXME
 #define isGray(x) \
     (!(av_pix_fmt_descriptors[x].flags & PIX_FMT_PAL) && \
@@ -572,55 +587,95 @@ const char *sws_format_name(enum PixelFormat format);
      (x) == PIX_FMT_GRAY16LE)
 #endif
 
-#define isRGBinInt(x)                  \
-    ((x) == PIX_FMT_RGB48BE     ||     \
-     (x) == PIX_FMT_RGB48LE     ||     \
-     (x) == PIX_FMT_RGB32       ||     \
-     (x) == PIX_FMT_RGB32_1     ||     \
-     (x) == PIX_FMT_RGB24       ||     \
-     (x) == PIX_FMT_RGB565BE    ||     \
-     (x) == PIX_FMT_RGB565LE    ||     \
-     (x) == PIX_FMT_RGB555BE    ||     \
-     (x) == PIX_FMT_RGB555LE    ||     \
-     (x) == PIX_FMT_RGB444BE    ||     \
-     (x) == PIX_FMT_RGB444LE    ||     \
-     (x) == PIX_FMT_RGB8        ||     \
-     (x) == PIX_FMT_RGB4        ||     \
-     (x) == PIX_FMT_RGB4_BYTE   ||     \
-     (x) == PIX_FMT_MONOBLACK   ||     \
-     (x) == PIX_FMT_MONOWHITE)
-
-#define isBGRinInt(x)                  \
-    ((x) == PIX_FMT_BGR48BE     ||     \
-     (x) == PIX_FMT_BGR48LE     ||     \
-     (x) == PIX_FMT_BGR32       ||     \
-     (x) == PIX_FMT_BGR32_1     ||     \
-     (x) == PIX_FMT_BGR24       ||     \
-     (x) == PIX_FMT_BGR565BE    ||     \
-     (x) == PIX_FMT_BGR565LE    ||     \
-     (x) == PIX_FMT_BGR555BE    ||     \
-     (x) == PIX_FMT_BGR555LE    ||     \
-     (x) == PIX_FMT_BGR444BE    ||     \
-     (x) == PIX_FMT_BGR444LE    ||     \
-     (x) == PIX_FMT_BGR8        ||     \
-     (x) == PIX_FMT_BGR4        ||     \
-     (x) == PIX_FMT_BGR4_BYTE   ||     \
-     (x) == PIX_FMT_MONOBLACK   ||     \
-     (x) == PIX_FMT_MONOWHITE)
-
-#define isAnyRGB(x)                    \
-    (isRGBinInt(x)              ||     \
-     isBGRinInt(x))
-
-#define isALPHA(x)                                            \
-    (av_pix_fmt_descriptors[x].nb_components == 2          || \
+#define isRGBinInt(x) \
+    (           \
+        (x)==PIX_FMT_RGB48BE   ||   \
+        (x)==PIX_FMT_RGB48LE   ||   \
+        (x)==PIX_FMT_RGBA64BE  ||   \
+        (x)==PIX_FMT_RGBA64LE  ||   \
+        (x)==PIX_FMT_RGB32     ||   \
+        (x)==PIX_FMT_RGB32_1   ||   \
+        (x)==PIX_FMT_RGB24     ||   \
+        (x)==PIX_FMT_RGB565BE  ||   \
+        (x)==PIX_FMT_RGB565LE  ||   \
+        (x)==PIX_FMT_RGB555BE  ||   \
+        (x)==PIX_FMT_RGB555LE  ||   \
+        (x)==PIX_FMT_RGB444BE  ||   \
+        (x)==PIX_FMT_RGB444LE  ||   \
+        (x)==PIX_FMT_RGB8      ||   \
+        (x)==PIX_FMT_RGB4      ||   \
+        (x)==PIX_FMT_RGB4_BYTE ||   \
+        (x)==PIX_FMT_MONOBLACK ||   \
+        (x)==PIX_FMT_MONOWHITE   \
+    )
+#define isBGRinInt(x) \
+    (           \
+         (x)==PIX_FMT_BGR48BE  ||   \
+         (x)==PIX_FMT_BGR48LE  ||   \
+         (x)==PIX_FMT_BGRA64BE ||   \
+         (x)==PIX_FMT_BGRA64LE ||   \
+         (x)==PIX_FMT_BGR32    ||   \
+         (x)==PIX_FMT_BGR32_1  ||   \
+         (x)==PIX_FMT_BGR24    ||   \
+         (x)==PIX_FMT_BGR565BE ||   \
+         (x)==PIX_FMT_BGR565LE ||   \
+         (x)==PIX_FMT_BGR555BE ||   \
+         (x)==PIX_FMT_BGR555LE ||   \
+         (x)==PIX_FMT_BGR444BE ||   \
+         (x)==PIX_FMT_BGR444LE ||   \
+         (x)==PIX_FMT_BGR8     ||   \
+         (x)==PIX_FMT_BGR4     ||   \
+         (x)==PIX_FMT_BGR4_BYTE||   \
+         (x)==PIX_FMT_MONOBLACK||   \
+         (x)==PIX_FMT_MONOWHITE   \
+    )
+
+#define isRGBinBytes(x) (           \
+           (x)==PIX_FMT_RGB48BE     \
+        || (x)==PIX_FMT_RGB48LE     \
+        || (x)==PIX_FMT_RGBA64BE    \
+        || (x)==PIX_FMT_RGBA64LE    \
+        || (x)==PIX_FMT_RGBA        \
+        || (x)==PIX_FMT_ARGB        \
+        || (x)==PIX_FMT_RGB24       \
+    )
+#define isBGRinBytes(x) (           \
+           (x)==PIX_FMT_BGR48BE     \
+        || (x)==PIX_FMT_BGR48LE     \
+        || (x)==PIX_FMT_BGRA64BE    \
+        || (x)==PIX_FMT_BGRA64LE    \
+        || (x)==PIX_FMT_BGRA        \
+        || (x)==PIX_FMT_ABGR        \
+        || (x)==PIX_FMT_BGR24       \
+    )
+
+#define isAnyRGB(x) \
+    (           \
+          isRGBinInt(x)       ||    \
+          isBGRinInt(x)       ||    \
+          (x)==PIX_FMT_GBR24P     \
+    )
+
+#define isALPHA(x) \
+    (av_pix_fmt_descriptors[x].nb_components == 2 || \
      av_pix_fmt_descriptors[x].nb_components == 4)
 
+#if 1
+#define isPacked(x)         (       \
+           (x)==PIX_FMT_PAL8        \
+        || (x)==PIX_FMT_YUYV422     \
+        || (x)==PIX_FMT_UYVY422     \
+        || (x)==PIX_FMT_Y400A       \
+        ||  isRGBinInt(x)           \
+        ||  isBGRinInt(x)           \
+    )
+#else
 #define isPacked(x)                                            \
     ((av_pix_fmt_descriptors[x].nb_components >= 2         &&  \
       !(av_pix_fmt_descriptors[x].flags & PIX_FMT_PLANAR)) ||  \
      (x) == PIX_FMT_PAL8)
 
+#endif
 #define isPlanar(x) \
     (av_pix_fmt_descriptors[x].nb_components >= 2          &&  \
      (av_pix_fmt_descriptors[x].flags & PIX_FMT_PLANAR))
@@ -631,6 +686,9 @@ const char *sws_format_name(enum PixelFormat format);
 
 extern const uint64_t ff_dither4[2];
 extern const uint64_t ff_dither8[2];
+extern const uint8_t dithers[8][8][8];
+extern const uint16_t dither_scale[15][16];
+
 
 extern const AVClass sws_context_class;
 
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 5fe2b14..dd7b04c 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,48 +34,6 @@
 #include "libavutil/bswap.h"
 #include "libavutil/pixdesc.h"
 
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_1)[8][8] = {
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-};
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_3)[8][8] = {
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-};
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_64)[8][8] = {
-    {  18, 34, 30, 46, 17, 33, 29, 45,},
-    {  50,  2, 62, 14, 49,  1, 61, 13,},
-    {  26, 42, 22, 38, 25, 41, 21, 37,},
-    {  58, 10, 54,  6, 57,  9, 53,  5,},
-    {  16, 32, 28, 44, 19, 35, 31, 47,},
-    {  48,  0, 60, 12, 51,  3, 63, 15,},
-    {  24, 40, 20, 36, 27, 43, 23, 39,},
-    {  56,  8, 52,  4, 59, 11, 55,  7,},
-};
-extern const uint8_t dither_8x8_128[8][8];
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_256)[8][8] = {
-    {  72, 136, 120, 184,  68, 132, 116, 180,},
-    { 200,   8, 248,  56, 196,   4, 244,  52,},
-    { 104, 168,  88, 152, 100, 164,  84, 148,},
-    { 232,  40, 216,  24, 228,  36, 212,  20,},
-    {  64, 128, 102, 176,  76, 140, 124, 188,},
-    { 192,   0, 240,  48, 204,  12, 252,  60,},
-    {  96, 160,  80, 144, 108, 172,  92, 156,},
-    { 224,  32, 208,  16, 236,  44, 220,  28,},
-};
-
 #define RGB2YUV_SHIFT 15
 #define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
 #define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
@@ -87,6 +45,102 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_256)[8][8] = {
 #define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
 #define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
 
+DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
+{
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+},{
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+},{
+  {   2,  4,  3,  5,  2,  4,  3,  5,},
+  {   6,  0,  7,  1,  6,  0,  7,  1,},
+  {   3,  5,  2,  4,  3,  5,  2,  4,},
+  {   7,  1,  6,  0,  7,  1,  6,  0,},
+  {   2,  4,  3,  5,  2,  4,  3,  5,},
+  {   6,  0,  7,  1,  6,  0,  7,  1,},
+  {   3,  5,  2,  4,  3,  5,  2,  4,},
+  {   7,  1,  6,  0,  7,  1,  6,  0,},
+},{
+  {   4,  8,  7, 11,  4,  8,  7, 11,},
+  {  12,  0, 15,  3, 12,  0, 15,  3,},
+  {   6, 10,  5,  9,  6, 10,  5,  9,},
+  {  14,  2, 13,  1, 14,  2, 13,  1,},
+  {   4,  8,  7, 11,  4,  8,  7, 11,},
+  {  12,  0, 15,  3, 12,  0, 15,  3,},
+  {   6, 10,  5,  9,  6, 10,  5,  9,},
+  {  14,  2, 13,  1, 14,  2, 13,  1,},
+},{
+  {   9, 17, 15, 23,  8, 16, 14, 22,},
+  {  25,  1, 31,  7, 24,  0, 30,  6,},
+  {  13, 21, 11, 19, 12, 20, 10, 18,},
+  {  29,  5, 27,  3, 28,  4, 26,  2,},
+  {   8, 16, 14, 22,  9, 17, 15, 23,},
+  {  24,  0, 30,  6, 25,  1, 31,  7,},
+  {  12, 20, 10, 18, 13, 21, 11, 19,},
+  {  28,  4, 26,  2, 29,  5, 27,  3,},
+},{
+  {  18, 34, 30, 46, 17, 33, 29, 45,},
+  {  50,  2, 62, 14, 49,  1, 61, 13,},
+  {  26, 42, 22, 38, 25, 41, 21, 37,},
+  {  58, 10, 54,  6, 57,  9, 53,  5,},
+  {  16, 32, 28, 44, 19, 35, 31, 47,},
+  {  48,  0, 60, 12, 51,  3, 63, 15,},
+  {  24, 40, 20, 36, 27, 43, 23, 39,},
+  {  56,  8, 52,  4, 59, 11, 55,  7,},
+},{
+  {  18, 34, 30, 46, 17, 33, 29, 45,},
+  {  50,  2, 62, 14, 49,  1, 61, 13,},
+  {  26, 42, 22, 38, 25, 41, 21, 37,},
+  {  58, 10, 54,  6, 57,  9, 53,  5,},
+  {  16, 32, 28, 44, 19, 35, 31, 47,},
+  {  48,  0, 60, 12, 51,  3, 63, 15,},
+  {  24, 40, 20, 36, 27, 43, 23, 39,},
+  {  56,  8, 52,  4, 59, 11, 55,  7,},
+},{
+  {  36, 68, 60, 92, 34, 66, 58, 90,},
+  { 100,  4,124, 28, 98,  2,122, 26,},
+  {  52, 84, 44, 76, 50, 82, 42, 74,},
+  { 116, 20,108, 12,114, 18,106, 10,},
+  {  32, 64, 56, 88, 38, 70, 62, 94,},
+  {  96,  0,120, 24,102,  6,126, 30,},
+  {  48, 80, 40, 72, 54, 86, 46, 78,},
+  { 112, 16,104,  8,118, 22,110, 14,},
+}};
+
+static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
+
+const uint16_t dither_scale[15][16]={
+{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
+{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
+{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
+{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
+{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
+{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
+{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
+{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
+{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
+{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
+{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
+{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
+{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
+{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
+{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
+};
+
+
 static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
                       uint8_t val)
 {
@@ -98,6 +152,20 @@ static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
     }
 }
 
+static void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
+                      int alpha, int bits)
+{
+    int i, j;
+    uint8_t *ptr = plane + stride * y;
+    int v = alpha ? -1 : (1<<bits);
+    for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j++) {
+            AV_WN16(ptr+2*j, v);
+        }
+        ptr += stride;
+    }
+}
+
 static void copyPlane(const uint8_t *src, int srcStride,
                       int srcSliceY, int srcSliceH, int width,
                       uint8_t *dst, int dstStride)
@@ -310,7 +378,7 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
     uint8_t *dstPtr = dst[0] + dstStride[0] * srcSliceY;
     const uint8_t *srcPtr = src[0];
 
-    if (srcFormat == PIX_FMT_Y400A) {
+    if (srcFormat == PIX_FMT_GRAY8A) {
         switch (dstFormat) {
         case PIX_FMT_RGB32  : conv = gray8aToPacked32; break;
         case PIX_FMT_BGR32  : conv = gray8aToPacked32; break;
@@ -332,7 +400,7 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
 
     if (!conv)
         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
-               sws_format_name(srcFormat), sws_format_name(dstFormat));
+               av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
     else {
         for (i = 0; i < srcSliceH; i++) {
             conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
@@ -344,6 +412,92 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
     return srcSliceH;
 }
 
+static void gbr24ptopacked24(const uint8_t* src[], int srcStride[], uint8_t* dst, int dstStride, int srcSliceH, int width)
+{
+    int x, h, i;
+    for (h = 0; h < srcSliceH; h++) {
+        uint8_t *dest = dst + dstStride * h;
+        for (x = 0; x < width; x++) {
+            *dest++ = src[0][x];
+            *dest++ = src[1][x];
+            *dest++ = src[2][x];
+        }
+
+        for (i = 0; i < 3; i++)
+            src[i] += srcStride[i];
+    }
+}
+
+static void gbr24ptopacked32(const uint8_t* src[], int srcStride[], uint8_t* dst, int dstStride, int srcSliceH, int alpha_first, int width)
+{
+    int x, h, i;
+    for (h = 0; h < srcSliceH; h++) {
+        uint8_t *dest = dst + dstStride * h;
+
+        if (alpha_first) {
+            for (x = 0; x < width; x++) {
+                *dest++ = 0xff;
+                *dest++ = src[0][x];
+                *dest++ = src[1][x];
+                *dest++ = src[2][x];
+            }
+        } else {
+            for (x = 0; x < width; x++) {
+                *dest++ = src[0][x];
+                *dest++ = src[1][x];
+                *dest++ = src[2][x];
+                *dest++ = 0xff;
+            }
+        }
+
+        for (i = 0; i < 3; i++)
+            src[i] += srcStride[i];
+    }
+}
+
+static int planarRgbToRgbWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
+        int srcSliceH, uint8_t* dst[], int dstStride[])
+{
+    int alpha_first = 0;
+    if (c->srcFormat != PIX_FMT_GBR24P) {
+        av_log(c, AV_LOG_ERROR, "unsupported planar RGB conversion %s -> %s\n",
+              av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
+        return srcSliceH;
+    }
+
+    switch (c->dstFormat) {
+        case PIX_FMT_BGR24:
+            gbr24ptopacked24((const uint8_t* []) {src[1], src[0], src[2]}, (int []) {srcStride[1], srcStride[0], srcStride[2]},
+                  dst[0] + srcSliceY * dstStride[0], dstStride[0], srcSliceH, c->srcW);
+            break;
+
+        case PIX_FMT_RGB24:
+            gbr24ptopacked24((const uint8_t* []) {src[2], src[0], src[1]}, (int []) {srcStride[2], srcStride[0], srcStride[1]},
+                  dst[0] + srcSliceY * dstStride[0], dstStride[0], srcSliceH, c->srcW);
+            break;
+
+        case PIX_FMT_ARGB:
+            alpha_first = 1;
+        case PIX_FMT_RGBA:
+            gbr24ptopacked32((const uint8_t* []) {src[2], src[0], src[1]}, (int []) {srcStride[2], srcStride[0], srcStride[1]},
+                  dst[0] + srcSliceY * dstStride[0], dstStride[0], srcSliceH, alpha_first, c->srcW);
+            break;
+
+        case PIX_FMT_ABGR:
+            alpha_first = 1;
+        case PIX_FMT_BGRA:
+            gbr24ptopacked32((const uint8_t* []) {src[1], src[0], src[2]}, (int []) {srcStride[1], srcStride[0], srcStride[2]},
+                  dst[0] + srcSliceY * dstStride[0], dstStride[0], srcSliceH, alpha_first, c->srcW);
+            break;
+
+        default:
+            av_log(c, AV_LOG_ERROR, "unsupported planar RGB conversion %s -> %s\n",
+                    av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
+    }
+
+    return srcSliceH;
+}
+
 #define isRGBA32(x) (            \
            (x) == PIX_FMT_ARGB   \
         || (x) == PIX_FMT_RGBA   \
@@ -442,7 +596,7 @@ static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
 
     if (!conv) {
         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
-               sws_format_name(srcFormat), sws_format_name(dstFormat));
+               av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
     } else {
         const uint8_t *srcPtr = src[0];
               uint8_t *dstPtr = dst[0];
@@ -532,25 +686,25 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
     return srcSliceH;
 }
 
-#define clip9(x)  av_clip_uintp2(x,  9)
-#define clip10(x) av_clip_uintp2(x, 10)
-#define DITHER_COPY(dst, dstStride, wfunc, src, srcStride, rfunc, dithers, shift, clip) \
-    for (i = 0; i < height; i++) { \
-        const uint8_t *dither = dithers[i & 7]; \
-        for (j = 0; j < length - 7; j += 8) { \
-            wfunc(&dst[j + 0], clip((rfunc(&src[j + 0]) + dither[0]) >> shift)); \
-            wfunc(&dst[j + 1], clip((rfunc(&src[j + 1]) + dither[1]) >> shift)); \
-            wfunc(&dst[j + 2], clip((rfunc(&src[j + 2]) + dither[2]) >> shift)); \
-            wfunc(&dst[j + 3], clip((rfunc(&src[j + 3]) + dither[3]) >> shift)); \
-            wfunc(&dst[j + 4], clip((rfunc(&src[j + 4]) + dither[4]) >> shift)); \
-            wfunc(&dst[j + 5], clip((rfunc(&src[j + 5]) + dither[5]) >> shift)); \
-            wfunc(&dst[j + 6], clip((rfunc(&src[j + 6]) + dither[6]) >> shift)); \
-            wfunc(&dst[j + 7], clip((rfunc(&src[j + 7]) + dither[7]) >> shift)); \
-        } \
-        for (; j < length; j++) \
-            wfunc(&dst[j],     (rfunc(&src[j]) + dither[j & 7]) >> shift); \
-        dst += dstStride; \
-        src += srcStride; \
+#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
+    uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
+    int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
+    for (i = 0; i < height; i++) {\
+        const uint8_t *dither= dithers[src_depth-9][i&7];\
+        for (j = 0; j < length-7; j+=8){\
+            dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
+            dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
+            dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
+            dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
+            dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
+            dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
+            dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
+            dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
+        }\
+        for (; j < length; j++)\
+            dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\
+        dst += dstStride;\
+        src += srcStride;\
     }
 
 static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
@@ -564,162 +718,99 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
         int height = (plane == 0 || plane == 3) ? srcSliceH: -((-srcSliceH) >> c->chrDstVSubSample);
         const uint8_t *srcPtr = src[plane];
         uint8_t *dstPtr = dst[plane] + dstStride[plane] * y;
+        int shiftonly= plane==1 || plane==2 || (!c->srcRange && plane==0);
 
         if (!dst[plane])
             continue;
         // ignore palette for GRAY8
         if (plane == 1 && !dst[2]) continue;
         if (!src[plane] || (plane == 1 && !src[2])) {
-            if (is16BPS(c->dstFormat))
-                length *= 2;
-            fillPlane(dst[plane], dstStride[plane], length, height, y,
-                      (plane == 3) ? 255 : 128);
+            if (is16BPS(c->dstFormat) || isNBPS(c->dstFormat)) {
+                fillPlane16(dst[plane], dstStride[plane], length, height, y,
+                        plane == 3, av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1);
+            } else {
+                fillPlane(dst[plane], dstStride[plane], length, height, y,
+                        (plane == 3) ? 255 : 128);
+            }
         } else {
-            if (is9_OR_10BPS(c->srcFormat)) {
+            if(isNBPS(c->srcFormat) || isNBPS(c->dstFormat)
+               || (is16BPS(c->srcFormat) != is16BPS(c->dstFormat))
+            ) {
                 const int src_depth = av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1 + 1;
                 const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1 + 1;
                 const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
+                uint16_t *dstPtr2 = (uint16_t*)dstPtr;
 
-                if (is16BPS(c->dstFormat)) {
-                    uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-#define COPY9_OR_10TO16(rfunc, wfunc) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            int srcpx = rfunc(&srcPtr2[j]); \
-                            wfunc(&dstPtr2[j], (srcpx << (16 - src_depth)) | (srcpx >> (2 * src_depth - 16))); \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr2 += srcStride[plane] / 2; \
+                if (dst_depth == 8) {
+                    if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+                        DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, , )
+                    } else {
+                        DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, av_bswap16, )
                     }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO16(AV_RB16, AV_WB16);
-                        } else {
-                            COPY9_OR_10TO16(AV_RL16, AV_WB16);
+                } else if (src_depth == 8) {
+                    for (i = 0; i < height; i++) {
+                        #define COPY816(w)\
+                        if(shiftonly){\
+                            for (j = 0; j < length; j++)\
+                                w(&dstPtr2[j], srcPtr[j]<<(dst_depth-8));\
+                        }else{\
+                            for (j = 0; j < length; j++)\
+                                w(&dstPtr2[j], (srcPtr[j]<<(dst_depth-8)) |\
+                                               (srcPtr[j]>>(2*8-dst_depth)));\
                         }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO16(AV_RB16, AV_WL16);
+                        if(isBE(c->dstFormat)){
+                            COPY816(AV_WB16)
                         } else {
-                            COPY9_OR_10TO16(AV_RL16, AV_WL16);
+                            COPY816(AV_WL16)
                         }
+                        dstPtr2 += dstStride[plane]/2;
+                        srcPtr  += srcStride[plane];
                     }
-                } else if (is9_OR_10BPS(c->dstFormat)) {
-                    uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-#define COPY9_OR_10TO9_OR_10(loop) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            loop; \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr2 += srcStride[plane] / 2; \
-                    }
-#define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) \
-                    if (dst_depth > src_depth) { \
-                        COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \
-                            wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 9))); \
-                    } else if (dst_depth < src_depth) { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_1, 1, clip9); \
-                    } else { \
-                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16);
-                        } else {
-                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16);
-                        }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16);
+                } else if (src_depth <= dst_depth) {
+                    for (i = 0; i < height; i++) {
+#define COPY_UP(r,w) \
+    if(shiftonly){\
+        for (j = 0; j < length; j++){ \
+            unsigned int v= r(&srcPtr2[j]);\
+            w(&dstPtr2[j], v<<(dst_depth-src_depth));\
+        }\
+    }else{\
+        for (j = 0; j < length; j++){ \
+            unsigned int v= r(&srcPtr2[j]);\
+            w(&dstPtr2[j], (v<<(dst_depth-src_depth)) | \
+                        (v>>(2*src_depth-dst_depth)));\
+        }\
+    }
+                        if(isBE(c->srcFormat)){
+                            if(isBE(c->dstFormat)){
+                                COPY_UP(AV_RB16, AV_WB16)
+                            } else {
+                                COPY_UP(AV_RB16, AV_WL16)
+                            }
                         } else {
-                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16);
+                            if(isBE(c->dstFormat)){
+                                COPY_UP(AV_RL16, AV_WB16)
+                            } else {
+                                COPY_UP(AV_RL16, AV_WL16)
+                            }
                         }
+                        dstPtr2 += dstStride[plane]/2;
+                        srcPtr2 += srcStride[plane]/2;
                     }
                 } else {
-#define W8(a, b) { *(a) = (b); }
-#define COPY9_OR_10TO8(rfunc) \
-                    if (src_depth == 9) { \
-                        DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_1, 1, av_clip_uint8); \
-                    } else { \
-                        DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_3, 2, av_clip_uint8); \
-                    }
-                    if (isBE(c->srcFormat)) {
-                        COPY9_OR_10TO8(AV_RB16);
-                    } else {
-                        COPY9_OR_10TO8(AV_RL16);
-                    }
-                }
-            } else if (is9_OR_10BPS(c->dstFormat)) {
-                const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1 + 1;
-                uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-
-                if (is16BPS(c->srcFormat)) {
-                    const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
-#define COPY16TO9_OR_10(rfunc, wfunc) \
-                    if (dst_depth == 9) { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_128, 7, clip9); \
-                    } else { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_64, 6, clip10); \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY16TO9_OR_10(AV_RB16, AV_WB16);
+                    if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+                        if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , )
                         } else {
-                            COPY16TO9_OR_10(AV_RL16, AV_WB16);
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , av_bswap16)
                         }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY16TO9_OR_10(AV_RB16, AV_WL16);
+                    }else{
+                        if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, )
                         } else {
-                            COPY16TO9_OR_10(AV_RL16, AV_WL16);
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, av_bswap16)
                         }
                     }
-                } else /* 8bit */ {
-#define COPY8TO9_OR_10(wfunc) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            const int srcpx = srcPtr[j]; \
-                            wfunc(&dstPtr2[j], (srcpx << (dst_depth - 8)) | (srcpx >> (16 - dst_depth))); \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr  += srcStride[plane]; \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        COPY8TO9_OR_10(AV_WB16);
-                    } else {
-                        COPY8TO9_OR_10(AV_WL16);
-                    }
-                }
-            } else if (is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
-                const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
-#define COPY16TO8(rfunc) \
-                    DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                srcPtr2, srcStride[plane] / 2, rfunc, \
-                                dither_8x8_256, 8, av_clip_uint8);
-                if (isBE(c->srcFormat)) {
-                    COPY16TO8(AV_RB16);
-                } else {
-                    COPY16TO8(AV_RL16);
-                }
-            } else if (!is16BPS(c->srcFormat) && is16BPS(c->dstFormat)) {
-                for (i = 0; i < height; i++) {
-                    for (j = 0; j < length; j++) {
-                        dstPtr[ j << 1     ] = srcPtr[j];
-                        dstPtr[(j << 1) + 1] = srcPtr[j];
-                    }
-                    srcPtr += srcStride[plane];
-                    dstPtr += dstStride[plane];
                 }
             } else if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat) &&
                       isBE(c->srcFormat) != isBE(c->dstFormat)) {
@@ -797,6 +888,17 @@ void ff_get_unscaled_swscale(SwsContext *c)
         && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
         c->swScale= rgbToRgbWrapper;
 
+#define isByteRGB(f) (\
+        f == PIX_FMT_RGB32   ||\
+        f == PIX_FMT_RGB32_1 ||\
+        f == PIX_FMT_RGB24   ||\
+        f == PIX_FMT_BGR32   ||\
+        f == PIX_FMT_BGR32_1 ||\
+        f == PIX_FMT_BGR24)
+
+    if (isAnyRGB(srcFormat) && isPlanar(srcFormat) && isByteRGB(dstFormat))
+        c->swScale= planarRgbToRgbWrapper;
+
     /* bswap 16 bits per pixel/component packed formats */
     if (IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_BGR444) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_BGR48)  ||
@@ -809,13 +911,7 @@ void ff_get_unscaled_swscale(SwsContext *c)
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_RGB565))
         c->swScale = packed_16bpc_bswap;
 
-    if ((usePal(srcFormat) && (
-        dstFormat == PIX_FMT_RGB32   ||
-        dstFormat == PIX_FMT_RGB32_1 ||
-        dstFormat == PIX_FMT_RGB24   ||
-        dstFormat == PIX_FMT_BGR32   ||
-        dstFormat == PIX_FMT_BGR32_1 ||
-        dstFormat == PIX_FMT_BGR24)))
+    if (usePal(srcFormat) && isByteRGB(dstFormat))
         c->swScale = palToRgbWrapper;
 
     if (srcFormat == PIX_FMT_YUV422P) {
@@ -846,13 +942,14 @@ void ff_get_unscaled_swscale(SwsContext *c)
     if (srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
         c->swScale = uyvyToYuv422Wrapper;
 
+#define isPlanarGray(x) (isGray(x) && (x) != PIX_FMT_GRAY8A)
     /* simple copy */
     if ( srcFormat == dstFormat ||
         (srcFormat == PIX_FMT_YUVA420P && dstFormat == PIX_FMT_YUV420P) ||
         (srcFormat == PIX_FMT_YUV420P && dstFormat == PIX_FMT_YUVA420P) ||
-        (isPlanarYUV(srcFormat) && isGray(dstFormat)) ||
-        (isPlanarYUV(dstFormat) && isGray(srcFormat)) ||
-        (isGray(dstFormat) && isGray(srcFormat)) ||
+        (isPlanarYUV(srcFormat) && isPlanarGray(dstFormat)) ||
+        (isPlanarYUV(dstFormat) && isPlanarGray(srcFormat)) ||
+        (isPlanarGray(dstFormat) && isPlanarGray(srcFormat)) ||
         (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat) &&
          c->chrDstHSubSample == c->chrSrcHSubSample &&
          c->chrDstVSubSample == c->chrSrcVSubSample &&
@@ -883,7 +980,7 @@ static void reset_ptr(const uint8_t *src[], int format)
     }
 }
 
-static int check_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt,
+static int check_image_pointers(const uint8_t * const data[4], enum PixelFormat pix_fmt,
                                 const int linesizes[4])
 {
     const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt];
@@ -908,9 +1005,10 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
                                   int srcSliceH, uint8_t *const dst[],
                                   const int dstStride[])
 {
-    int i;
+    int i, ret;
     const uint8_t *src2[4] = { srcSlice[0], srcSlice[1], srcSlice[2], srcSlice[3] };
     uint8_t *dst2[4] = { dst[0], dst[1], dst[2], dst[3] };
+    uint8_t *rgb0_tmp = NULL;
 
     // do not mess up sliceDir if we have a "trailing" 0-size slice
     if (srcSliceH == 0)
@@ -920,7 +1018,7 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
         av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
         return 0;
     }
-    if (!check_image_pointers(dst, c->dstFormat, dstStride)) {
+    if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
         av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
         return 0;
     }
@@ -935,9 +1033,10 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
 
     if (usePal(c->srcFormat)) {
         for (i = 0; i < 256; i++) {
-            int p, r, g, b, y, u, v;
+            int p, r, g, b, y, u, v, a = 0xff;
             if (c->srcFormat == PIX_FMT_PAL8) {
                 p = ((const uint32_t *)(srcSlice[1]))[i];
+                a = (p >> 24) & 0xFF;
                 r = (p >> 16) & 0xFF;
                 g = (p >>  8) & 0xFF;
                 b =  p        & 0xFF;
@@ -953,8 +1052,7 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
                 r = ( i >> 3     ) * 255;
                 g = ((i >> 1) & 3) * 85;
                 b = ( i       & 1) * 255;
-            } else if (c->srcFormat == PIX_FMT_GRAY8 ||
-                      c->srcFormat == PIX_FMT_Y400A) {
+            } else if (c->srcFormat == PIX_FMT_GRAY8 || c->srcFormat == PIX_FMT_GRAY8A) {
                 r = g = b = i;
             } else {
                 assert(c->srcFormat == PIX_FMT_BGR4_BYTE);
@@ -965,37 +1063,51 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
             y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
             u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
             v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-            c->pal_yuv[i] = y + (u << 8) + (v << 16);
+            c->pal_yuv[i]= y + (u<<8) + (v<<16) + (a<<24);
 
             switch (c->dstFormat) {
             case PIX_FMT_BGR32:
 #if !HAVE_BIGENDIAN
             case PIX_FMT_RGB24:
 #endif
-                c->pal_rgb[i] =  r + (g << 8) + (b << 16);
+                c->pal_rgb[i]=  r + (g<<8) + (b<<16) + (a<<24);
                 break;
             case PIX_FMT_BGR32_1:
 #if HAVE_BIGENDIAN
             case PIX_FMT_BGR24:
 #endif
-                c->pal_rgb[i] = (r + (g << 8) + (b << 16)) << 8;
+                c->pal_rgb[i]= a + (r<<8) + (g<<16) + (b<<24);
                 break;
             case PIX_FMT_RGB32_1:
 #if HAVE_BIGENDIAN
             case PIX_FMT_RGB24:
 #endif
-                c->pal_rgb[i] = (b + (g << 8) + (r << 16)) << 8;
+                c->pal_rgb[i]= a + (b<<8) + (g<<16) + (r<<24);
                 break;
             case PIX_FMT_RGB32:
 #if !HAVE_BIGENDIAN
             case PIX_FMT_BGR24:
 #endif
             default:
-                c->pal_rgb[i] =  b + (g << 8) + (r << 16);
+                c->pal_rgb[i]=  b + (g<<8) + (r<<16) + (a<<24);
             }
         }
     }
 
+    if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
+        uint8_t *base;
+        int x,y;
+        rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
+        base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
+        for (y=0; y<srcSliceH; y++){
+            memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW);
+            for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) {
+                base[ srcStride[0]*y + x] = 0xFF;
+            }
+        }
+        src2[0] = base;
+    }
+
     // copy strides, so they can safely be modified
     if (c->sliceDir == 1) {
         // slices go from top to bottom
@@ -1005,13 +1117,13 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
                               dstStride[3] };
 
         reset_ptr(src2, c->srcFormat);
-        reset_ptr((const uint8_t **) dst2, c->dstFormat);
+        reset_ptr((void*)dst2, c->dstFormat);
 
         /* reset slice direction at end of frame */
         if (srcSliceY + srcSliceH == c->srcH)
             c->sliceDir = 0;
 
-        return c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
+        ret = c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
                           dstStride2);
     } else {
         // slices go from bottom to top => we flip the image internally
@@ -1031,15 +1143,18 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
         dst2[3] += ( c->dstH                         - 1) * dstStride[3];
 
         reset_ptr(src2, c->srcFormat);
-        reset_ptr((const uint8_t **) dst2, c->dstFormat);
+        reset_ptr((void*)dst2, c->dstFormat);
 
         /* reset slice direction at end of frame */
         if (!srcSliceY)
             c->sliceDir = 0;
 
-        return c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
+        ret = c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
                           srcSliceH, dst2, dstStride2);
     }
+
+    av_free(rgb0_tmp);
+    return ret;
 }
 
 /* Convert the palette to the same packed 32-bit format as the palette */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 2fe9c5b..70b724e 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1,24 +1,25 @@
 /*
  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define _SVID_SOURCE //needed for MAP_ANONYMOUS
+#define _DARWIN_C_SOURCE // needed for MAP_ANON
 #include <inttypes.h>
 #include <string.h>
 #include <math.h>
@@ -46,21 +47,23 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
 
 unsigned swscale_version(void)
 {
+    av_assert0(LIBSWSCALE_VERSION_MICRO >= 100);
     return LIBSWSCALE_VERSION_INT;
 }
 
 const char *swscale_configuration(void)
 {
-    return LIBAV_CONFIGURATION;
+    return FFMPEG_CONFIGURATION;
 }
 
 const char *swscale_license(void)
 {
 #define LICENSE_PREFIX "libswscale license: "
-    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 
 #define RET 0xC3 //near return opcode for x86
@@ -99,13 +102,20 @@ static const FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_RGBA]        = { 1 , 1 },
     [PIX_FMT_ABGR]        = { 1 , 1 },
     [PIX_FMT_BGRA]        = { 1 , 1 },
+    [PIX_FMT_0RGB]        = { 1 , 1 },
+    [PIX_FMT_RGB0]        = { 1 , 1 },
+    [PIX_FMT_0BGR]        = { 1 , 1 },
+    [PIX_FMT_BGR0]        = { 1 , 1 },
     [PIX_FMT_GRAY16BE]    = { 1 , 1 },
     [PIX_FMT_GRAY16LE]    = { 1 , 1 },
     [PIX_FMT_YUV440P]     = { 1 , 1 },
     [PIX_FMT_YUVJ440P]    = { 1 , 1 },
     [PIX_FMT_YUVA420P]    = { 1 , 1 },
+    [PIX_FMT_YUVA444P]    = { 1 , 1 },
     [PIX_FMT_RGB48BE]     = { 1 , 1 },
     [PIX_FMT_RGB48LE]     = { 1 , 1 },
+    [PIX_FMT_RGBA64BE]    = { 0 , 0 },
+    [PIX_FMT_RGBA64LE]    = { 0 , 0 },
     [PIX_FMT_RGB565BE]    = { 1 , 1 },
     [PIX_FMT_RGB565LE]    = { 1 , 1 },
     [PIX_FMT_RGB555BE]    = { 1 , 1 },
@@ -127,6 +137,8 @@ static const FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_Y400A]       = { 1 , 0 },
     [PIX_FMT_BGR48BE]     = { 1 , 1 },
     [PIX_FMT_BGR48LE]     = { 1 , 1 },
+    [PIX_FMT_BGRA64BE]    = { 0 , 0 },
+    [PIX_FMT_BGRA64LE]    = { 0 , 0 },
     [PIX_FMT_YUV420P9BE]  = { 1 , 1 },
     [PIX_FMT_YUV420P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV420P10BE] = { 1 , 1 },
@@ -139,6 +151,7 @@ static const FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_YUV444P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV444P10BE] = { 1 , 1 },
     [PIX_FMT_YUV444P10LE] = { 1 , 1 },
+    [PIX_FMT_GBR24P]      = { 1 , 0 },
     [PIX_FMT_GBRP]        = { 1 , 0 },
     [PIX_FMT_GBRP9LE]     = { 1 , 0 },
     [PIX_FMT_GBRP9BE]     = { 1 , 0 },
@@ -162,13 +175,12 @@ int sws_isSupportedOutput(enum PixelFormat pix_fmt)
 
 extern const int32_t ff_yuv2rgb_coeffs[8][4];
 
+#if FF_API_SWS_FORMAT_NAME
 const char *sws_format_name(enum PixelFormat format)
 {
-    if ((unsigned)format < PIX_FMT_NB && av_pix_fmt_descriptors[format].name)
-        return av_pix_fmt_descriptors[format].name;
-    else
-        return "Unknown format";
+    return av_get_pix_fmt_name(format);
 }
+#endif
 
 static double getSplineCoeff(double a, double b, double c, double d, double dist)
 {
@@ -182,7 +194,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist
 
 static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
                       int srcW, int dstW, int filterAlign, int one, int flags, int cpu_flags,
-                      SwsVector *srcFilter, SwsVector *dstFilter, double param[2], int is_horizontal)
+                      SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
 {
     int i;
     int filterSize;
@@ -210,7 +222,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
 
     } else if (flags&SWS_POINT) { // lame looking point sampling mode
         int i;
-        int xDstInSrc;
+        int64_t xDstInSrc;
         filterSize= 1;
         FF_ALLOC_OR_GOTO(NULL, filter, dstW*sizeof(*filter)*filterSize, fail);
 
@@ -224,7 +236,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
         }
     } else if ((xInc <= (1<<16) && (flags&SWS_AREA)) || (flags&SWS_FAST_BILINEAR)) { // bilinear upscale
         int i;
-        int xDstInSrc;
+        int64_t xDstInSrc;
         filterSize= 2;
         FF_ALLOC_OR_GOTO(NULL, filter, dstW*sizeof(*filter)*filterSize, fail);
 
@@ -236,7 +248,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
             (*filterPos)[i]= xx;
             //bilinear upscale / linear interpolate / area averaging
             for (j=0; j<filterSize; j++) {
-                int64_t coeff= fone - FFABS((xx<<16) - xDstInSrc)*(fone>>16);
+                int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);
                 if (coeff<0) coeff=0;
                 filter[i*filterSize + j]= coeff;
                 xx++;
@@ -244,7 +256,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
             xDstInSrc+= xInc;
         }
     } else {
-        int xDstInSrc;
+        int64_t xDstInSrc;
         int sizeFactor;
 
         if      (flags&SWS_BICUBIC)      sizeFactor=  4;
@@ -273,7 +285,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
             int j;
             (*filterPos)[i]= xx;
             for (j=0; j<filterSize; j++) {
-                int64_t d= ((int64_t)FFABS((xx<<17) - xDstInSrc))<<13;
+                int64_t d= (FFABS(((int64_t)xx<<17) - xDstInSrc))<<13;
                 double floatd;
                 int64_t coeff;
 
@@ -459,29 +471,27 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
     //FIXME try to align filterPos if possible
 
     //fix borders
-    if (is_horizontal) {
-        for (i = 0; i < dstW; i++) {
-            int j;
-            if ((*filterPos)[i] < 0) {
-                // move filter coefficients left to compensate for filterPos
-                for (j = 1; j < filterSize; j++) {
-                    int left = FFMAX(j + (*filterPos)[i], 0);
-                    filter[i * filterSize + left] += filter[i * filterSize + j];
-                    filter[i * filterSize + j   ]  = 0;
-                }
-                (*filterPos)[i] = 0;
+    for (i=0; i<dstW; i++) {
+        int j;
+        if ((*filterPos)[i] < 0) {
+            // move filter coefficients left to compensate for filterPos
+            for (j=1; j<filterSize; j++) {
+                int left= FFMAX(j + (*filterPos)[i], 0);
+                filter[i*filterSize + left] += filter[i*filterSize + j];
+                filter[i*filterSize + j]=0;
             }
+            (*filterPos)[i]= 0;
+        }
 
-            if ((*filterPos)[i] + filterSize > srcW) {
-                int shift = (*filterPos)[i] + filterSize - srcW;
-                // move filter coefficients right to compensate for filterPos
-                for (j = filterSize - 2; j >= 0; j--) {
-                    int right = FFMIN(j + shift, filterSize - 1);
-                    filter[i * filterSize + right] += filter[i * filterSize + j];
-                    filter[i * filterSize + j    ]  = 0;
-                }
-                (*filterPos)[i] = srcW - filterSize;
+        if ((*filterPos)[i] + filterSize > srcW) {
+            int shift= (*filterPos)[i] + filterSize - srcW;
+            // move filter coefficients right to compensate for filterPos
+            for (j=filterSize-2; j>=0; j--) {
+                int right= FFMIN(j + shift, filterSize-1);
+                filter[i*filterSize +right] += filter[i*filterSize +j];
+                filter[i*filterSize +j]=0;
             }
+            (*filterPos)[i]= srcW - filterSize;
         }
     }
 
@@ -719,7 +729,7 @@ int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
                              int *srcRange, int **table, int *dstRange,
                              int *brightness, int *contrast, int *saturation)
 {
-    if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
+    if (!c || isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
 
     *inv_table = c->srcColorspaceTable;
     *table     = c->dstColorspaceTable;
@@ -743,6 +753,17 @@ static int handle_jpeg(enum PixelFormat *format)
     }
 }
 
+static int handle_0alpha(enum PixelFormat *format)
+{
+    switch (*format) {
+    case PIX_FMT_0BGR    : *format = PIX_FMT_ABGR   ; return 1;
+    case PIX_FMT_BGR0    : *format = PIX_FMT_BGRA   ; return 4;
+    case PIX_FMT_0RGB    : *format = PIX_FMT_ARGB   ; return 1;
+    case PIX_FMT_RGB0    : *format = PIX_FMT_RGBA   ; return 4;
+    default:                                          return 0;
+    }
+}
+
 SwsContext *sws_alloc_context(void)
 {
     SwsContext *c= av_mallocz(sizeof(SwsContext));
@@ -755,7 +776,7 @@ SwsContext *sws_alloc_context(void)
 
 int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 {
-    int i;
+    int i, j;
     int usesVFilter, usesHFilter;
     int unscaled;
     SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
@@ -763,7 +784,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     int srcH= c->srcH;
     int dstW= c->dstW;
     int dstH= c->dstH;
-    int dst_stride = FFALIGN(dstW * sizeof(int16_t) + 16, 16), dst_stride_px = dst_stride >> 1;
+    int dst_stride = FFALIGN(dstW * sizeof(int16_t)+66, 16);
     int flags, cpu_flags;
     enum PixelFormat srcFormat= c->srcFormat;
     enum PixelFormat dstFormat= c->dstFormat;
@@ -775,12 +796,23 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     unscaled = (srcW == dstW && srcH == dstH);
 
+    handle_jpeg(&srcFormat);
+    handle_jpeg(&dstFormat);
+    handle_0alpha(&srcFormat);
+    handle_0alpha(&dstFormat);
+
+    if(srcFormat!=c->srcFormat || dstFormat!=c->dstFormat){
+        av_log(c, AV_LOG_WARNING, "deprecated pixel format used, make sure you did set range correctly\n");
+        c->srcFormat= srcFormat;
+        c->dstFormat= dstFormat;
+    }
+
     if (!sws_isSupportedInput(srcFormat)) {
-        av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n", sws_format_name(srcFormat));
+        av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n", av_get_pix_fmt_name(srcFormat));
         return AVERROR(EINVAL);
     }
     if (!sws_isSupportedOutput(dstFormat)) {
-        av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n", sws_format_name(dstFormat));
+        av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n", av_get_pix_fmt_name(dstFormat));
         return AVERROR(EINVAL);
     }
 
@@ -796,7 +828,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                 |SWS_SPLINE
                 |SWS_BICUBLIN);
     if(!i || (i & (i-1))) {
-        av_log(c, AV_LOG_ERROR, "Exactly one scaler algorithm must be chosen\n");
+        av_log(c, AV_LOG_ERROR, "Exactly one scaler algorithm must be chosen, got %X\n", i);
         return AVERROR(EINVAL);
     }
     /* sanity check */
@@ -828,21 +860,14 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
 
     // reuse chroma for 2 pixels RGB/BGR unless user wants full chroma interpolation
-    if (flags & SWS_FULL_CHR_H_INT &&
-        isAnyRGB(dstFormat)       &&
-        dstFormat != PIX_FMT_RGBA &&
-        dstFormat != PIX_FMT_ARGB &&
-        dstFormat != PIX_FMT_BGRA &&
-        dstFormat != PIX_FMT_ABGR &&
-        dstFormat != PIX_FMT_RGB24 &&
-        dstFormat != PIX_FMT_BGR24) {
-        av_log(c, AV_LOG_ERROR,
-               "full chroma interpolation for destination format '%s' not yet implemented\n",
-               sws_format_name(dstFormat));
-        flags &= ~SWS_FULL_CHR_H_INT;
-        c->flags = flags;
+    if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) {
+        if (dstW&1) {
+            av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to odd output size\n");
+            flags |= SWS_FULL_CHR_H_INT;
+            c->flags = flags;
+        } else
+            c->chrDstHSubSample = 1;
     }
-    if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
 
     // drop some chroma lines if the user wants it
     c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT;
@@ -869,7 +894,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         if (c->swScale) {
             if (flags&SWS_PRINT_INFO)
                 av_log(c, AV_LOG_INFO, "using unscaled %s -> %s special converter\n",
-                       sws_format_name(srcFormat), sws_format_name(dstFormat));
+                       av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
             return 0;
         }
     }
@@ -880,18 +905,18 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     c->dstBpc = 1 + av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1;
     if (c->dstBpc < 8)
         c->dstBpc = 8;
+    if (isAnyRGB(srcFormat) || srcFormat == PIX_FMT_PAL8)
+        c->srcBpc = 16;
     if (c->dstBpc == 16)
         dst_stride <<= 1;
-    FF_ALLOC_OR_GOTO(c, c->formatConvBuffer,
-                     (FFALIGN(srcW, 16) * 2 * FFALIGN(c->srcBpc, 8) >> 3) + 16,
-                     fail);
+    FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
     if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 && c->srcBpc == 8 && c->dstBpc <= 10) {
         c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
         if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) {
             if (flags&SWS_PRINT_INFO)
                 av_log(c, AV_LOG_INFO, "output width is not a multiple of 32 -> no MMX2 scaler\n");
         }
-        if (usesHFilter) c->canMMX2BeUsed=0;
+        if (usesHFilter || isNBPS(c->srcFormat) || is16BPS(c->srcFormat) || isAnyRGB(c->srcFormat)) c->canMMX2BeUsed=0;
     }
     else
         c->canMMX2BeUsed=0;
@@ -911,7 +936,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             c->chrXInc+= 20;
         }
         //we don't use the x86 asm scaler if MMX is available
-        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
+        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX && c->dstBpc <= 10) {
             c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
             c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
         }
@@ -936,15 +961,22 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             c->chrMmx2FilterCode = av_malloc(c->chrMmx2FilterCodeSize);
 #endif
 
+#ifdef MAP_ANONYMOUS
+            if (c->lumMmx2FilterCode == MAP_FAILED || c->chrMmx2FilterCode == MAP_FAILED)
+#else
             if (!c->lumMmx2FilterCode || !c->chrMmx2FilterCode)
+#endif
+            {
+                av_log(c, AV_LOG_ERROR, "Failed to allocate MMX2FilterCode\n");
                 return AVERROR(ENOMEM);
+            }
             FF_ALLOCZ_OR_GOTO(c, c->hLumFilter   , (dstW        /8+8)*sizeof(int16_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hChrFilter   , (c->chrDstW  /4+8)*sizeof(int16_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW      /2/8+8)*sizeof(int32_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW/2/4+8)*sizeof(int32_t), fail);
 
-            initMMX2HScaler(      dstW, c->lumXInc, c->lumMmx2FilterCode, c->hLumFilter, c->hLumFilterPos, 8);
-            initMMX2HScaler(c->chrDstW, c->chrXInc, c->chrMmx2FilterCode, c->hChrFilter, c->hChrFilterPos, 4);
+            initMMX2HScaler(      dstW, c->lumXInc, c->lumMmx2FilterCode, c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8);
+            initMMX2HScaler(c->chrDstW, c->chrXInc, c->chrMmx2FilterCode, c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4);
 
 #ifdef MAP_ANONYMOUS
             mprotect(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
@@ -961,12 +993,12 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             if (initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
                            srcW      ,       dstW, filterAlign, 1<<14,
                            (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
-                           srcFilter->lumH, dstFilter->lumH, c->param, 1) < 0)
+                           srcFilter->lumH, dstFilter->lumH, c->param) < 0)
                 goto fail;
             if (initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
                            c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
                            (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
-                           srcFilter->chrH, dstFilter->chrH, c->param, 1) < 0)
+                           srcFilter->chrH, dstFilter->chrH, c->param) < 0)
                 goto fail;
         }
     } // initialize horizontal stuff
@@ -981,12 +1013,12 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         if (initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
                        srcH      ,        dstH, filterAlign, (1<<12),
                        (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
-                       srcFilter->lumV, dstFilter->lumV, c->param, 0) < 0)
+                       srcFilter->lumV, dstFilter->lumV, c->param) < 0)
             goto fail;
         if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
                        c->chrSrcH, c->chrDstH, filterAlign, (1<<12),
                        (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
-                       srcFilter->chrV, dstFilter->chrV, c->param, 0) < 0)
+                       srcFilter->chrV, dstFilter->chrV, c->param) < 0)
             goto fail;
 
 #if HAVE_ALTIVEC
@@ -1013,7 +1045,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     c->vLumBufSize= c->vLumFilterSize;
     c->vChrBufSize= c->vChrFilterSize;
     for (i=0; i<dstH; i++) {
-        int chrI= i*c->chrDstH / dstH;
+        int chrI= (int64_t)i*c->chrDstH / dstH;
         int nextSlice= FFMAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
                            ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
 
@@ -1027,20 +1059,20 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     // allocate pixbufs (we use dynamic allocation because otherwise we would need to
     // allocate several megabytes to handle all possible cases)
-    FF_ALLOC_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize*3*sizeof(int16_t*), fail);
-    FF_ALLOC_OR_GOTO(c, c->chrUPixBuf, c->vChrBufSize*3*sizeof(int16_t*), fail);
-    FF_ALLOC_OR_GOTO(c, c->chrVPixBuf, c->vChrBufSize*3*sizeof(int16_t*), fail);
+    FF_ALLOC_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail);
+    FF_ALLOC_OR_GOTO(c, c->chrUPixBuf, c->vChrBufSize*2*sizeof(int16_t*), fail);
+    FF_ALLOC_OR_GOTO(c, c->chrVPixBuf, c->vChrBufSize*2*sizeof(int16_t*), fail);
     if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat) && isALPHA(c->dstFormat))
-        FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf, c->vLumBufSize*3*sizeof(int16_t*), fail);
+        FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail);
     //Note we need at least one pixel more at the end because of the MMX code (just in case someone wanna replace the 4000/8000)
     /* align at 16 bytes for AltiVec */
     for (i=0; i<c->vLumBufSize; i++) {
         FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i+c->vLumBufSize], dst_stride+16, fail);
         c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize];
     }
-    // 64 / (c->dstBpc & ~7) is the same as 16 / sizeof(scaling_intermediate)
-    c->uv_off_px   = dst_stride_px + 64 / (c->dstBpc &~ 7);
-    c->uv_off_byte = dst_stride + 16;
+    // 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate)
+    c->uv_off   = (dst_stride>>1) + 64 / (c->dstBpc &~ 7);
+    c->uv_offx2 = dst_stride + 16;
     for (i=0; i<c->vChrBufSize; i++) {
         FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i+c->vChrBufSize], dst_stride*2+32, fail);
         c->chrUPixBuf[i] = c->chrUPixBuf[i+c->vChrBufSize];
@@ -1054,7 +1086,13 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     //try to avoid drawing green stuff between the right end and the stride end
     for (i=0; i<c->vChrBufSize; i++)
-        memset(c->chrUPixBuf[i], 64, dst_stride*2+1);
+        if(av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 == 15){
+            av_assert0(c->dstBpc > 10);
+            for(j=0; j<dst_stride/2+1; j++)
+                ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18;
+        } else
+            for(j=0; j<dst_stride+1; j++)
+                ((int16_t*)(c->chrUPixBuf[i]))[j] = 1<<14;
 
     assert(c->chrDstH <= dstH);
 
@@ -1073,7 +1111,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         else                              av_log(c, AV_LOG_INFO, "ehh flags invalid?! ");
 
         av_log(c, AV_LOG_INFO, "from %s to %s%s ",
-               sws_format_name(srcFormat),
+               av_get_pix_fmt_name(srcFormat),
 #ifdef DITHER1XBPP
                dstFormat == PIX_FMT_BGR555 || dstFormat == PIX_FMT_BGR565 ||
                dstFormat == PIX_FMT_RGB444BE || dstFormat == PIX_FMT_RGB444LE ||
@@ -1081,7 +1119,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 #else
                "",
 #endif
-               sws_format_name(dstFormat));
+               av_get_pix_fmt_name(dstFormat));
 
         if      (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)    av_log(c, AV_LOG_INFO, "using MMX2\n");
         else if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)   av_log(c, AV_LOG_INFO, "using 3DNOW\n");
@@ -1119,6 +1157,8 @@ SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat,
     c->dstH= dstH;
     c->srcRange = handle_jpeg(&srcFormat);
     c->dstRange = handle_jpeg(&dstFormat);
+    c->src0Alpha = handle_0alpha(&srcFormat);
+    c->dst0Alpha = handle_0alpha(&dstFormat);
     c->srcFormat= srcFormat;
     c->dstFormat= dstFormat;
 
@@ -1482,7 +1522,7 @@ void sws_freeContext(SwsContext *c)
 #endif /* HAVE_MMX */
 
     av_freep(&c->yuvTable);
-    av_free(c->formatConvBuffer);
+    av_freep(&c->formatConvBuffer);
 
     av_free(c);
 }
@@ -1517,10 +1557,12 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context,
         context->srcW      = srcW;
         context->srcH      = srcH;
         context->srcRange  = handle_jpeg(&srcFormat);
+        context->src0Alpha = handle_0alpha(&srcFormat);
         context->srcFormat = srcFormat;
         context->dstW      = dstW;
         context->dstH      = dstH;
         context->dstRange  = handle_jpeg(&dstFormat);
+        context->dst0Alpha = handle_0alpha(&dstFormat);
         context->dstFormat = dstFormat;
         context->flags     = flags;
         context->param[0]  = param[0];
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 66d8845..c4174ee 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -36,8 +36,8 @@ SECTION_RODATA
 %define GV 0xD0E3
 %define BV 0xF6E4
 
-rgb_Yrnd:        times 4 dd 0x84000        ;  16.5 << 15
-rgb_UVrnd:       times 4 dd 0x404000       ; 128.5 << 15
+rgb_Yrnd:        times 4 dd 0x80100        ;  16.5 << 15
+rgb_UVrnd:       times 4 dd 0x400100       ; 128.5 << 15
 bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
 bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
 rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
@@ -83,7 +83,7 @@ SECTION .text
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
 %if mmsize == 8
     mova           m5, [%2_Ycoeff_12x4]
     mova           m6, [%2_Ycoeff_3x56]
@@ -115,6 +115,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 %if ARCH_X86_64
     movsxd         wq, wd
 %endif
+    add            wq, wq
     add          dstq, wq
     neg            wq
 %if notcpuflag(ssse3)
@@ -158,12 +159,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
     paddd          m2, m3                 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
     paddd          m0, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
     paddd          m2, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
+    psrad          m0, 9
+    psrad          m2, 9
     packssdw       m0, m2                 ; (word) { Y[0-7] }
-    packuswb       m0, m0                 ; (byte) { Y[0-7] }
-    movh    [dstq+wq], m0
-    add            wq, mmsize / 2
+    mova    [dstq+wq], m0
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
@@ -172,7 +172,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova           m8, [%2_Ucoeff_12x4]
     mova           m9, [%2_Ucoeff_3x56]
@@ -203,10 +203,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
 %endif ; x86-32/64
 %endif ; cpuflag(ssse3)
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
+    add            wq, wq
     add         dstUq, wq
     add         dstVq, wq
     neg            wq
@@ -264,23 +265,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
     paddd          m2, m6                 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
     paddd          m1, m6                 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
     paddd          m4, m6                 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
-    psrad          m1, 15
-    psrad          m4, 15
+    psrad          m0, 9
+    psrad          m2, 9
+    psrad          m1, 9
+    psrad          m4, 9
     packssdw       m0, m1                 ; (word) { U[0-7] }
     packssdw       m2, m4                 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb       m0, m0                 ; (byte) { U[0-3] }
-    packuswb       m2, m2                 ; (byte) { V[0-3] }
-    movh   [dstUq+wq], m0
-    movh   [dstVq+wq], m2
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb       m0, m2                 ; (byte) { U[0-7], V[0-7] }
-    movh   [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add            wq, mmsize / 2
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -306,13 +304,15 @@ RGB24_FUNCS 10, 12
 INIT_XMM ssse3
 RGB24_FUNCS 11, 13
 
+%if HAVE_AVX
 INIT_XMM avx
 RGB24_FUNCS 11, 13
+%endif
 
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
     mova           m5, [rgba_Ycoeff_%2%4]
     mova           m6, [rgba_Ycoeff_%3%5]
 %if %0 == 6
@@ -323,6 +323,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     movsxd         wq, wd
 %endif
     lea          srcq, [srcq+wq*4]
+    add            wq, wq
     add          dstq, wq
     neg            wq
     mova           m4, [rgb_Yrnd]
@@ -330,8 +331,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*4+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
     pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
     pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
@@ -341,12 +342,11 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     paddd          m2, m4                 ; += rgb_Yrnd
     paddd          m0, m1                 ; (dword) { Y[0-3] }
     paddd          m2, m3                 ; (dword) { Y[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
+    psrad          m0, 9
+    psrad          m2, 9
     packssdw       m0, m2                 ; (word) { Y[0-7] }
-    packuswb       m0, m0                 ; (byte) { Y[0-7] }
-    movh    [dstq+wq], m0
-    add            wq, mmsize / 2
+    mova    [dstq+wq], m0
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; %0 == 3
@@ -355,7 +355,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova           m8, [rgba_Ucoeff_%2%4]
     mova           m9, [rgba_Ucoeff_%3%5]
@@ -376,21 +376,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
 %else ; ARCH_X86_64 && %0 == 6
 .body:
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
+    add            wq, wq
     add         dstUq, wq
     add         dstVq, wq
-    lea          srcq, [srcq+wq*4]
+    lea          srcq, [srcq+wq*2]
     neg            wq
     pcmpeqb        m7, m7
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
     mova           m6, [rgb_UVrnd]
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*4+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
     pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
     pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
@@ -406,25 +407,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
     pmaddwd        m4, coeffU2            ; (dword) { Gx*GU }[4-7]
     paddd          m3, m6                 ; += rgb_UVrnd
     paddd          m5, m6                 ; += rgb_UVrnd
-    psrad          m0, 15
+    psrad          m0, 9
     paddd          m1, m3                 ; (dword) { V[4-7] }
     paddd          m4, m5                 ; (dword) { U[4-7] }
-    psrad          m2, 15
-    psrad          m4, 15
-    psrad          m1, 15
+    psrad          m2, 9
+    psrad          m4, 9
+    psrad          m1, 9
     packssdw       m0, m4                 ; (word) { U[0-7] }
     packssdw       m2, m1                 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb       m0, m0                 ; (byte) { U[0-7] }
-    packuswb       m2, m2                 ; (byte) { V[0-7] }
-    movh   [dstUq+wq], m0
-    movh   [dstVq+wq], m2
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb       m0, m2                 ; (byte) { U[0-7], V[0-7] }
-    movh   [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add            wq, mmsize / 2
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -452,8 +450,10 @@ RGB32_FUNCS 0, 0
 INIT_XMM sse2
 RGB32_FUNCS 8, 12
 
+%if HAVE_AVX
 INIT_XMM avx
 RGB32_FUNCS 8, 12
+%endif
 
 ;-----------------------------------------------------------------------------
 ; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
@@ -490,7 +490,7 @@ RGB32_FUNCS 8, 12
 ;      will be the same (i.e. YUYV+AVX), and thus we don't need to
 ;      split the loop in an aligned and unaligned case
 %macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 3, 3, %1, dst, src, w
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
 %if ARCH_X86_64
     movsxd         wq, wd
 %endif
@@ -560,11 +560,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
 ;      will be the same (i.e. UYVY+AVX), and thus we don't need to
 ;      split the loop in an aligned and unaligned case
 %macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
     add         dstUq, wq
     add         dstVq, wq
@@ -594,8 +594,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 .loop_%1:
     mov%1          m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... }
     mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
-    pand           m2, m0, m4             ; (word) { U0, U1, ..., U7 }
-    pand           m3, m1, m4             ; (word) { U8, U9, ..., U15 }
+    pand           m2, m0, m5             ; (word) { U0, U1, ..., U7 }
+    pand           m3, m1, m5             ; (word) { U8, U9, ..., U15 }
     psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
     psrlw          m1, 8                  ; (word) { V8, V9, ..., V15 }
     packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
@@ -615,11 +615,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 ; %1 = nr. of XMM registers
 ; %2 = nv12 or nv21
 %macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
     add         dstUq, wq
     add         dstVq, wq
@@ -627,8 +627,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
     test         srcq, 15
 %endif
     lea          srcq, [srcq+wq*2]
-    pcmpeqb        m4, m4                 ; (byte) { 0xff } x 16
-    psrlw          m4, 8                  ; (word) { 0x00ff } x 8
+    pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
+    psrlw          m5, 8                  ; (word) { 0x00ff } x 8
 %if mmsize == 16
     jnz .loop_u_start
     neg            wq
@@ -660,6 +660,7 @@ YUYV_TO_UV_FN 3, uyvy
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
 
+%if HAVE_AVX
 INIT_XMM avx
 ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
 ; that's not faster in practice
@@ -667,3 +668,4 @@ YUYV_TO_UV_FN 3, yuyv
 YUYV_TO_UV_FN 3, uyvy, 1
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
+%endif
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index c4daa82..11e89a4 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -264,10 +264,12 @@ yuv2planeX_fn  9,  7, 5
 yuv2planeX_fn 10,  7, 5
 yuv2planeX_fn 16,  8, 5
 
+%if HAVE_AVX
 INIT_XMM avx
 yuv2planeX_fn  8, 10, 7
 yuv2planeX_fn  9,  7, 5
 yuv2planeX_fn 10,  7, 5
+%endif
 
 ; %1=outout-bpc, %2=alignment (u/a)
 %macro yuv2plane1_mainloop 2
@@ -402,8 +404,10 @@ yuv2plane1_fn 16, 6, 3
 INIT_XMM sse4
 yuv2plane1_fn 16, 5, 3
 
+%if HAVE_AVX
 INIT_XMM avx
 yuv2plane1_fn  8, 5, 5
 yuv2plane1_fn  9, 5, 3
 yuv2plane1_fn 10, 5, 3
 yuv2plane1_fn 16, 5, 3
+%endif
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 282618c..9359f0b 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -6,20 +6,20 @@
  * Written by Nick Kurshev.
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,6 +68,9 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
 DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
 DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
 DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;
 
 #define RGB2YUV_SHIFT 8
 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index c255610..3bca43c 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -7,20 +7,20 @@
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  * lot of big-endian byte order fixes by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -801,27 +801,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s
     }
 }
 
-/*
-  I use less accurate approximation here by simply left-shifting the input
-  value and filling the low order bits with zeroes. This method improves PNG
-  compression but this scheme cannot reproduce white exactly, since it does
-  not generate an all-ones maximum value; the net effect is to darken the
-  image slightly.
-
-  The better method should be "left bit replication":
-
-   4 3 2 1 0
-   ---------
-   1 1 0 1 1
-
-   7 6 5 4 3  2 1 0
-   ----------------
-   1 1 0 1 1  1 1 0
-   |=======|  |===|
-       |      leftmost bits repeated to fill open bits
-       |
-   original bits
-*/
 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
 {
     const uint16_t *end;
@@ -840,9 +819,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        %6, %%mm0    \n\t"
+            "pmulhw        %6, %%mm1    \n\t"
+            "pmulhw        %7, %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -870,9 +850,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        %6, %%mm0    \n\t"
+            "pmulhw        %6, %%mm1    \n\t"
+            "pmulhw        %7, %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -892,7 +873,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "por        %%mm5, %%mm3    \n\t"
 
             :"=m"(*d)
-            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
+            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mmx_null),"m"(mul15_mid),"m"(mul15_hi)
             :"memory");
         /* borrowed 32 to 24 */
         __asm__ volatile(
@@ -919,9 +900,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
     }
 }
 
@@ -943,9 +924,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        %6, %%mm0    \n\t"
+            "pmulhw        %8, %%mm1    \n\t"
+            "pmulhw        %7, %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -973,9 +956,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        %6, %%mm0    \n\t"
+            "pmulhw        %8, %%mm1    \n\t"
+            "pmulhw        %7, %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -994,7 +979,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "por        %%mm4, %%mm3    \n\t"
             "por        %%mm5, %%mm3    \n\t"
             :"=m"(*d)
-            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
+            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null),"m"(mul15_mid),"m"(mul15_hi),"m"(mul16_mid)
             :"memory");
         /* borrowed 32 to 24 */
         __asm__ volatile(
@@ -1021,9 +1006,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
     }
 }
 
@@ -1066,12 +1051,13 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm1    \n\t"
+            "pmulhw        %6, %%mm2    \n\t"
             PACK_RGB32
             :"=m"(*d)
-            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
+            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid),"m"(mul15_hi)
             :"memory");
         d += 16;
         s += 4;
@@ -1081,9 +1067,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
         *d++ = 255;
     }
 }
@@ -1108,12 +1094,14 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        %7, %%mm1    \n\t"
+            "pmulhw        %6, %%mm2    \n\t"
             PACK_RGB32
             :"=m"(*d)
-            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
+            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid),"m"(mul15_hi),"m"(mul16_mid)
             :"memory");
         d += 16;
         s += 4;
@@ -1123,9 +1111,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
         *d++ = 255;
     }
 }
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 0d367f7..3c8b6fa 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -411,11 +411,13 @@ SCALE_FUNC %1, %2, X, X8, %3, 7, %4
 SCALE_FUNCS  8, 15, %1, %2
 SCALE_FUNCS  9, 15, %1, %3
 SCALE_FUNCS 10, 15, %1, %3
+SCALE_FUNCS 14, 15, %1, %3
 SCALE_FUNCS 16, 15, %1, %4
 %endif ; !sse4
 SCALE_FUNCS  8, 19, %1, %2
 SCALE_FUNCS  9, 19, %1, %3
 SCALE_FUNCS 10, 19, %1, %3
+SCALE_FUNCS 14, 19, %1, %3
 SCALE_FUNCS 16, 19, %1, %4
 %endmacro
 
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index c112cb8..10a1542 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,6 +66,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
 
+
 //MMX versions
 #if HAVE_MMX
 #undef RENAME
@@ -90,6 +91,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
     const int flags= c->flags;
     int16_t **lumPixBuf= c->lumPixBuf;
     int16_t **chrUPixBuf= c->chrUPixBuf;
+    int16_t **alpPixBuf= c->alpPixBuf;
     const int vLumBufSize= c->vLumBufSize;
     const int vChrBufSize= c->vChrBufSize;
     int16_t *vLumFilterPos= c->vLumFilterPos;
@@ -98,6 +100,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
     int16_t *vChrFilter= c->vChrFilter;
     int32_t *lumMmxFilter= c->lumMmxFilter;
     int32_t *chrMmxFilter= c->chrMmxFilter;
+    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
     const int vLumFilterSize= c->vLumFilterSize;
     const int vChrFilterSize= c->vChrFilterSize;
     const int chrDstY= dstY>>c->chrDstVSubSample;
@@ -111,8 +114,9 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
         c->greenDither= ff_dither4[dstY&1];
     c->redDither= ff_dither8[(dstY+1)&1];
     if (dstY < dstH - 2) {
-        const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-        const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+        const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+        const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
         int i;
         if (flags & SWS_ACCURATE_RND) {
             int s= APCK_SIZE / 8;
@@ -122,6 +126,12 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
                 lumMmxFilter[s*i+APCK_COEF/4  ]=
                 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
+                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                    *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
+                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
+                    alpMmxFilter[s*i+APCK_COEF/4  ]=
+                    alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
+                }
             }
             for (i=0; i<vChrFilterSize; i+=2) {
                 *(const void**)&chrMmxFilter[s*i              ]= chrUSrcPtr[i  ];
@@ -136,6 +146,11 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
                 lumMmxFilter[4*i+2]=
                 lumMmxFilter[4*i+3]=
                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
+                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                    *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
+                    alpMmxFilter[4*i+2]=
+                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
+                }
             }
             for (i=0; i<vChrFilterSize; i++) {
                 *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
@@ -147,6 +162,67 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
     }
 }
 
+#if HAVE_MMX2
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    if(((int)dest) & 15){
+        return yuv2yuvX_MMX2(filter, filterSize, src, dest, dstW, dither, offset);
+    }
+    if (offset) {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         "movdqa    %%xmm3, %%xmm4\n\t"
+                         "psrlq       $24, %%xmm3\n\t"
+                         "psllq       $40, %%xmm4\n\t"
+                         "por       %%xmm4, %%xmm3\n\t"
+                         :: "r"(dither)
+                         );
+    } else {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         :: "r"(dither)
+                         );
+    }
+    __asm__ volatile(
+        "pxor      %%xmm0, %%xmm0\n\t"
+        "punpcklbw %%xmm0, %%xmm3\n\t"
+        "psraw        $4, %%xmm3\n\t"
+        "movdqa    %%xmm3, %%xmm4\n\t"
+        "movdqa    %%xmm3, %%xmm7\n\t"
+        "movl %3, %%ecx\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movddup                  8(%%"REG_d"), %%xmm0      \n\t" /* filterCoeff */\
+        "movdqa              (%%"REG_S", %%"REG_c", 2), %%xmm2      \n\t" /* srcData */\
+        "movdqa            16(%%"REG_S", %%"REG_c", 2), %%xmm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "test                         %%"REG_S", %%"REG_S"  \n\t"\
+        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
+        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
+        "paddw                            %%xmm2, %%xmm3      \n\t"\
+        "paddw                            %%xmm5, %%xmm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%xmm3      \n\t"\
+        "psraw                               $3, %%xmm4      \n\t"\
+        "packuswb                         %%xmm4, %%xmm3      \n\t"
+        "movntdq                          %%xmm3, (%1, %%"REG_c")\n\t"
+        "add                         $16, %%"REG_c"         \n\t"\
+        "cmp                          %2, %%"REG_c"         \n\t"\
+        "movdqa    %%xmm7, %%xmm3\n\t"
+        "movdqa    %%xmm7, %%xmm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t"\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c
+    );
+}
+#endif
+
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
 extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
                                                 SwsContext *c, int16_t *data, \
@@ -158,10 +234,12 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt(
     SCALE_FUNC(filter_n,  8, 15, opt); \
     SCALE_FUNC(filter_n,  9, 15, opt); \
     SCALE_FUNC(filter_n, 10, 15, opt); \
+    SCALE_FUNC(filter_n, 14, 15, opt); \
     SCALE_FUNC(filter_n, 16, 15, opt); \
     SCALE_FUNC(filter_n,  8, 19, opt); \
     SCALE_FUNC(filter_n,  9, 19, opt); \
     SCALE_FUNC(filter_n, 10, 19, opt); \
+    SCALE_FUNC(filter_n, 14, 19, opt); \
     SCALE_FUNC(filter_n, 16, 19, opt)
 
 #define SCALE_FUNCS_MMX(opt) \
@@ -253,6 +331,10 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
 #if HAVE_MMX2
     if (cpu_flags & AV_CPU_FLAG_MMX2)
         sws_init_swScale_MMX2(c);
+    if (cpu_flags & AV_CPU_FLAG_SSE3){
+        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+            c->yuv2planeX = yuv2yuvX_sse3;
+    }
 #endif
 
 #if HAVE_YASM
@@ -266,7 +348,10 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
     } else if (c->srcBpc == 10) { \
         hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale10to19_ ## filtersize ## _ ## opt1; \
-    } else /* c->srcBpc == 16 */ { \
+    } else if (c->srcBpc == 14 || ((c->srcFormat==PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)) { \
+        hscalefn = c->dstBpc <= 10 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+    } else { /* c->srcBpc == 16 */ \
         hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale16to19_ ## filtersize ## _ ## opt1; \
     } \
@@ -282,7 +367,7 @@ switch(c->dstBpc){ \
     case 16:                          do_16_case;                          break; \
     case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
     case 9:  if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_  ## opt; break; \
-    default:                          vscalefn = ff_yuv2planeX_8_  ## opt; break; \
+    default:                          /*vscalefn = ff_yuv2planeX_8_  ## opt;*/ break; \
     }
 #define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
     switch(c->dstBpc){ \
@@ -401,7 +486,7 @@ switch(c->dstBpc){ \
             c->yuv2plane1 = ff_yuv2plane1_16_sse4;
     }
 
-    if (cpu_flags & AV_CPU_FLAG_AVX) {
+    if (HAVE_AVX && cpu_flags & AV_CPU_FLAG_AVX) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx,);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
 
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 6fc03e4..b179184 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,25 +1,26 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #undef REAL_MOVNTQ
 #undef MOVNTQ
+#undef MOVNTQ2
 #undef PREFETCH
 
 #if COMPILE_TEMPLATE_MMX2
@@ -30,11 +31,84 @@
 
 #if COMPILE_TEMPLATE_MMX2
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
 #else
 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
 #endif
 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
 
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot)
+{
+    if (rot) {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "psrlq       $24, %%mm3\n\t"
+                         "psllq       $40, %%mm4\n\t"
+                         "por       %%mm4, %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         :: "r"(srcDither)
+                         );
+    } else {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         :: "r"(srcDither)
+                         );
+    }
+}
+#endif
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    dither_8to16(dither, offset);
+    __asm__ volatile(\
+        "psraw        $4, %%mm3\n\t"
+        "psraw        $4, %%mm4\n\t"
+        "movq    %%mm3, %%mm6\n\t"
+        "movq    %%mm4, %%mm7\n\t"
+        "movl %3, %%ecx\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
+        "movq                (%%"REG_S", %%"REG_c", 2), %%mm2      \n\t" /* srcData */\
+        "movq               8(%%"REG_S", %%"REG_c", 2), %%mm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "test                         %%"REG_S", %%"REG_S"  \n\t"\
+        "pmulhw                           %%mm0, %%mm2      \n\t"\
+        "pmulhw                           %%mm0, %%mm5      \n\t"\
+        "paddw                            %%mm2, %%mm3      \n\t"\
+        "paddw                            %%mm5, %%mm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%mm3      \n\t"\
+        "psraw                               $3, %%mm4      \n\t"\
+        "packuswb                         %%mm4, %%mm3      \n\t"
+        MOVNTQ2 "                         %%mm3, (%1, %%"REG_c")\n\t"
+        "add                          $8, %%"REG_c"         \n\t"\
+        "cmp                          %2, %%"REG_c"         \n\t"\
+        "movq    %%mm6, %%mm3\n\t"
+        "movq    %%mm7, %%mm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t"\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c
+    );
+}
+
 #define YSCALEYUV2PACKEDX_UV \
     __asm__ volatile(\
         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
@@ -260,7 +334,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX_ACCURATE
@@ -293,7 +367,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX
@@ -350,7 +424,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -374,7 +448,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -427,7 +501,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -451,7 +525,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -584,7 +658,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -608,7 +682,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -649,7 +723,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -670,7 +744,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -786,8 +860,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
             : "%r8"
         );
 #else
-        *(const uint16_t **)(&c->u_temp)=abuf0;
-        *(const uint16_t **)(&c->v_temp)=abuf1;
+        c->u_temp=(intptr_t)abuf0;
+        c->v_temp=(intptr_t)abuf1;
         __asm__ volatile(
             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
             "mov        %4, %%"REG_b"               \n\t"
@@ -1559,9 +1633,9 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 {
     enum PixelFormat dstFormat = c->dstFormat;
 
-    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
-        dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
-        if (!(c->flags & SWS_BITEXACT)) {
+    c->use_mmx_vfilter= 0;
+    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
+        && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
             if (c->flags & SWS_ACCURATE_RND) {
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
@@ -1574,6 +1648,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             } else {
+                c->use_mmx_vfilter= 1;
+                c->yuv2planeX = RENAME(yuv2yuvX    );
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
                     case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
@@ -1585,7 +1661,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             }
-        }
         if (!(c->flags & SWS_FULL_CHR_H_INT)) {
             switch (c->dstFormat) {
             case PIX_FMT_RGB32:
diff --git a/libswscale/x86/yuv2rgb_mmx.c b/libswscale/x86/yuv2rgb_mmx.c
index 0eaea77..fd184eb 100644
--- a/libswscale/x86/yuv2rgb_mmx.c
+++ b/libswscale/x86/yuv2rgb_mmx.c
@@ -7,20 +7,20 @@
  * 1,4,8bpp support and context / deglobalize stuff
  * by Michael Niedermayer (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,10 +68,6 @@ SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (c->srcFormat != PIX_FMT_YUV420P &&
-        c->srcFormat != PIX_FMT_YUVA420P)
-        return NULL;
-
 #if HAVE_MMX2
     if (cpu_flags & AV_CPU_FLAG_MMX2) {
         switch (c->dstFormat) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index 5d1fa5b..624de14 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2001-2007 Michael Niedermayer
  *           (c) 2010 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,17 +43,14 @@
     if (h_size * depth > FFABS(dstStride[0]))                        \
         h_size -= 8;                                                 \
                                                                      \
-    if (c->srcFormat == PIX_FMT_YUV422P) {                           \
-        srcStride[1] *= 2;                                           \
-        srcStride[2] *= 2;                                           \
-    }                                                                \
+    vshift = c->srcFormat != PIX_FMT_YUV422P;                        \
                                                                      \
     __asm__ volatile ("pxor %mm4, %mm4\n\t");                        \
     for (y = 0; y < srcSliceH; y++) {                                \
         uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
         const uint8_t *py = src[0] +               y * srcStride[0]; \
-        const uint8_t *pu = src[1] +        (y >> 1) * srcStride[1]; \
-        const uint8_t *pv = src[2] +        (y >> 1) * srcStride[2]; \
+        const uint8_t *pu = src[1] +   (y >> vshift) * srcStride[1]; \
+        const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
         x86_reg index = -h_size / 2;                                 \
 
 #define YUV2RGB_INITIAL_LOAD          \
@@ -141,6 +138,7 @@
         : "+r" (index), "+r" (image)                              \
         : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
           "r" (py - 2*index)                                      \
+        : "memory"                                                \
         );                                                        \
     }                                                             \
 
@@ -148,6 +146,7 @@
         : "+r" (index), "+r" (image)                              \
         : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
           "r" (py - 2*index), "r" (pa - 2*index)                  \
+        : "memory"                                                \
         );                                                        \
     }                                                             \
 
@@ -188,7 +187,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(2)
 
@@ -216,7 +215,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(2)
 
@@ -306,7 +305,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(3)
 
@@ -324,7 +323,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(3)
 
@@ -368,7 +367,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -389,7 +388,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -411,7 +410,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -432,7 +431,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 24dc960..84e3a17 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -6,20 +6,20 @@
  * 1,4,8bpp support and context / deglobalize stuff
  * by Michael Niedermayer (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
 #include "swscale_internal.h"
 #include "libavutil/cpu.h"
 #include "libavutil/bswap.h"
+#include "libavutil/pixdesc.h"
 
 extern const uint8_t dither_4x4_16[4][8];
 extern const uint8_t dither_8x8_32[8][8];
@@ -61,9 +62,9 @@ const int *sws_getCoefficients(int colorspace)
 #define LOADCHROMA(i)                               \
     U = pu[i];                                      \
     V = pv[i];                                      \
-    r = (void *)c->table_rV[V];                     \
-    g = (void *)(c->table_gU[U] + c->table_gV[V]);  \
-    b = (void *)c->table_bU[U];
+    r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                     \
+    g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);  \
+    b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB(dst,src,i)            \
     Y = src[2*i];                    \
@@ -478,7 +479,7 @@ CLOSEYUV2RGBFUNC(8)
 YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
         const uint8_t *d128 = dither_8x8_220[y&7];
         char out_1 = 0, out_2 = 0;
-        g= c->table_gU[128] + c->table_gV[128];
+        g= c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB1(out,src,i,o)    \
     Y = src[2*i];               \
@@ -519,7 +520,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
     if (t)
         return t;
 
-    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found from %s to %s.\n", sws_format_name(c->srcFormat), sws_format_name(c->dstFormat));
+    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found from %s to %s.\n",
+           av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
 
     switch (c->dstFormat) {
     case PIX_FMT_BGR48BE:
@@ -527,9 +529,9 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
     case PIX_FMT_RGB48BE:
     case PIX_FMT_RGB48LE:    return yuv2rgb_c_48;
     case PIX_FMT_ARGB:
-    case PIX_FMT_ABGR:       if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) return yuva2argb_c;
+    case PIX_FMT_ABGR:       if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) return yuva2argb_c;
     case PIX_FMT_RGBA:
-    case PIX_FMT_BGRA:       return (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) ? yuva2rgba_c : yuv2rgb_c_32;
+    case PIX_FMT_BGRA:       return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
     case PIX_FMT_RGB24:      return yuv2rgb_c_24_rgb;
     case PIX_FMT_BGR24:      return yuv2rgb_c_24_bgr;
     case PIX_FMT_RGB565:
@@ -551,29 +553,27 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
     return NULL;
 }
 
-static void fill_table(uint8_t* table[256], const int elemsize, const int inc, void *y_tab)
+static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int inc, void *y_tab)
 {
     int i;
-    int64_t cb = 0;
     uint8_t *y_table = y_tab;
 
     y_table -= elemsize * (inc >> 9);
 
-    for (i = 0; i < 256; i++) {
+    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
         table[i] = y_table + elemsize * (cb >> 16);
-        cb += inc;
     }
 }
 
-static void fill_gv_table(int table[256], const int elemsize, const int inc)
+static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int inc)
 {
     int i;
-    int64_t cb = 0;
     int off = -(inc >> 9);
 
-    for (i = 0; i < 256; i++) {
+    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
         table[i] = elemsize * (off + (cb >> 16));
-        cb += inc;
     }
 }
 
@@ -611,7 +611,7 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int
     uint8_t *y_table;
     uint16_t *y_table16;
     uint32_t *y_table32;
-    int i, base, rbase, gbase, bbase, abase, needAlpha;
+    int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
     const int yoffs = fullRange ? 384 : 326;
 
     int64_t crv =  inv_table[0];