14 files changed, 1201 insertions, 69 deletions
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 16b1307..6a252b4 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,9 +1,12 @@
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
+OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
+OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 
 YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
+YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)            += x86/af_volume.o
-YASM-OBJS-$(CONFIG_YADIF_FILTER)             += x86/vf_yadif.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER)             += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/af_volume.asm b/libavfilter/x86/af_volume.asm
index 4e5ad22..f4cbcbc 100644
--- a/libavfilter/x86/af_volume.asm
+++ b/libavfilter/x86/af_volume.asm
@@ -2,20 +2,20 @@
 ;* x86-optimized functions for volume filter
 ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -99,9 +99,11 @@ cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
 INIT_XMM sse2
 %define CVTDQ2PD cvtdq2pd
 SCALE_SAMPLES_S32
+%if HAVE_AVX_EXTERNAL
 %define CVTDQ2PD vcvtdq2pd
 INIT_YMM avx
 SCALE_SAMPLES_S32
+%endif
 %undef CVTDQ2PD
 
 ; NOTE: This is not bit-identical with the C version because it clips to
diff --git a/libavfilter/x86/af_volume_init.c b/libavfilter/x86/af_volume_init.c
index c59e0ed..57c7eab 100644
--- a/libavfilter/x86/af_volume_init.c
+++ b/libavfilter/x86/af_volume_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
index 00fcb16..3581f89 100644
--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -1,20 +1,20 @@
 ;******************************************************************************
 ;* x86-optimized functions for gradfun filter
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavfilter/x86/vf_gradfun_init.c b/libavfilter/x86/vf_gradfun_init.c
index 3f23bf6..3f5b842 100644
--- a/libavfilter/x86/vf_gradfun_init.c
+++ b/libavfilter/x86/vf_gradfun_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,20 +26,19 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/gradfun.h"
 
-void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src,
-                                   uint16_t *dc, int thresh,
+void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t *src,
+                                   const uint16_t *dc, int thresh,
                                    const uint16_t *dithers);
-
-void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src,
-                                  uint16_t *dc, int thresh,
+void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src,
+                                  const uint16_t *dc, int thresh,
                                   const uint16_t *dithers);
 
 void ff_gradfun_blur_line_movdqa_sse2(intptr_t x, uint16_t *buf,
-                                      uint16_t *buf1, uint16_t *dc,
-                                      uint8_t *src1, uint8_t *src2);
+                                      const uint16_t *buf1, uint16_t *dc,
+                                      const uint8_t *src1, const uint8_t *src2);
 void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf,
-                                      uint16_t *buf1, uint16_t *dc,
-                                      uint8_t *src1, uint8_t *src2);
+                                      const uint16_t *buf1, uint16_t *dc,
+                                      const uint8_t *src1, const uint8_t *src2);
 
 #if HAVE_YASM
 static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc,
@@ -58,22 +57,23 @@ static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc,
                                   thresh, dithers);
 }
 
-static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc,
+static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src,
+                                       const uint16_t *dc,
                                        int width, int thresh,
                                        const uint16_t *dithers)
 {
     gradfun_filter_line(dst, src, dc, width, thresh, dithers, 3);
 }
 
-static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc,
+static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
                                       int width, int thresh,
                                       const uint16_t *dithers)
 {
     gradfun_filter_line(dst, src, dc, width, thresh, dithers, 7);
 }
 
-static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
-                                   uint8_t *src, int src_linesize, int width)
+static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1,
+                                   const uint8_t *src, int src_linesize, int width)
 {
     intptr_t x = -2 * width;
     if (((intptr_t) src | src_linesize) & 15)
diff --git a/libavfilter/x86/vf_hqdn3d.asm b/libavfilter/x86/vf_hqdn3d.asm
index 02632a1..961127e 100644
--- a/libavfilter/x86/vf_hqdn3d.asm
+++ b/libavfilter/x86/vf_hqdn3d.asm
@@ -1,20 +1,20 @@
 ;******************************************************************************
 ;* Copyright (c) 2012 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavfilter/x86/vf_hqdn3d_init.c b/libavfilter/x86/vf_hqdn3d_init.c
index 06f9e00..b63916b 100644
--- a/libavfilter/x86/vf_hqdn3d_init.c
+++ b/libavfilter/x86/vf_hqdn3d_init.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2012 Loren Merritt
  *
- * Libav is free software; you can redistribute it and/or modify
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License along
- * with Libav; if not, write to the Free Software Foundation, Inc.,
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
diff --git a/libavfilter/x86/vf_pullup.asm b/libavfilter/x86/vf_pullup.asm
new file mode 100644
index 0000000..3689b04
--- /dev/null
+++ b/libavfilter/x86/vf_pullup.asm
@@ -0,0 +1,178 @@
+;*****************************************************************************
+;* x86-optimized functions for pullup filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+; int ff_pullup_filter_diff_mmx(first, second, int size): sum of |first - second| over 4 rows x 8 bytes
+INIT_MMX mmx
+cglobal pullup_filter_diff, 3, 5, 8, first, second, size
+    movsxdifnidn sizeq, sized          ; stride arrives as a C int: sign-extend before pointer math
+    mov        r3, 4                   ; r3 = rows left to process
+    pxor       m4, m4                  ; m4 = four 16-bit partial sums
+    pxor       m7, m7                  ; m7 = 0, used to widen bytes to words
+.loop:
+    movq       m0, [firstq]
+    movq       m2, [firstq]
+    add        firstq, sizeq
+    movq       m1, [secondq]
+    add        secondq, sizeq
+    psubusb    m2, m1                  ; m2 = max(first - second, 0) per byte
+    psubusb    m1, m0                  ; m1 = max(second - first, 0) per byte
+    movq       m0, m2
+    movq       m3, m1
+    punpcklbw  m0, m7
+    punpcklbw  m1, m7
+    punpckhbw  m2, m7
+    punpckhbw  m3, m7
+    paddw      m4, m0                  ; one value of each saturated pair is zero, so these
+    paddw      m4, m1                  ; four adds accumulate |first - second| for all 8 bytes
+    paddw      m4, m2
+    paddw      m4, m3
+    dec        r3
+    jnz .loop
+
+    movq       m3, m4                  ; horizontal add: 4 words -> 2 dwords -> eax
+    punpcklwd  m4, m7
+    punpckhwd  m3, m7
+    paddd      m3, m4
+    movd      eax, m3
+    psrlq      m3, 32
+    movd      r4d, m3                  ; dword destination: only r4d is consumed below
+    add       eax, r4d
+    RET
+
+; int ff_pullup_filter_comb_mmx(first, second, int size): comb metric over 4 rows x 8 bytes, result in eax
+INIT_MMX mmx
+cglobal pullup_filter_comb, 3, 5, 8, first, second, size
+    movsxdifnidn sizeq, sized          ; stride arrives as a C int: sign-extend before pointer math
+    mov        r3, 4                   ; r3 = rows left to process
+    pxor       m6, m6                  ; m6 = four 16-bit partial sums
+    pxor       m7, m7                  ; m7 = 0, used to widen bytes to words
+    sub        secondq, sizeq          ; [secondq] = row above second, [secondq+sizeq] = second
+.loop:                                 ; bytes 0-3: |2*first - second[-size] - second|
+    movq       m0, [firstq]
+    movq       m1, [secondq]
+    punpcklbw  m0, m7
+    movq       m2, [secondq+sizeq]
+    punpcklbw  m1, m7
+    punpcklbw  m2, m7
+    paddw      m0, m0
+    paddw      m1, m2
+    movq       m2, m0
+    psubusw    m0, m1
+    psubusw    m1, m2
+    paddw      m6, m0
+    paddw      m6, m1
+    ; bytes 4-7: |2*first - second[-size] - second|
+    movq       m0, [firstq]
+    movq       m1, [secondq]
+    punpckhbw  m0, m7
+    movq       m2, [secondq+sizeq]
+    punpckhbw  m1, m7
+    punpckhbw  m2, m7
+    paddw      m0, m0
+    paddw      m1, m2
+    movq       m2, m0
+    psubusw    m0, m1
+    psubusw    m1, m2
+    paddw      m6, m0
+    paddw      m6, m1
+    ; bytes 0-3: |2*second - first - first[+size]|
+    movq       m0, [secondq+sizeq]
+    movq       m1, [firstq]
+    punpcklbw  m0, m7
+    movq       m2, [firstq+sizeq]
+    punpcklbw  m1, m7
+    punpcklbw  m2, m7
+    paddw      m0, m0
+    paddw      m1, m2
+    movq       m2, m0
+    psubusw    m0, m1
+    psubusw    m1, m2
+    paddw      m6, m0
+    paddw      m6, m1
+    ; bytes 4-7: |2*second - first - first[+size]|
+    movq       m0, [secondq+sizeq]
+    movq       m1, [firstq]
+    punpckhbw  m0, m7
+    movq       m2, [firstq+sizeq]
+    punpckhbw  m1, m7
+    punpckhbw  m2, m7
+    paddw      m0, m0
+    paddw      m1, m2
+    movq       m2, m0
+    psubusw    m0, m1
+    psubusw    m1, m2
+    paddw      m6, m0
+    paddw      m6, m1
+
+    add        firstq, sizeq
+    add        secondq, sizeq
+    dec        r3
+    jnz .loop
+    movq       m5, m6                  ; horizontal add: 4 words -> 2 dwords -> eax
+    punpcklwd  m6, m7
+    punpckhwd  m5, m7
+    paddd      m5, m6
+    movd      eax, m5
+    psrlq      m5, 32
+    movd      r4d, m5                  ; dword destination: only r4d is consumed below
+    add       eax, r4d
+    RET
+
+; int ff_pullup_filter_var_mmx(first, second, int size): 4 * sum of |first[y] - first[y+1]|, 3 row pairs x 8 bytes
+INIT_MMX mmx
+cglobal pullup_filter_var, 3, 5, 8, first, second, size
+    movsxdifnidn sizeq, sized          ; stride arrives as a C int: sign-extend before pointer math
+    mov        r3, 3                   ; r3 = row pairs left; "second" is not read by this function
+    pxor       m4, m4                  ; m4 = four 16-bit partial sums
+    pxor       m7, m7                  ; m7 = 0, used to widen bytes to words
+.loop:
+    movq       m0, [firstq]
+    movq       m2, [firstq]
+    movq       m1, [firstq+sizeq]
+    add        firstq, sizeq
+    psubusb    m2, m1                  ; m2 = max(row - next_row, 0) per byte
+    psubusb    m1, m0                  ; m1 = max(next_row - row, 0) per byte
+    movq       m0, m2
+    movq       m3, m1
+    punpcklbw  m0, m7
+    punpcklbw  m1, m7
+    punpckhbw  m2, m7
+    punpckhbw  m3, m7
+    paddw      m4, m0
+    paddw      m4, m1
+    paddw      m4, m2
+    paddw      m4, m3
+    dec        r3
+    jnz .loop
+
+    movq       m3, m4                  ; horizontal add: 4 words -> 2 dwords -> eax
+    punpcklwd  m4, m7
+    punpckhwd  m3, m7
+    paddd      m3, m4
+    movd      eax, m3
+    psrlq      m3, 32
+    movd      r4d, m3                  ; dword destination: only r4d is consumed below
+    add       eax, r4d
+    shl       eax, 2                   ; return 4 * sum
+    RET
diff --git a/libavfilter/x86/vf_pullup_init.c b/libavfilter/x86/vf_pullup_init.c
new file mode 100644
index 0000000..9948abf
--- /dev/null
+++ b/libavfilter/x86/vf_pullup_init.c
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_pullup.h"
+
+int ff_pullup_filter_diff_mmx(const uint8_t *a, const uint8_t *b, int s);
+int ff_pullup_filter_comb_mmx(const uint8_t *a, const uint8_t *b, int s);
+int ff_pullup_filter_var_mmx (const uint8_t *a, const uint8_t *b, int s);
+
+/* Install the MMX diff/comb/var callbacks on s when HAVE_YASM is set and EXTERNAL_MMX() accepts the CPU flags. */
+av_cold void ff_pullup_init_x86(PullupContext *s)
+{
+#if HAVE_YASM
+    const int flags = av_get_cpu_flags();
+    if (!EXTERNAL_MMX(flags))
+        return;
+    s->diff = ff_pullup_filter_diff_mmx;
+    s->comb = ff_pullup_filter_comb_mmx;
+    s->var  = ff_pullup_filter_var_mmx;
+#endif
+}
diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
new file mode 100644
index 0000000..eb46ddc
--- /dev/null
+++ b/libavfilter/x86/vf_spp.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/vf_spp.h"
+
+#if HAVE_MMX_INLINE
+/* MMX hard threshold: |x| <= threshold1 becomes 0, otherwise (x + 4) >> 3; words are stored reordered; permutation is unused. */
+static void hardthresh_mmx(int16_t dst[64], const int16_t src[64],
+                           int qp, const uint8_t *permutation)
+{
+    int bias = 0; //FIXME
+    unsigned int threshold1 = qp * ((1<<4) - bias) - 1;
+
+#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3)    \
+    "movq " #src0 ", %%mm0      \n"                                     \
+    "movq " #src1 ", %%mm1      \n"                                     \
+    "movq " #src2 ", %%mm2      \n"                                     \
+    "movq " #src3 ", %%mm3      \n"                                     \
+    "psubw %%mm4, %%mm0         \n"                                     \
+    "psubw %%mm4, %%mm1         \n"                                     \
+    "psubw %%mm4, %%mm2         \n"                                     \
+    "psubw %%mm4, %%mm3         \n"                                     \
+    "paddusw %%mm5, %%mm0       \n"                                     \
+    "paddusw %%mm5, %%mm1       \n"                                     \
+    "paddusw %%mm5, %%mm2       \n"                                     \
+    "paddusw %%mm5, %%mm3       \n"                                     \
+    "paddw %%mm6, %%mm0         \n"                                     \
+    "paddw %%mm6, %%mm1         \n"                                     \
+    "paddw %%mm6, %%mm2         \n"                                     \
+    "paddw %%mm6, %%mm3         \n"                                     \
+    "psubusw %%mm6, %%mm0       \n"                                     \
+    "psubusw %%mm6, %%mm1       \n"                                     \
+    "psubusw %%mm6, %%mm2       \n"                                     \
+    "psubusw %%mm6, %%mm3       \n"                                     \
+    "psraw $3, %%mm0            \n"                                     \
+    "psraw $3, %%mm1            \n"                                     \
+    "psraw $3, %%mm2            \n"                                     \
+    "psraw $3, %%mm3            \n"                                     \
+                                                                        \
+    "movq %%mm0, %%mm7          \n"                                     \
+    "punpcklwd %%mm2, %%mm0     \n" /*A*/                               \
+    "punpckhwd %%mm2, %%mm7     \n" /*C*/                               \
+    "movq %%mm1, %%mm2          \n"                                     \
+    "punpcklwd %%mm3, %%mm1     \n" /*B*/                               \
+    "punpckhwd %%mm3, %%mm2     \n" /*D*/                               \
+    "movq %%mm0, %%mm3          \n"                                     \
+    "punpcklwd %%mm1, %%mm0     \n" /*A*/                               \
+    "punpckhwd %%mm7, %%mm3     \n" /*C*/                               \
+    "punpcklwd %%mm2, %%mm7     \n" /*B*/                               \
+    "punpckhwd %%mm2, %%mm1     \n" /*D*/                               \
+                                                                        \
+    "movq %%mm0, " #dst0 "      \n"                                     \
+    "movq %%mm7, " #dst1 "      \n"                                     \
+    "movq %%mm3, " #dst2 "      \n"                                     \
+    "movq %%mm1, " #dst3 "      \n"
+
+    __asm__ volatile(
+        "movd %2, %%mm4             \n" /* mm4 = threshold1+1, broadcast to 4 words below */
+        "movd %3, %%mm5             \n" /* mm5 = threshold1+5 */
+        "movd %4, %%mm6             \n" /* mm6 = threshold1-4 */
+        "packssdw %%mm4, %%mm4      \n"
+        "packssdw %%mm5, %%mm5      \n"
+        "packssdw %%mm6, %%mm6      \n"
+        "packssdw %%mm4, %%mm4      \n"
+        "packssdw %%mm5, %%mm5      \n"
+        "packssdw %%mm6, %%mm6      \n"
+        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
+        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+        : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed?
+        : "memory" /* src[] is read and dst[] written only through pointer operands */
+    );
+    dst[0] = (src[0] + 4) >> 3; /* DC is never thresholded; must be stored after the asm wrote dst[0] */
+}
+
+/* MMX soft threshold: shrink each magnitude by threshold1 (floor 0), add 4, >> 3; stored reordered; permutation is unused. */
+static void softthresh_mmx(int16_t dst[64], const int16_t src[64],
+                           int qp, const uint8_t *permutation)
+{
+    int bias = 0; //FIXME
+    unsigned int threshold1 = qp*((1<<4) - bias) - 1;
+
+#undef REQUANT_CORE
+#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3)    \
+    "movq " #src0 ", %%mm0      \n"                                     \
+    "movq " #src1 ", %%mm1      \n"                                     \
+    "pxor %%mm6, %%mm6          \n"                                     \
+    "pxor %%mm7, %%mm7          \n"                                     \
+    "pcmpgtw %%mm0, %%mm6       \n"                                     \
+    "pcmpgtw %%mm1, %%mm7       \n"                                     \
+    "pxor %%mm6, %%mm0          \n"                                     \
+    "pxor %%mm7, %%mm1          \n"                                     \
+    "psubusw %%mm4, %%mm0       \n"                                     \
+    "psubusw %%mm4, %%mm1       \n"                                     \
+    "pxor %%mm6, %%mm0          \n"                                     \
+    "pxor %%mm7, %%mm1          \n"                                     \
+    "movq " #src2 ", %%mm2      \n"                                     \
+    "movq " #src3 ", %%mm3      \n"                                     \
+    "pxor %%mm6, %%mm6          \n"                                     \
+    "pxor %%mm7, %%mm7          \n"                                     \
+    "pcmpgtw %%mm2, %%mm6       \n"                                     \
+    "pcmpgtw %%mm3, %%mm7       \n"                                     \
+    "pxor %%mm6, %%mm2          \n"                                     \
+    "pxor %%mm7, %%mm3          \n"                                     \
+    "psubusw %%mm4, %%mm2       \n"                                     \
+    "psubusw %%mm4, %%mm3       \n"                                     \
+    "pxor %%mm6, %%mm2          \n"                                     \
+    "pxor %%mm7, %%mm3          \n"                                     \
+                                                                        \
+    "paddsw %%mm5, %%mm0        \n"                                     \
+    "paddsw %%mm5, %%mm1        \n"                                     \
+    "paddsw %%mm5, %%mm2        \n"                                     \
+    "paddsw %%mm5, %%mm3        \n"                                     \
+    "psraw $3, %%mm0            \n"                                     \
+    "psraw $3, %%mm1            \n"                                     \
+    "psraw $3, %%mm2            \n"                                     \
+    "psraw $3, %%mm3            \n"                                     \
+                                                                        \
+    "movq %%mm0, %%mm7          \n"                                     \
+    "punpcklwd %%mm2, %%mm0     \n" /*A*/                               \
+    "punpckhwd %%mm2, %%mm7     \n" /*C*/                               \
+    "movq %%mm1, %%mm2          \n"                                     \
+    "punpcklwd %%mm3, %%mm1     \n" /*B*/                               \
+    "punpckhwd %%mm3, %%mm2     \n" /*D*/                               \
+    "movq %%mm0, %%mm3          \n"                                     \
+    "punpcklwd %%mm1, %%mm0     \n" /*A*/                               \
+    "punpckhwd %%mm7, %%mm3     \n" /*C*/                               \
+    "punpcklwd %%mm2, %%mm7     \n" /*B*/                               \
+    "punpckhwd %%mm2, %%mm1     \n" /*D*/                               \
+                                                                        \
+    "movq %%mm0, " #dst0 "      \n"                                     \
+    "movq %%mm7, " #dst1 "      \n"                                     \
+    "movq %%mm3, " #dst2 "      \n"                                     \
+    "movq %%mm1, " #dst3 "      \n"
+
+    __asm__ volatile(
+        "movd %2, %%mm4             \n" /* mm4 = threshold1, broadcast to 4 words below */
+        "movd %3, %%mm5             \n" /* mm5 = rounding constant 4 */
+        "packssdw %%mm4, %%mm4      \n"
+        "packssdw %%mm5, %%mm5      \n"
+        "packssdw %%mm4, %%mm4      \n"
+        "packssdw %%mm5, %%mm5      \n"
+        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
+        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+        : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate then needed?
+        : "memory" /* src[] is read and dst[] written only through pointer operands */
+    );
+
+    dst[0] = (src[0] + 4) >> 3; /* DC is never thresholded; must be stored after the asm wrote dst[0] */
+}
+
+/* Per row y: dst[x] = clip_u8((src[x] + (dither[y][x & 7] >> log2_scale)) >> (MAX_LEVEL - log2_scale)), 8 pixels per step. */
+static void store_slice_mmx(uint8_t *dst, const int16_t *src,
+                            int dst_stride, int src_stride,
+                            int width, int height, int log2_scale,
+                            const uint8_t dither[8][8])
+{
+    int y;
+    for (y = 0; y < height; y++) {
+        uint8_t *dst1 = dst;
+        const int16_t *src1 = src;
+        __asm__ volatile(
+            "movq (%3), %%mm3           \n"
+            "movq (%3), %%mm4           \n"
+            "movd %4, %%mm2             \n"
+            "pxor %%mm0, %%mm0          \n"
+            "punpcklbw %%mm0, %%mm3     \n" /* mm3 = dither row, bytes 0-3 as words */
+            "punpckhbw %%mm0, %%mm4     \n" /* mm4 = dither row, bytes 4-7 as words */
+            "psraw %%mm2, %%mm3         \n"
+            "psraw %%mm2, %%mm4         \n"
+            "movd %5, %%mm2             \n" /* mm2 = final shift count */
+            "1:                         \n"
+            "movq (%0), %%mm0           \n"
+            "movq 8(%0), %%mm1          \n"
+            "paddw %%mm3, %%mm0         \n"
+            "paddw %%mm4, %%mm1         \n"
+            "psraw %%mm2, %%mm0         \n"
+            "psraw %%mm2, %%mm1         \n"
+            "packuswb %%mm1, %%mm0      \n" /* saturate the 8 words to unsigned bytes */
+            "movq %%mm0, (%1)           \n"
+            "add $16, %0                \n"
+            "add $8, %1                 \n"
+            "cmp %2, %1                 \n"
+            " jb 1b                     \n" /* whole groups of 8: runs at least once, may write past width */
+            : "+r" (src1), "+r"(dst1)
+            : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale)
+            : "memory" /* src/dither are read and dst written only through pointer operands */ );
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+#endif /* HAVE_MMX_INLINE */
+
+/* Select the inline-MMX store_slice and, for mode 0 or 1, the matching requantize routine when the CPU reports MMX. */
+av_cold void ff_spp_init_x86(SPPContext *s)
+{
+#if HAVE_MMX_INLINE
+    const int flags = av_get_cpu_flags();
+    if (!(flags & AV_CPU_FLAG_MMX))
+        return;
+    s->store_slice = store_slice_mmx;
+    if (s->mode == 0)
+        s->requantize = hardthresh_mmx;
+    else if (s->mode == 1)
+        s->requantize = softthresh_mmx;
+#endif
+}
diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index bc4b3ce..ebc505c 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -4,20 +4,20 @@
 ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or modify
+;* FFmpeg is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
 ;* the Free Software Foundation; either version 2 of the License, or
 ;* (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ;* GNU General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU General Public License along
-;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ;******************************************************************************
 
@@ -90,17 +90,17 @@ SECTION .text
 %endmacro
 
 %macro LOAD 2
-    movh      m%1, %2
-    punpcklbw m%1, m7
+    movh      %1, %2           ; destination is now passed as a full register name
+    punpcklbw %1, m7           ; interleave bytes with m7 (zeroed at the top of FILTER) to get words
 %endmacro
 
 %macro FILTER 3
 .loop%1:
     pxor         m7, m7
-    LOAD          0, [curq+t1]
-    LOAD          1, [curq+t0]
-    LOAD          2, [%2]
-    LOAD          3, [%3]
+    LOAD         m0, [curq+t1]
+    LOAD         m1, [curq+t0]
+    LOAD         m2, [%2]
+    LOAD         m3, [%3]
     mova         m4, m3
     paddw        m3, m2
     psraw        m3, 1
@@ -109,8 +109,8 @@ SECTION .text
     mova   [rsp+32], m1
     psubw        m2, m4
     ABS1         m2, m4
-    LOAD          3, [prevq+t1]
-    LOAD          4, [prevq+t0]
+    LOAD         m3, [prevq+t1]
+    LOAD         m4, [prevq+t0]
     psubw        m3, m0
     psubw        m4, m1
     ABS1         m3, m5
@@ -119,8 +119,8 @@ SECTION .text
     psrlw        m2, 1
     psrlw        m3, 1
     pmaxsw       m2, m3
-    LOAD          3, [nextq+t1]
-    LOAD          4, [nextq+t0]
+    LOAD         m3, [nextq+t1]
+    LOAD         m4, [nextq+t0]
     psubw        m3, m0
     psubw        m4, m1
     ABS1         m3, m5
@@ -166,10 +166,10 @@ SECTION .text
     mova         m6, [rsp+48]
     cmp   DWORD r8m, 2
     jge .end%1
-    LOAD          2, [%2+t1*2]
-    LOAD          4, [%3+t1*2]
-    LOAD          3, [%2+t0*2]
-    LOAD          5, [%3+t0*2]
+    LOAD         m2, [%2+t1*2]
+    LOAD         m4, [%3+t1*2]
+    LOAD         m3, [%2+t0*2]
+    LOAD         m5, [%3+t0*2]
     paddw        m2, m4
     paddw        m3, m5
     psrlw        m2, 1
@@ -220,8 +220,6 @@ cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
 cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
                                         mrefs, parity, mode
 %endif
-    cmp      DWORD wm, 0
-    jle .ret
 %if ARCH_X86_32
     mov            r4, r5mp
     mov            r5, r6mp
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 5978a4f..ae09bb0 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -1,26 +1,25 @@
 /*
  * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or modify
+ * FFmpeg is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License along
- * with Libav; if not, write to the Free Software Foundation, Inc.,
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -36,18 +35,65 @@ void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
                                 void *next, int w, int prefs,
                                 int mrefs, int parity, int mode);
 
+void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,
+                                       void *next, int w, int prefs,
+                                       int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
+                                     void *next, int w, int prefs,
+                                     int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,
+                                      void *next, int w, int prefs,
+                                      int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
+                                     void *next, int w, int prefs,
+                                     int mrefs, int parity, int mode);
+
+void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
+                                       void *next, int w, int prefs,
+                                       int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
+                                     void *next, int w, int prefs,
+                                     int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
+                                      void *next, int w, int prefs,
+                                      int mrefs, int parity, int mode);
+
 av_cold void ff_yadif_init_x86(YADIFContext *yadif)
 {
 #if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
+    int bit_depth = (!yadif->csp) ? 8 /* no pixel descriptor set: assume 8-bit */
+                                  : yadif->csp->comp[0].depth_minus1 + 1;
 
+    if (bit_depth >= 15) { /* 15/16-bit samples: _16bit kernels; each later test overrides the earlier pick */
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
+        if (EXTERNAL_SSE4(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
+    } else if ( bit_depth >= 9 && bit_depth <= 14) { /* 9..14-bit samples: _10bit kernels (no SSE4 variant is declared) */
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
+    } else { /* bit_depth <= 8: the kernels that were already selected before this change */
 #if ARCH_X86_32
-    if (EXTERNAL_MMXEXT(cpu_flags))
-        yadif->filter_line = ff_yadif_filter_line_mmxext;
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_mmxext;
 #endif /* ARCH_X86_32 */
-    if (EXTERNAL_SSE2(cpu_flags))
-        yadif->filter_line = ff_yadif_filter_line_sse2;
-    if (EXTERNAL_SSSE3(cpu_flags))
-        yadif->filter_line = ff_yadif_filter_line_ssse3;
+        if (EXTERNAL_SSE2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_ssse3;
+    }
 #endif /* HAVE_YASM */
 }
diff --git a/libavfilter/x86/yadif-10.asm b/libavfilter/x86/yadif-10.asm
new file mode 100644
index 0000000..d586deb
--- /dev/null
+++ b/libavfilter/x86/yadif-10.asm
@@ -0,0 +1,282 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro PABS 2 ; packed word absolute value; args: value register (updated in place), scratch register
+%if cpuflag(ssse3)
+    pabsw %1, %1            ; SSSE3 provides the operation directly; scratch is unused
+%else
+    pxor    %2, %2          ; scratch = 0
+    pcmpgtw %2, %1          ; scratch = all-ones in lanes where the value is negative
+    pxor    %1, %2          ; one's complement of the negative lanes
+    psubw   %1, %2          ; subtracting -1 there completes the negation
+%endif
+%endmacro
+
+%macro PMAXUW 2 ; unsigned word maximum: first arg = max(first arg, second arg)
+%if cpuflag(sse4)
+    pmaxuw %1, %2
+%else
+    psubusw %1, %2          ; saturating subtract: max(a - b, 0)
+    paddusw %1, %2          ; adding b back yields max(a, b)
+%endif
+%endmacro
+
+%macro CHECK 2 ; args: sample offsets applied to the curq+t1 row and to the curq+t0 row
+    movu      m2, [curq+t1+%1*2]   ; m2 = t1 row, offset scaled by 2 bytes per sample
+    movu      m3, [curq+t0+%2*2]   ; m3 = t0 row
+    mova      m4, m2
+    mova      m5, m2
+    pxor      m4, m3
+    pavgw     m5, m3               ; m5 = (m2 + m3 + 1) >> 1
+    pand      m4, [pw_1]           ; m4 = 1 in lanes where m2 + m3 is odd
+    psubusw   m5, m4               ; remove the round-up: m5 = (m2 + m3) >> 1
+%if mmsize == 16
+    psrldq    m5, 2                ; lane i now holds the average from lane i+1
+%else
+    psrlq     m5, 16               ; same one-lane shift for 8-byte registers
+%endif
+    mova      m4, m2
+    psubusw   m2, m3
+    psubusw   m3, m4
+    PMAXUW    m2, m3               ; m2 = unsigned absolute difference of the two rows
+    mova      m3, m2
+    mova      m4, m2
+%if mmsize == 16
+    psrldq    m3, 2                ; differences from lane i+1
+    psrldq    m4, 4                ; differences from lane i+2
+%else
+    psrlq     m3, 16
+    psrlq     m4, 32
+%endif
+    paddw     m2, m3
+    paddw     m2, m4               ; m2 = sum of three neighbouring differences per lane
+%endmacro
+
+%macro CHECK1 0 ; keep the smaller sum in m0 and the matching average in m1; leaves the mask in m6
+    mova    m3, m0
+    pcmpgtw m3, m2          ; m3 = mask of lanes where m0 > m2 (signed compare)
+    pminsw  m0, m2          ; m0 = min(m0, m2)
+    mova    m6, m3          ; keep the mask; the CHECK2 macro reads m6
+    pand    m5, m3
+    pandn   m3, m1
+    por     m3, m5          ; m5 where the mask is set, m1 elsewhere
+    mova    m1, m3
+%endmacro
+
+; %macro CHECK2 0
+;     paddw   m6, [pw_1]
+;     psllw   m6, 14
+;     paddsw  m2, m6
+;     mova    m3, m0
+;     pcmpgtw m3, m2
+;     pminsw  m0, m2
+;     pand    m5, m3
+;     pandn   m3, m1
+;     por     m3, m5
+;     mova    m1, m3
+; %endmacro
+
+; This version of CHECK2 is required for 14-bit samples.  The left-shift trick
+; in the old code is not large enough to correctly select pixels or scores.
+
+%macro CHECK2 0 ; like CHECK1, but only in lanes where the mask held in m6 is set
+    mova    m3, m0          ; save the current minimum
+    pcmpgtw m0, m2          ; m0 = mask of lanes where the old m0 > m2
+    pand    m0, m6          ; restrict it to lanes selected by the incoming m6 mask
+    mova    m6, m0
+    pand    m5, m6
+    pand    m2, m0
+    pandn   m6, m1
+    pandn   m0, m3
+    por     m6, m5          ; m6 = m5 in masked lanes, m1 elsewhere
+    por     m0, m2          ; m0 = m2 in masked lanes, saved minimum elsewhere
+    mova    m1, m6
+%endmacro
+
+%macro LOAD 2 ; args: destination register, memory operand
+    movu      %1, %2        ; plain unaligned load; samples are used as 16-bit words without widening
+%endmacro
+
+%macro FILTER 3 ; args: label suffix, then the two pointers used for the temporal taps
+.loop%1:
+    pxor         m7, m7                ; NOTE(review): nothing in this 10-bit macro appears to read m7 before it is reloaded below
+    LOAD         m0, [curq+t1]         ; m0 = cur row at offset t1
+    LOAD         m1, [curq+t0]         ; m1 = cur row at offset t0
+    LOAD         m2, [%2]
+    LOAD         m3, [%3]
+    mova         m4, m3
+    paddw        m3, m2
+    psraw        m3, 1                 ; m3 = average of the two temporal taps
+    mova   [rsp+ 0], m0                ; spill the t1 sample
+    mova   [rsp+16], m3                ; spill the temporal average
+    mova   [rsp+32], m1                ; spill the t0 sample
+    psubw        m2, m4
+    PABS         m2, m4                ; m2 = absolute difference of the two temporal taps
+    LOAD         m3, [prevq+t1]
+    LOAD         m4, [prevq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    PABS         m3, m5
+    PABS         m4, m5
+    paddw        m3, m4                ; sum of the prev-vs-cur absolute differences on both rows
+    psrlw        m2, 1
+    psrlw        m3, 1
+    pmaxsw       m2, m3                ; larger of the two halved differences
+    LOAD         m3, [nextq+t1]
+    LOAD         m4, [nextq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    PABS         m3, m5
+    PABS         m4, m5
+    paddw        m3, m4                ; sum of the next-vs-cur absolute differences on both rows
+    psrlw        m3, 1
+    pmaxsw       m2, m3
+    mova   [rsp+48], m2                ; spill the largest of the three differences
+
+    paddw        m1, m0
+    paddw        m0, m0
+    psubw        m0, m1                ; m0 = t1 sample minus t0 sample
+    psrlw        m1, 1                 ; m1 = average of the t1 and t0 samples
+    PABS         m0, m2
+
+    movu         m2, [curq+t1-1*2]     ; the same two rows, starting one sample earlier
+    movu         m3, [curq+t0-1*2]
+    mova         m4, m2
+    psubusw      m2, m3
+    psubusw      m3, m4
+    PMAXUW       m2, m3                ; m2 = unsigned absolute difference of the rows
+%if mmsize == 16
+    mova         m3, m2
+    psrldq       m3, 4                 ; lane i takes the difference from lane i+2
+%else
+    mova         m3, m2
+    psrlq        m3, 32
+%endif
+    paddw        m0, m2
+    paddw        m0, m3
+    psubw        m0, [pw_1]            ; m0 = sum of the three differences, minus 1
+
+    CHECK -2, 0
+    CHECK1
+    CHECK -3, 1
+    CHECK2
+    CHECK 0, -2
+    CHECK1
+    CHECK 1, -3
+    CHECK2
+
+    mova         m6, [rsp+48]          ; m6 = spilled maximum difference
+    cmp   DWORD r8m, 2                 ; r8m is the mode argument
+    jge .end%1                         ; mode >= 2 skips the additional bound computed below
+    LOAD         m2, [%2+t1*2]
+    LOAD         m4, [%3+t1*2]
+    LOAD         m3, [%2+t0*2]
+    LOAD         m5, [%3+t0*2]
+    paddw        m2, m4
+    paddw        m3, m5
+    psrlw        m2, 1                 ; m2 = temporal average at offset 2*t1
+    psrlw        m3, 1                 ; m3 = temporal average at offset 2*t0
+    mova         m4, [rsp+ 0]
+    mova         m5, [rsp+16]
+    mova         m7, [rsp+32]
+    psubw        m2, m4
+    psubw        m3, m7
+    mova         m0, m5
+    psubw        m5, m4                ; m5 = temporal average minus t1 sample
+    psubw        m0, m7                ; m0 = temporal average minus t0 sample
+    mova         m4, m2
+    pminsw       m2, m3
+    pmaxsw       m3, m4
+    pmaxsw       m2, m5
+    pminsw       m3, m5
+    pmaxsw       m2, m0                ; m2 = max(m5, m0, min of the two outer deltas)
+    pminsw       m3, m0                ; m3 = min(m5, m0, max of the two outer deltas)
+    pxor         m4, m4
+    pmaxsw       m6, m3
+    psubw        m4, m2
+    pmaxsw       m6, m4                ; m6 = max(m6, m3, -m2)
+
+.end%1:
+    mova         m2, [rsp+16]
+    mova         m3, m2
+    psubw        m2, m6
+    paddw        m3, m6
+    pmaxsw       m1, m2
+    pminsw       m1, m3                ; clamp m1 to the temporal average -/+ m6
+
+    movu     [dstq], m1                ; a full register is stored ...
+    add        dstq, mmsize-4          ; ... but every pointer advances by only mmsize-4 bytes
+    add       prevq, mmsize-4
+    add        curq, mmsize-4
+    add       nextq, mmsize-4
+    sub   DWORD r4m, mmsize/2-2        ; r4m is w; the same step counted in 2-byte samples
+    jg .loop%1
+%endmacro
+
+%macro YADIF 0 ; emits yadif_filter_line_10bit for the instruction set chosen by the preceding INIT_* line
+%if ARCH_X86_32
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+    mov            r4, r5mp            ; r4 = prefs (argument 5); becomes t0
+    mov            r5, r6mp            ; r5 = mrefs (argument 6); becomes t1
+    DECLARE_REG_TMP 4,5
+%else
+    movsxd         r5, DWORD r5m       ; r5 = sign-extended prefs; becomes t0
+    movsxd         r6, DWORD r6m       ; r6 = sign-extended mrefs; becomes t1
+    DECLARE_REG_TMP 5,6
+%endif
+
+    cmp DWORD paritym, 0
+    je .parity0
+    FILTER 1, prevq, curq              ; parity != 0: temporal taps are read from prev and cur
+    jmp .ret
+
+.parity0:
+    FILTER 0, curq, nextq              ; parity == 0: temporal taps are read from cur and next
+
+.ret:
+    RET
+%endmacro
+
+INIT_XMM ssse3     ; build with the cpuflag(ssse3) branch of PABS
+YADIF
+INIT_XMM sse2      ; build with the generic fallbacks in PABS and PMAXUW
+YADIF
+%if ARCH_X86_32    ; the MMXEXT build is assembled for 32-bit x86 only
+INIT_MMX mmxext
+YADIF
+%endif
diff --git a/libavfilter/x86/yadif-16.asm b/libavfilter/x86/yadif-16.asm
new file mode 100644
index 0000000..a2e6006
--- /dev/null
+++ b/libavfilter/x86/yadif-16.asm
@@ -0,0 +1,347 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1:    times 8 dw 1
+pw_8000: times 8 dw 0x8000
+pd_1:    times 4 dd 1
+pd_8000: times 4 dd 0x8000
+
+SECTION .text
+
+%macro PIXSHIFT1 1 ; shift the register down by one 16-bit sample (this macro is not invoked anywhere in this file)
+%if cpuflag(sse2)
+    psrldq %1, 2            ; byte-granular shift by 2 bytes
+%else
+    psrlq %1, 16            ; bit-granular shift by 16 bits
+%endif
+%endmacro
+
+%macro PIXSHIFT2 1 ; shift the register down by two 16-bit samples (this macro is not invoked anywhere in this file)
+%if cpuflag(sse2)
+    psrldq %1, 4            ; byte-granular shift by 4 bytes
+%else
+    psrlq %1, 32            ; bit-granular shift by 32 bits
+%endif
+%endmacro
+
+%macro PABS 2 ; packed dword absolute value; args: value register (updated in place), scratch register
+%if cpuflag(ssse3)
+    pabsd %1, %1            ; SSSE3 provides the operation directly; scratch is unused
+%else
+    pxor    %2, %2          ; scratch = 0
+    pcmpgtd %2, %1          ; scratch = all-ones in lanes where the value is negative
+    pxor    %1, %2          ; one's complement of the negative lanes
+    psubd   %1, %2          ; subtracting -1 there completes the negation
+%endif
+%endmacro
+
+%macro PACK 1 ; narrow the dwords of the register to unsigned words with saturation
+%if cpuflag(sse4)
+    packusdw %1, %1
+%else
+    psubd    %1, [pd_8000]  ; bias by -0x8000 so the signed pack below can be used
+    packssdw %1, %1         ; signed saturating pack to words
+    paddw    %1, [pw_8000]  ; wrapping word add of 0x8000 removes the bias
+%endif
+%endmacro
+
+%macro PMINSD 3 ; signed dword minimum; args: a (result), b, scratch register
+%if cpuflag(sse4)
+    pminsd %1, %2
+%else
+    mova    %3, %2
+    pcmpgtd %3, %1          ; scratch = mask of lanes where b > a
+    pand    %1, %3          ; keep a in those lanes
+    pandn   %3, %2          ; keep b in the remaining lanes
+    por     %1, %3
+%endif
+%endmacro
+
+%macro PMAXSD 3 ; signed dword maximum; args: a (result), b, scratch register
+%if cpuflag(sse4)
+    pmaxsd %1, %2
+%else
+    mova    %3, %1
+    pcmpgtd %3, %2          ; scratch = mask of lanes where a > b
+    pand    %1, %3          ; keep a in those lanes
+    pandn   %3, %2          ; keep b in the remaining lanes
+    por     %1, %3
+%endif
+%endmacro
+
+%macro PMAXUW 2 ; unsigned word maximum: first arg = max(first arg, second arg)
+%if cpuflag(sse4)
+    pmaxuw %1, %2
+%else
+    psubusw %1, %2          ; saturating subtract: max(a - b, 0)
+    paddusw %1, %2          ; adding b back yields max(a, b)
+%endif
+%endmacro
+
+%macro CHECK 2 ; args: sample offsets applied to the curq+t1 row and to the curq+t0 row
+    movu      m2, [curq+t1+%1*2]   ; m2 = t1 row, offset scaled by 2 bytes per sample
+    movu      m3, [curq+t0+%2*2]   ; m3 = t0 row
+    mova      m4, m2
+    mova      m5, m2
+    pxor      m4, m3
+    pavgw     m5, m3               ; m5 = (m2 + m3 + 1) >> 1
+    pand      m4, [pw_1]           ; m4 = 1 in lanes where m2 + m3 is odd
+    psubusw   m5, m4               ; remove the round-up: m5 = (m2 + m3) >> 1
+%if mmsize == 16
+    psrldq    m5, 2                ; lane i now holds the average from lane i+1
+%else
+    psrlq     m5, 16               ; same one-lane shift for 8-byte registers
+%endif
+    punpcklwd m5, m7               ; widen the low words to dwords (m7 is expected to be zero here)
+    mova      m4, m2
+    psubusw   m2, m3
+    psubusw   m3, m4
+    PMAXUW    m2, m3               ; m2 = unsigned absolute difference of the two rows
+    mova      m3, m2
+    mova      m4, m2
+%if mmsize == 16
+    psrldq    m3, 2                ; differences from lane i+1
+    psrldq    m4, 4                ; differences from lane i+2
+%else
+    psrlq     m3, 16
+    psrlq     m4, 32
+%endif
+    punpcklwd m2, m7
+    punpcklwd m3, m7
+    punpcklwd m4, m7
+    paddd     m2, m3
+    paddd     m2, m4               ; m2 = dword sum of three neighbouring differences per lane
+%endmacro
+
+%macro CHECK1 0 ; keep the smaller sum in m0 and the matching average in m1; leaves the mask in m6
+    mova    m3, m0
+    pcmpgtd m3, m2          ; m3 = mask of lanes where m0 > m2 (signed compare)
+    PMINSD  m0, m2, m6      ; m0 = min(m0, m2); m6 is only scratch at this point
+    mova    m6, m3          ; keep the mask; the CHECK2 macro reads m6
+    pand    m5, m3
+    pandn   m3, m1
+    por     m3, m5          ; m5 where the mask is set, m1 elsewhere
+    mova    m1, m3
+%endmacro
+
+%macro CHECK2 0 ; CHECK1-style selection that biases m2 in lanes where the incoming m6 mask is 0
+    paddd   m6, [pd_1]      ; mask values -1/0 become 0/1
+    pslld   m6, 30          ; 0 or 0x40000000
+    paddd   m2, m6          ; large bias where m6 was 0, presumably so those lanes never win below
+    mova    m3, m0
+    pcmpgtd m3, m2          ; m3 = mask of lanes where m0 > biased m2
+    PMINSD  m0, m2, m4      ; m0 = min(m0, m2); m4 is scratch
+    pand    m5, m3
+    pandn   m3, m1
+    por     m3, m5          ; m5 where the mask is set, m1 elsewhere
+    mova    m1, m3
+%endmacro
+
+; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
+; am not sure whether it is any faster.  A rewrite or refactor of the filter
+; code should make it possible to eliminate the move instruction at the end.  It
+; exists to satisfy the expectation that the "score" values are in m1.
+
+; %macro CHECK2 0
+;     mova    m3, m0
+;     pcmpgtd m0, m2
+;     pand    m0, m6
+;     mova    m6, m0
+;     pand    m5, m6
+;     pand    m2, m0
+;     pandn   m6, m1
+;     pandn   m0, m3
+;     por     m6, m5
+;     por     m0, m2
+;     mova    m1, m6
+; %endmacro
+
+%macro LOAD 2 ; args: destination register, memory operand
+    movh      %1, %2        ; load half a register of 16-bit samples
+    punpcklwd %1, m7        ; interleave with m7 (zeroed at the top of FILTER) to widen to dwords
+%endmacro
+
+%macro FILTER 3 ; args: label suffix, then the two pointers used for the temporal taps
+.loop%1:
+    pxor         m7, m7                ; m7 = 0 for the unpacks; re-zeroed each iteration because later code reuses m7
+    LOAD         m0, [curq+t1]         ; m0 = cur row at offset t1, widened to dwords
+    LOAD         m1, [curq+t0]         ; m1 = cur row at offset t0
+    LOAD         m2, [%2]
+    LOAD         m3, [%3]
+    mova         m4, m3
+    paddd        m3, m2
+    psrad        m3, 1                 ; m3 = average of the two temporal taps
+    mova   [rsp+ 0], m0                ; spill the t1 sample
+    mova   [rsp+16], m3                ; spill the temporal average
+    mova   [rsp+32], m1                ; spill the t0 sample
+    psubd        m2, m4
+    PABS         m2, m4                ; m2 = absolute difference of the two temporal taps
+    LOAD         m3, [prevq+t1]
+    LOAD         m4, [prevq+t0]
+    psubd        m3, m0
+    psubd        m4, m1
+    PABS         m3, m5
+    PABS         m4, m5
+    paddd        m3, m4                ; sum of the prev-vs-cur absolute differences on both rows
+    psrld        m2, 1
+    psrld        m3, 1
+    PMAXSD       m2, m3, m6            ; larger of the two halved differences; m6 is scratch
+    LOAD         m3, [nextq+t1]
+    LOAD         m4, [nextq+t0]
+    psubd        m3, m0
+    psubd        m4, m1
+    PABS         m3, m5
+    PABS         m4, m5
+    paddd        m3, m4                ; sum of the next-vs-cur absolute differences on both rows
+    psrld        m3, 1
+    PMAXSD       m2, m3, m6
+    mova   [rsp+48], m2                ; spill the largest of the three differences
+
+    paddd        m1, m0
+    paddd        m0, m0
+    psubd        m0, m1                ; m0 = t1 sample minus t0 sample
+    psrld        m1, 1                 ; m1 = average of the t1 and t0 samples
+    PABS         m0, m2
+
+    movu         m2, [curq+t1-1*2]     ; the same two rows as words, starting one sample earlier
+    movu         m3, [curq+t0-1*2]
+    mova         m4, m2
+    psubusw      m2, m3
+    psubusw      m3, m4
+    PMAXUW       m2, m3                ; m2 = unsigned absolute difference of the rows
+%if mmsize == 16
+    mova         m3, m2
+    psrldq       m3, 4                 ; lane i takes the difference from lane i+2
+%else
+    mova         m3, m2
+    psrlq        m3, 32
+%endif
+    punpcklwd    m2, m7                ; widen both difference vectors to dwords
+    punpcklwd    m3, m7
+    paddd        m0, m2
+    paddd        m0, m3
+    psubd        m0, [pd_1]            ; m0 = sum of the three differences, minus 1
+
+    CHECK -2, 0
+    CHECK1
+    CHECK -3, 1
+    CHECK2
+    CHECK 0, -2
+    CHECK1
+    CHECK 1, -3
+    CHECK2
+
+    mova         m6, [rsp+48]          ; m6 = spilled maximum difference
+    cmp   DWORD r8m, 2                 ; r8m is the mode argument
+    jge .end%1                         ; mode >= 2 skips the additional bound computed below
+    LOAD         m2, [%2+t1*2]
+    LOAD         m4, [%3+t1*2]
+    LOAD         m3, [%2+t0*2]
+    LOAD         m5, [%3+t0*2]
+    paddd        m2, m4
+    paddd        m3, m5
+    psrld        m2, 1                 ; m2 = temporal average at offset 2*t1
+    psrld        m3, 1                 ; m3 = temporal average at offset 2*t0
+    mova         m4, [rsp+ 0]
+    mova         m5, [rsp+16]
+    mova         m7, [rsp+32]          ; m7 stops being zero here
+    psubd        m2, m4
+    psubd        m3, m7
+    mova         m0, m5
+    psubd        m5, m4                ; m5 = temporal average minus t1 sample
+    psubd        m0, m7                ; m0 = temporal average minus t0 sample
+    mova         m4, m2
+    PMINSD       m2, m3, m7            ; from here on m7 is only a scratch register
+    PMAXSD       m3, m4, m7
+    PMAXSD       m2, m5, m7
+    PMINSD       m3, m5, m7
+    PMAXSD       m2, m0, m7            ; m2 = max(m5, m0, min of the two outer deltas)
+    PMINSD       m3, m0, m7            ; m3 = min(m5, m0, max of the two outer deltas)
+    pxor         m4, m4
+    PMAXSD       m6, m3, m7
+    psubd        m4, m2
+    PMAXSD       m6, m4, m7            ; m6 = max(m6, m3, -m2)
+
+.end%1:
+    mova         m2, [rsp+16]
+    mova         m3, m2
+    psubd        m2, m6
+    paddd        m3, m6
+    PMAXSD       m1, m2, m7
+    PMINSD       m1, m3, m7            ; clamp m1 to the temporal average -/+ m6
+    PACK         m1                    ; back to 16-bit samples with unsigned saturation
+
+    movh     [dstq], m1                ; store half a register of output samples
+    add        dstq, mmsize/2          ; all pointers advance by the number of bytes stored
+    add       prevq, mmsize/2
+    add        curq, mmsize/2
+    add       nextq, mmsize/2
+    sub   DWORD r4m, mmsize/4          ; r4m is w; the same step counted in 2-byte samples
+    jg .loop%1
+%endmacro
+
+%macro YADIF 0 ; emits yadif_filter_line_16bit for the instruction set chosen by the preceding INIT_* line
+%if ARCH_X86_32
+cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+    mov            r4, r5mp            ; r4 = prefs (argument 5); becomes t0
+    mov            r5, r6mp            ; r5 = mrefs (argument 6); becomes t1
+    DECLARE_REG_TMP 4,5
+%else
+    movsxd         r5, DWORD r5m       ; r5 = sign-extended prefs; becomes t0
+    movsxd         r6, DWORD r6m       ; r6 = sign-extended mrefs; becomes t1
+    DECLARE_REG_TMP 5,6
+%endif
+
+    cmp DWORD paritym, 0
+    je .parity0
+    FILTER 1, prevq, curq              ; parity != 0: temporal taps are read from prev and cur
+    jmp .ret
+
+.parity0:
+    FILTER 0, curq, nextq              ; parity == 0: temporal taps are read from cur and next
+
+.ret:
+    RET
+%endmacro
+
+INIT_XMM sse4      ; build with the native SSE4 forms in PACK, PMINSD, PMAXSD and PMAXUW
+YADIF
+INIT_XMM ssse3     ; build with pabsd, but the emulated pack/min/max sequences
+YADIF
+INIT_XMM sse2      ; build with all helper macros on their fallback paths
+YADIF
+%if ARCH_X86_32    ; the MMXEXT build is assembled for 32-bit x86 only
+INIT_MMX mmxext
+YADIF
+%endif